001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.core.sequence.storage; 022 023import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 024import org.biojava.nbio.core.sequence.AccessionID; 025import org.biojava.nbio.core.sequence.compound.NucleotideCompound; 026import org.biojava.nbio.core.sequence.template.*; 027import org.biojava.nbio.core.util.Equals; 028import org.biojava.nbio.core.util.Hashcoder; 029 030import java.util.Iterator; 031import java.util.List; 032import java.util.Map; 033 034/** 035 * An implementation of the popular bit encodings. This class provides the 036 * Sequence view over what is actually carried out in the {@link BitArrayWorker} 037 * instances. These are the objects that carry out array storage as well as 038 * indexing into those arrays. New bit encodings can be written by extending 039 * this class and a worker class. There are a number of issues with this 040 * type of storage engine: 041 * 042 * <ul> 043 * <li>We can only support a finite number of {@link Compound}s; 2 bit allows no N compounds</li> 044 * <li>For real savings you must read the sequence in using your own 045 * Reader and a {@link BitArrayWorker} instance</li> 046 * </ul> 047 * 048 * @author ayates 049 * 050 * @param <C> Type of compound; must extend {@link NucleotideCompound} 051 */ 052public class BitSequenceReader<C extends Compound> implements ProxySequenceReader<C> { 053 054 private final AccessionID accession; 055 private final BitArrayWorker<C> worker; 056 057 /** 058 * Instance which allows you to supply a different @{BitArrayWorker} 059 * object. 060 */ 061 public BitSequenceReader(BitArrayWorker<C> worker, AccessionID accession) { 062 this.accession = accession; 063 this.worker = worker; 064 } 065 066 /** 067 * Class is immutable & so this is unsupported 068 */ 069 @Override 070 public void setCompoundSet(CompoundSet<C> compoundSet) { 071 throw new UnsupportedOperationException("Cannot reset the CompoundSet; object is immutable"); 072 } 073 074 /** 075 * Class is immutable & so this is unsupported 076 */ 077 @Override 078 public void setContents(String sequence) throws CompoundNotFoundException { 079 throw new UnsupportedOperationException(getClass().getSimpleName() + " is an immutable data structure; cannot reset contents"); 080 } 081 082 /** 083 * Counts the number of times a compound appears in this sequence store 084 */ 085 @Override 086 public int countCompounds(C... compounds) { 087 return SequenceMixin.countCompounds(this, compounds); 088 } 089 090 091 @Override 092 public AccessionID getAccession() { 093 return accession; 094 } 095 096 /** 097 * Returns this Sequence store as a List 098 */ 099 @Override 100 public List<C> getAsList() { 101 return SequenceMixin.toList(this); 102 } 103 104 /** 105 * Returns the compound at the specified biological index 106 */ 107 @Override 108 public C getCompoundAt(int position) { 109 return worker.getCompoundAt(position); 110 } 111 112 /** 113 * Returns the compound set backing this store 114 */ 115 @Override 116 public CompoundSet<C> getCompoundSet() { 117 return worker.getCompoundSet(); 118 } 119 120 /** 121 * Returns the first occurrence of the given compound in this store; performs 122 * a linear search 123 */ 124 125 @Override 126 public int getIndexOf(C compound) { 127 return SequenceMixin.indexOf(this, compound); 128 } 129 130 /** 131 * Returns the last occurrence of the given compound in this store; performs 132 * a linear search 133 */ 134 135 @Override 136 public int getLastIndexOf(C compound) { 137 return SequenceMixin.lastIndexOf(this, compound); 138 } 139 140 /** 141 * Returns the length of the sequence 142 */ 143 144 @Override 145 public int getLength() { 146 return worker.getLength(); 147 } 148 149 /** 150 * Returns the sequence as a String 151 */ 152 153 @Override 154 public String getSequenceAsString() { 155 return SequenceMixin.toStringBuilder(this).toString(); 156 } 157 158 /** 159 * Returns a sub sequence view 160 */ 161 public SequenceView<C> getSubSequence(final int start, final int end) { 162 return SequenceMixin.createSubSequence(this, start, end); 163 } 164 165 /** 166 * Provides basic iterable access to this class 167 */ 168 @Override 169 public Iterator<C> iterator() { 170 return SequenceMixin.createIterator(this); 171 } 172 173 @Override 174 public SequenceView<C> getSubSequence(Integer start, Integer end) { 175 return getSubSequence((int) start, (int) end); 176 } 177 178 @Override 179 public SequenceView<C> getInverse() { 180 return SequenceMixin.inverse(this); 181 } 182 183 @Override 184 public int hashCode() { 185 int s = Hashcoder.SEED; 186 s = Hashcoder.hash(s, accession); 187 s = Hashcoder.hash(s, worker); 188 return s; 189 } 190 191 @Override 192 public boolean equals(Object o) { 193 if(Equals.classEqual(this, o)) { 194 @SuppressWarnings("unchecked") 195 BitSequenceReader<C> that = (BitSequenceReader<C>)o; 196 return Equals.equal(this.accession, that.accession) && 197 Equals.equal(this.worker, that.worker); 198 } 199 return false; 200 } 201 202 /** 203 * The logic of working with a bit has been separated out into this class 204 * to help developers create the bit data structures without having to 205 * put the code into an intermediate format and to also use the format 206 * without the need to copy this code. 207 * 208 * This class behaves just like a {@link Sequence} without the interface 209 * 210 * @author ayates 211 * 212 * @param <C> The {@link Compound} to use 213 */ 214 public static abstract class BitArrayWorker<C extends Compound> { 215 216 private final CompoundSet<C> compoundSet; 217 private final int length; 218 private final int[] sequence; 219 private transient List<C> indexToCompoundsLookup = null; 220 private transient Map<C, Integer> compoundsToIndexLookup = null; 221 public static final int BYTES_PER_INT = 32; 222 223 private volatile Integer hashcode = null; 224 225 public BitArrayWorker(Sequence<C> sequence) { 226 this(sequence.getCompoundSet(), sequence.getLength()); 227 populate(sequence); 228 } 229 230 public BitArrayWorker(String sequence, CompoundSet<C> compoundSet) { 231 this(compoundSet, sequence.length()); 232 populate(sequence); 233 } 234 235 public BitArrayWorker(CompoundSet<C> compoundSet, int length) { 236 this.compoundSet = compoundSet; 237 this.length = length; 238 this.sequence = new int[seqArraySize(length)]; 239 } 240 241 public BitArrayWorker(CompoundSet<C> compoundSet, int[] sequence) { 242 this.compoundSet = compoundSet; 243 this.sequence = sequence; 244 this.length = sequence.length; 245 } 246 247 /** 248 * This method should return the bit mask to be used to extract the 249 * bytes you are interested in working with. See solid implementations 250 * on how to create these 251 */ 252 protected abstract byte bitMask(); 253 254 /** 255 * Should return the maximum amount of compounds we can encode per int 256 */ 257 protected abstract int compoundsPerDatatype(); 258 259 /** 260 * Should return the inverse information that {@link #generateCompoundsToIndex() } 261 * returns i.e. if the Compound C returns 1 from compoundsToIndex then we 262 * should find that compound here in position 1 263 */ 264 protected abstract List<C> generateIndexToCompounds(); 265 266 /** 267 * Returns what the value of a compound is in the backing bit storage i.e. 268 * in 2bit storage the value 0 is encoded as 00 (in binary). 269 */ 270 protected abstract Map<C, Integer> generateCompoundsToIndex(); 271 272 /** 273 * Returns how many bits are used to represent a compound e.g. 2 if using 274 * 2bit encoding. 275 */ 276 protected int bitsPerCompound() { 277 return BYTES_PER_INT / compoundsPerDatatype(); 278 } 279 280 public int seqArraySize(int length) { 281 return (int) Math.ceil((double) length / (double) compoundsPerDatatype()); 282 } 283 284 /** 285 * Loops through the Compounds in a Sequence and passes them onto 286 * {@link #setCompoundAt(Compound, int)} 287 */ 288 public void populate(Sequence<C> sequence) { 289 int position = 1; 290 for (C c : sequence) { 291 setCompoundAt(c, position++); 292 } 293 } 294 295 /** 296 * Loops through the chars in a String and passes them onto 297 * {@link #setCompoundAt(char, int)} 298 */ 299 public void populate(String sequence) { 300 for (int index = 0; index < getLength(); index++) { 301 setCompoundAt(sequence.charAt(index), index + 1); 302 } 303 } 304 305 /** 306 * Converts from char to Compound and sets it at the given biological index 307 */ 308 public void setCompoundAt(char base, int position) { 309 C compound = getCompoundSet().getCompoundForString(Character.toString(base)); 310 setCompoundAt(compound, position); 311 } 312 313 /** 314 * Sets the compound at the specified biological index 315 */ 316 public void setCompoundAt(C compound, int position) { 317 hashcode = null; 318 int arrayIndex = biologicalIndexToArrayIndex(position); 319 int currentInt = sequence[arrayIndex]; 320 int shiftBy = shiftBy(position); 321 Integer integerValue = getCompoundsToIndexLookup().get(compound); 322 323 //If we got nothing then throw an error as it's wrong 324 if (integerValue == null) { 325 processUnknownCompound(compound, position); 326 } 327 328 int shiftedValue = integerValue << shiftBy; 329 330 sequence[arrayIndex] = currentInt | shiftedValue; 331 } 332 333 /** 334 * Returns the compound at the specified biological index 335 */ 336 public C getCompoundAt(int position) { 337 //Avoids asking for something which is not encoded by a bit-pair 338 if (position > getLength()) { 339 throw new IllegalArgumentException(position + " is greater than length. Cannot access this position"); 340 } 341 //Just stops us from using 0 indexing 342 if (position < 1) { 343 throw new IllegalArgumentException(position + " is less than 1; you must use biological indexing (indexing from 1)"); 344 } 345 346 int arrayIndex = biologicalIndexToArrayIndex(position); 347 int currentByte = sequence[arrayIndex]; 348 int shiftBy = shiftBy(position); 349 int shifted = currentByte >>> shiftBy; 350 int masked = shifted & bitMask(); 351 352 //If we could encode 4 compounds then our max masked value is 3 353 if (masked > (compoundsPerDatatype() - 1)) { 354 throw new IllegalStateException("Got a masked value of " + masked + "; do not understand values greater than " + (compoundsPerDatatype() - 1)); 355 } 356 return getIndexToCompoundsLookup().get(masked); 357 } 358 359 /** 360 * Since bit encoding only supports a finite number of bases 361 * it is more than likely when processing sequence you will encounter a 362 * compound which is not covered by the encoding e.g. N in a 2bit sequence. 363 * You can override this to convert the unknown base into one you can 364 * process or store locations of unknown bases for a level of post processing 365 * in your subclass. 366 * 367 * @param compound Compound process 368 * @return Byte representation of the compound 369 * @throws IllegalStateException Done whenever this method is invoked 370 */ 371 protected byte processUnknownCompound(C compound, int position) throws IllegalStateException { 372 throw new IllegalStateException("Do not know how to translate the compound " + compound + " to a " + bitsPerCompound() + "bit representation"); 373 } 374 375 /** 376 * Returns a list of compounds the index position of which is used 377 * to translate from the byte representation into a compound. 378 */ 379 protected List<C> getIndexToCompoundsLookup() { 380 if (indexToCompoundsLookup == null) { 381 indexToCompoundsLookup = generateIndexToCompounds(); 382 } 383 return indexToCompoundsLookup; 384 } 385 386 /** 387 * Returns a map which converts from compound to an integer representation 388 */ 389 protected Map<C, Integer> getCompoundsToIndexLookup() { 390 if (compoundsToIndexLookup == null) { 391 compoundsToIndexLookup = generateCompoundsToIndex(); 392 } 393 return compoundsToIndexLookup; 394 } 395 396 /** 397 * Converting a biological index to the int which is used to store that 398 * position's data. 399 * 400 * <ul> 401 * <li>Assuming 2bit encoding using the 17 base in a sequence</li> 402 * <li>Convert into 0 index; 17 - 1 = 16</li> 403 * <li>Divide by the number of compounds per int; 16 / 16</li> 404 * <li>Floor the value; floor(1) = 1</li> 405 * </ul> 406 * 407 * Running this for position 13 408 * 409 * <ul> 410 * <li>13 - 1 = 12</li> 411 * <li>12 / 16 = 0.75</li> 412 * <li>floor(0.75) = 0</li> 413 * </ul> 414 */ 415 private int biologicalIndexToArrayIndex(int index) { 416 return ((index - 1) / compoundsPerDatatype()); 417 } 418 419 /** 420 * Convert from bio to 0 index, remainder & then multiply by 2 421 * <ul> 422 * <li>Using 2bit encoding and working with position 19</li> 423 * <li>19 is the 3rd position in the second int</li> 424 * <li>Means a shift of 4 into that int to get the right data out</li> 425 * <li>Also must convert into the non-bio index</li> 426 * <li>19 - 1 = 18</li> 427 * <li>18 % compoundsPerDatatype() (16) = 2</li> 428 * <li>2 * bits per compound (2) = 4</li> 429 * </ul> 430 */ 431 private byte shiftBy(int index) { 432 return (byte) (((index - 1) % compoundsPerDatatype()) * bitsPerCompound()); 433 } 434 435 /** 436 * Returns the compound set backing this store 437 */ 438 public CompoundSet<C> getCompoundSet() { 439 return compoundSet; 440 } 441 442 public int getLength() { 443 return length; 444 } 445 446 @Override 447 public int hashCode() { 448 if(hashcode == null) { 449 int s = Hashcoder.SEED; 450 s = Hashcoder.hash(s, sequence); 451 s = Hashcoder.hash(s, indexToCompoundsLookup); 452 s = Hashcoder.hash(s, compoundSet); 453 hashcode = s; 454 } 455 return hashcode; 456 } 457 458 @Override 459 @SuppressWarnings("unchecked") 460 public boolean equals(Object o) { 461 if(Equals.classEqual(this, o)) { 462 BitArrayWorker<C> that = (BitArrayWorker<C>)o; 463 return Equals.equal(compoundSet, that.compoundSet) && 464 Equals.equal(indexToCompoundsLookup, that.indexToCompoundsLookup) && 465 Equals.equal(sequence, that.sequence); 466 } 467 return false; 468 } 469 } 470}