001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.core.sequence.storage;
022
023import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
024import org.biojava.nbio.core.sequence.AccessionID;
025import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
026import org.biojava.nbio.core.sequence.template.*;
027import org.biojava.nbio.core.util.Equals;
028import org.biojava.nbio.core.util.Hashcoder;
029
030import java.util.Iterator;
031import java.util.List;
032import java.util.Map;
033
034/**
035 * An implementation of the popular bit encodings. This class provides the
036 * Sequence view over what is actually carried out in the {@link BitArrayWorker}
037 * instances. These are the objects that carry out array storage as well as
038 * indexing into those arrays. New bit encodings can be written by extending
039 * this class and a worker class. There are a number of issues with this
040 * type of storage engine:
041 *
042 * <ul>
043 * <li>We can only support a finite number of {@link Compound}s; 2 bit allows no N compounds</li>
044 * <li>For real savings you must read the sequence in using your own
045 * Reader and a {@link BitArrayWorker} instance</li>
046 * </ul>
047 *
048 * @author ayates
049 *
050 * @param <C> Type of compound; must extend {@link NucleotideCompound}
051 */
052public class BitSequenceReader<C extends Compound> implements ProxySequenceReader<C> {
053
054        private final AccessionID accession;
055        private final BitArrayWorker<C> worker;
056
057        /**
058         * Instance which allows you to supply a different @{BitArrayWorker}
059         * object.
060         */
061        public BitSequenceReader(BitArrayWorker<C> worker, AccessionID accession) {
062                this.accession = accession;
063                this.worker = worker;
064        }
065
066        /**
067         * Class is immutable & so this is unsupported
068         */
069        @Override
070        public void setCompoundSet(CompoundSet<C> compoundSet) {
071                throw new UnsupportedOperationException("Cannot reset the CompoundSet; object is immutable");
072        }
073
074        /**
075         * Class is immutable & so this is unsupported
076         */
077        @Override
078        public void setContents(String sequence) throws CompoundNotFoundException {
079                throw new UnsupportedOperationException(getClass().getSimpleName() + " is an immutable data structure; cannot reset contents");
080        }
081
082        /**
083         * Counts the number of times a compound appears in this sequence store
084         */
085        @Override
086        public int countCompounds(C... compounds) {
087                return SequenceMixin.countCompounds(this, compounds);
088        }
089
090
091        @Override
092        public AccessionID getAccession() {
093                return accession;
094        }
095
096        /**
097         * Returns this Sequence store as a List
098         */
099        @Override
100        public List<C> getAsList() {
101                return SequenceMixin.toList(this);
102        }
103
104        /**
105         * Returns the compound at the specified biological index
106         */
107        @Override
108        public C getCompoundAt(int position) {
109                return worker.getCompoundAt(position);
110        }
111
112        /**
113         * Returns the compound set backing this store
114         */
115        @Override
116        public CompoundSet<C> getCompoundSet() {
117                return worker.getCompoundSet();
118        }
119
120        /**
121         * Returns the first occurrence of the given compound in this store; performs
122         * a linear search
123         */
124
125        @Override
126        public int getIndexOf(C compound) {
127                return SequenceMixin.indexOf(this, compound);
128        }
129
130        /**
131         * Returns the last occurrence of the given compound in this store; performs
132         * a linear search
133         */
134
135        @Override
136        public int getLastIndexOf(C compound) {
137                return SequenceMixin.lastIndexOf(this, compound);
138        }
139
140        /**
141         * Returns the length of the sequence
142         */
143
144        @Override
145        public int getLength() {
146                return worker.getLength();
147        }
148
149        /**
150         * Returns the sequence as a String
151         */
152
153        @Override
154        public String getSequenceAsString() {
155                return SequenceMixin.toStringBuilder(this).toString();
156        }
157
158        /**
159         * Returns a sub sequence view
160         */
161        public SequenceView<C> getSubSequence(final int start, final int end) {
162                return SequenceMixin.createSubSequence(this, start, end);
163        }
164
165        /**
166         * Provides basic iterable access to this class
167         */
168        @Override
169        public Iterator<C> iterator() {
170                return SequenceMixin.createIterator(this);
171        }
172
173        @Override
174        public SequenceView<C> getSubSequence(Integer start, Integer end) {
175                return getSubSequence((int) start, (int) end);
176        }
177
178        @Override
179        public SequenceView<C> getInverse() {
180                return SequenceMixin.inverse(this);
181        }
182
183        @Override
184        public int hashCode() {
185                int s = Hashcoder.SEED;
186                s = Hashcoder.hash(s, accession);
187                s = Hashcoder.hash(s, worker);
188                return s;
189        }
190
191        @Override
192        public boolean equals(Object o) {
193                if(Equals.classEqual(this, o)) {
194                        @SuppressWarnings("unchecked")
195                        BitSequenceReader<C> that = (BitSequenceReader<C>)o;
196                        return  Equals.equal(this.accession, that.accession) &&
197                                        Equals.equal(this.worker, that.worker);
198                }
199                return false;
200        }
201
202        /**
203         * The logic of working with a bit has been separated out into this class
204         * to help developers create the bit data structures without having to
205         * put the code into an intermediate format and to also use the format
206         * without the need to copy this code.
207         *
208         * This class behaves just like a {@link Sequence} without the interface
209         *
210         * @author ayates
211         *
212         * @param <C> The {@link Compound} to use
213         */
214        public static abstract class BitArrayWorker<C extends Compound> {
215
216                private final CompoundSet<C> compoundSet;
217                private final int length;
218                private final int[] sequence;
219                private transient List<C> indexToCompoundsLookup = null;
220                private transient Map<C, Integer> compoundsToIndexLookup = null;
221                public static final int BYTES_PER_INT = 32;
222
223                private volatile Integer hashcode = null;
224
225                public BitArrayWorker(Sequence<C> sequence) {
226                        this(sequence.getCompoundSet(), sequence.getLength());
227                        populate(sequence);
228                }
229
230                public BitArrayWorker(String sequence, CompoundSet<C> compoundSet) {
231                        this(compoundSet, sequence.length());
232                        populate(sequence);
233                }
234
235                public BitArrayWorker(CompoundSet<C> compoundSet, int length) {
236                        this.compoundSet = compoundSet;
237                        this.length = length;
238                        this.sequence = new int[seqArraySize(length)];
239                }
240
241                public BitArrayWorker(CompoundSet<C> compoundSet, int[] sequence) {
242                        this.compoundSet = compoundSet;
243                        this.sequence = sequence;
244                        this.length = sequence.length;
245                }
246
247                /**
248                 * This method should return the bit mask to be used to extract the
249                 * bytes you are interested in working with. See solid implementations
250                 * on how to create these
251                 */
252                protected abstract byte bitMask();
253
254                /**
255                 * Should return the maximum amount of compounds we can encode per int
256                 */
257                protected abstract int compoundsPerDatatype();
258
259                /**
260                 * Should return the inverse information that {@link #generateCompoundsToIndex() }
261                 * returns i.e. if the Compound C returns 1 from compoundsToIndex then we
262                 * should find that compound here in position 1
263                 */
264                protected abstract List<C> generateIndexToCompounds();
265
266                /**
267                 * Returns what the value of a compound is in the backing bit storage i.e.
268                 * in 2bit storage the value 0 is encoded as 00 (in binary).
269                 */
270                protected abstract Map<C, Integer> generateCompoundsToIndex();
271
272                /**
273                 * Returns how many bits are used to represent a compound e.g. 2 if using
274                 * 2bit encoding.
275                 */
276                protected int bitsPerCompound() {
277                        return BYTES_PER_INT / compoundsPerDatatype();
278                }
279
280                public int seqArraySize(int length) {
281                        return (int) Math.ceil((double) length / (double) compoundsPerDatatype());
282                }
283
284                /**
285                 * Loops through the Compounds in a Sequence and passes them onto
286                 * {@link #setCompoundAt(Compound, int)}
287                 */
288                public void populate(Sequence<C> sequence) {
289                        int position = 1;
290                        for (C c : sequence) {
291                                setCompoundAt(c, position++);
292                        }
293                }
294
295                /**
296                 * Loops through the chars in a String and passes them onto
297                 * {@link #setCompoundAt(char, int)}
298                 */
299                public void populate(String sequence) {
300                        for (int index = 0; index < getLength(); index++) {
301                                setCompoundAt(sequence.charAt(index), index + 1);
302                        }
303                }
304
305                /**
306                 * Converts from char to Compound and sets it at the given biological index
307                 */
308                public void setCompoundAt(char base, int position) {
309                        C compound = getCompoundSet().getCompoundForString(Character.toString(base));
310                        setCompoundAt(compound, position);
311                }
312
313                /**
314                 * Sets the compound at the specified biological index
315                 */
316                public void setCompoundAt(C compound, int position) {
317                        hashcode = null;
318                        int arrayIndex = biologicalIndexToArrayIndex(position);
319                        int currentInt = sequence[arrayIndex];
320                        int shiftBy = shiftBy(position);
321                        Integer integerValue = getCompoundsToIndexLookup().get(compound);
322
323                        //If we got nothing then throw an error as it's wrong
324                        if (integerValue == null) {
325                                processUnknownCompound(compound, position);
326                        }
327
328                        int shiftedValue = integerValue << shiftBy;
329
330                        sequence[arrayIndex] = currentInt | shiftedValue;
331                }
332
333                /**
334                 * Returns the compound at the specified biological index
335                 */
336                public C getCompoundAt(int position) {
337                        //Avoids asking for something which is not encoded by a bit-pair
338                        if (position > getLength()) {
339                                throw new IllegalArgumentException(position + " is greater than length. Cannot access this position");
340                        }
341                        //Just stops us from using 0 indexing
342                        if (position < 1) {
343                                throw new IllegalArgumentException(position + " is less than 1; you must use biological indexing (indexing from 1)");
344                        }
345
346                        int arrayIndex = biologicalIndexToArrayIndex(position);
347                        int currentByte = sequence[arrayIndex];
348                        int shiftBy = shiftBy(position);
349                        int shifted = currentByte >>> shiftBy;
350                        int masked = shifted & bitMask();
351
352                        //If we could encode 4 compounds then our max masked value is 3
353                        if (masked > (compoundsPerDatatype() - 1)) {
354                                throw new IllegalStateException("Got a masked value of " + masked + "; do not understand values greater than " + (compoundsPerDatatype() - 1));
355                        }
356                        return getIndexToCompoundsLookup().get(masked);
357                }
358
359                /**
360                 * Since bit encoding only supports a finite number of bases
361                 * it is more than likely when processing sequence you will encounter a
362                 * compound which is not covered by the encoding e.g. N in a 2bit sequence.
363                 * You can override this to convert the unknown base into one you can
364                 * process or store locations of unknown bases for a level of post processing
365                 * in your subclass.
366                 *
367                 * @param compound Compound process
368                 * @return Byte representation of the compound
369                 * @throws IllegalStateException Done whenever this method is invoked
370                 */
371                protected byte processUnknownCompound(C compound, int position) throws IllegalStateException {
372                        throw new IllegalStateException("Do not know how to translate the compound " + compound + " to a " + bitsPerCompound() + "bit representation");
373                }
374
375                /**
376                 * Returns a list of compounds the index position of which is used
377                 * to translate from the byte representation into a compound.
378                 */
379                protected List<C> getIndexToCompoundsLookup() {
380                        if (indexToCompoundsLookup == null) {
381                                indexToCompoundsLookup = generateIndexToCompounds();
382                        }
383                        return indexToCompoundsLookup;
384                }
385
386                /**
387                 * Returns a map which converts from compound to an integer representation
388                 */
389                protected Map<C, Integer> getCompoundsToIndexLookup() {
390                        if (compoundsToIndexLookup == null) {
391                                compoundsToIndexLookup = generateCompoundsToIndex();
392                        }
393                        return compoundsToIndexLookup;
394                }
395
396                /**
397                 * Converting a biological index to the int which is used to store that
398                 * position's data.
399                 *
400                 * <ul>
401                 * <li>Assuming 2bit encoding using the 17 base in a sequence</li>
402                 * <li>Convert into 0 index; 17 - 1 = 16</li>
403                 * <li>Divide by the number of compounds per int; 16 / 16</li>
404                 * <li>Floor the value; floor(1) = 1</li>
405                 * </ul>
406                 *
407                 * Running this for position 13
408                 *
409                 * <ul>
410                 * <li>13 - 1 = 12</li>
411                 * <li>12 / 16 = 0.75</li>
412                 * <li>floor(0.75) = 0</li>
413                 * </ul>
414                 */
415                private int biologicalIndexToArrayIndex(int index) {
416                        return ((index - 1) / compoundsPerDatatype());
417                }
418
419                /**
420                 * Convert from bio to 0 index, remainder & then multiply by 2
421                 * <ul>
422                 * <li>Using 2bit encoding and working with position 19</li>
423                 * <li>19 is the 3rd position in the second int</li>
424                 * <li>Means a shift of 4 into that int to get the right data out</li>
425                 * <li>Also must convert into the non-bio index</li>
426                 * <li>19 - 1 = 18</li>
427                 * <li>18 % compoundsPerDatatype() (16) = 2</li>
428                 * <li>2 * bits per compound (2) = 4</li>
429                 * </ul>
430                 */
431                private byte shiftBy(int index) {
432                        return (byte) (((index - 1) % compoundsPerDatatype()) * bitsPerCompound());
433                }
434
435                /**
436                 * Returns the compound set backing this store
437                 */
438                public CompoundSet<C> getCompoundSet() {
439                        return compoundSet;
440                }
441
442                public int getLength() {
443                        return length;
444                }
445
446                @Override
447                public int hashCode() {
448                        if(hashcode == null) {
449                                int s = Hashcoder.SEED;
450                                s = Hashcoder.hash(s, sequence);
451                                s = Hashcoder.hash(s, indexToCompoundsLookup);
452                                s = Hashcoder.hash(s, compoundSet);
453                                hashcode = s;
454                        }
455                        return hashcode;
456                }
457
458                @Override
459                @SuppressWarnings("unchecked")
460                public boolean equals(Object o) {
461                        if(Equals.classEqual(this, o)) {
462                                BitArrayWorker<C> that = (BitArrayWorker<C>)o;
463                                return  Equals.equal(compoundSet, that.compoundSet) &&
464                                                Equals.equal(indexToCompoundsLookup, that.indexToCompoundsLookup) &&
465                                                Equals.equal(sequence, that.sequence);
466                        }
467                        return false;
468                }
469        }
470}