001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.bio.symbol;
022
023import org.biojava.utils.AssertionFailure;
024
025/**
026 * <p>
027 * A SymbolList that stores symbols as bit-patterns in an array of longs.
028 * </p>
029 *
030 * <p>
031 * Bit-packed symbol lists are space efficient compared to the usual pointer
032 * storage model employed by implementations like SimpleSymbolList. This
033 * comes at the cost of encoding/decoding symbols from the storage. In
034 * practice, the decrease in memory when storing large sequences makes
035 * applications go quicker because of issues like page swapping.
036 * </p>
037 *
038 * <p>
039 * Symbols can be mapped to and from bit-patterns. The Pattern interface
040 * encapsulates this. A SymbolList can then be stored by writing these
041 * bit-patterns into memory. This implementation stores the bits
042 * in the long elements of an array. The first symbol will be packed into
043 * bits 0 through packing.wordLength()-1 of the long at index 0.
044 * <p>
045 *
046 * <h2>Example Usage</h2>
047 * <pre>
048 * SymbolList symL = ...;
049 * SymbolList packed = new PackedSymbolList(
050 *   PackingFactory.getPacking(symL.getAlphabet(), true),
051 *   symL
052 * );
053 * </pre>
054 *
055 * @author Matthew Pocock
056 * @author David Huen (new constructor for Symbol arrays and some speedups)
057 */
058public class PackedSymbolList
059  extends
060    AbstractSymbolList
061  implements
062    java.io.Serializable
063{
064  private static final byte BITS_PER_ELEMENT = 64;
065
066  private final Packing packing;
067  private final long[] syms;
068  private final int length;
069  private final byte symsPerElement;
070  private final byte mask;
071
072  // scratch area for optimisations
073  // WARNING: these variables constitute an opportunity
074  // for things to go wrong when doing multithreaded access
075  // via symbolAt().  Keep SymbolAt() synchronized so they
076  // don't get changed during a lookup! Naaasssty.
077  private int currentMin = Integer.MAX_VALUE;
078  private int currentMax = Integer.MIN_VALUE;
079  private long currentWord;
080  private int wordsize;
081
082  public Alphabet getAlphabet() {
083    return packing.getAlphabet();
084  }
085
086  public int length() {
087    return length;
088  }
089
090  /**
091   * <p>
092   * Create a new PackedSymbolList directly from a bit pattern.
093   * </p>
094   *
095   * <p>
096   * <em>Warning:</em> This is a risky developer method.
097   * You must be sure that the syms array is packed in a
098   * way that is consistent with the packing. Also, it is your
099   * responsibility to ensure that the length is sensible.</em>
100   * </p>
101   *
102   * @param packing the Packing used
103   * @param syms a long array containing already packed symbols
104   * @param length the length of the sequence packed in symbols
105   */
106  public PackedSymbolList(Packing packing, long[] syms, int length) {
107    this.symsPerElement = (byte) (BITS_PER_ELEMENT / packing.wordSize());
108    this.packing = packing;
109    this.syms = syms;
110    this.length = length;
111    this.mask = calcMask(packing);
112    wordsize = packing.wordSize();
113  }
114
115  /**
116   * <p>
117   * Create a new PackedSymbolList as a packed copy of another symbol list.
118   * </p>
119   *
120   * <p>
121   * This will create a new and independent symbol list that is a copy of
122   * the symbols in symList. Both lists can be modified independently.
123   * </p>
124   *
125   * @param packing the way to bit-pack symbols
126   * @param symList the SymbolList to copy
127   */
128  public PackedSymbolList(Packing packing, SymbolList symList)
129  throws IllegalAlphabetException {
130    if(packing.getAlphabet() != symList.getAlphabet()) {
131      throw new IllegalAlphabetException(
132        "Can't pack with alphabet " + packing.getAlphabet() +
133        " and symbol list " + symList.getAlphabet()
134      );
135    }
136
137    try {
138      this.symsPerElement = (byte) (BITS_PER_ELEMENT / packing.wordSize());
139      this.packing = packing;
140      this.length = symList.length();
141      this.syms = new long[
142        length / symsPerElement +
143        ((length % symsPerElement == 0) ? 0 : 1)
144      ];
145      this.mask = calcMask(packing);
146      wordsize = packing.wordSize();
147
148      // pack the body of the sequence
149      int ii = 0;
150      for(int i = 0; i < (syms.length - 1); i++) {
151//        int ii = i * symsPerElement;
152        long l = 0;
153        int jj = 0;
154        for(int j = 0; j < symsPerElement; j++) {
155//          int jj = j * packing.wordSize();
156          long p = packing.pack(symList.symbolAt(ii + j + 1));
157          l |= (long) ((long) p << (long) jj);
158          jj += wordsize;
159        }
160        syms[i] = l;
161        ii += symsPerElement;
162      }
163
164      // pack the final word
165      if(syms.length > 0) {
166        long l = 0;
167        ii = (syms.length - 1) * symsPerElement;
168        int jMax = symList.length() % symsPerElement;
169        if(jMax == 0) {
170          jMax = symsPerElement;
171        }
172        for(int j = 0; j < jMax; j++) {
173          int jj = j * packing.wordSize();
174          long p = packing.pack(symList.symbolAt(ii + j + 1));
175          l |= (long) ((long) p << (long) jj);
176        }
177        syms[syms.length - 1] = l;
178      }
179    } catch (IllegalSymbolException ise) {
180      throw new AssertionFailure("Assertion Failure: Symbol got lost somewhere", ise);
181    }
182  }
183
184  /**
185   * <p>
186   * Create a new PackedSymbolList from an array of Symbols.
187   * </p>
188   *
189   * <p>
190   * This will create a new and independent SymbolList formed from the
191   * the symbol array.
192   * </p>
193   *
194   * @param packing the way to bit-pack symbols
195   * @param symbols an array of Symbols
196   * @param length the number of Symbols to process from symbols
197   * @param alfa the alphabet from which the Symbols are drawn
198   */
199  public PackedSymbolList(Packing packing, Symbol [] symbols, int length, Alphabet alfa)
200  throws IllegalAlphabetException,IllegalArgumentException {
201
202    // verify that the alphabet is one I can deal with.
203    if(packing.getAlphabet() != alfa) {
204      throw new IllegalAlphabetException(
205        "Can't pack with alphabet " + packing.getAlphabet() +
206        " and symbol list " + alfa
207      );
208    }
209
210    // check that array length makes sense
211    if (symbols.length < length) {
212      throw new IllegalArgumentException(
213        "Symbol array size is too small to get " + length +
214        "symbols from."
215      );
216    }
217
218    try {
219      this.symsPerElement = (byte) (BITS_PER_ELEMENT / packing.wordSize());
220      this.packing = packing;
221      this.length = length;
222      this.syms = new long[
223        length / symsPerElement +
224        ((length % symsPerElement == 0) ? 0 : 1)
225      ];
226      this.mask = calcMask(packing);
227      wordsize = packing.wordSize();
228
229      // pack the body of the sequence
230      int ii = 0;
231      for(int i = 0; i < (syms.length - 1); i++) {
232        long l = 0;
233        int jj = 0;
234        for(int j = 0; j < symsPerElement; j++) {
235          long p = packing.pack(symbols[ii + j]);
236          l |= (long) ((long) p << (long) jj);
237          jj += wordsize;
238        }
239        syms[i] = l;
240        ii += symsPerElement;
241      }
242
243      // pack the final word
244      if(syms.length > 0) {
245        long l = 0;
246        ii = (syms.length - 1) * symsPerElement;
247        int jMax = length % symsPerElement;
248        if(jMax == 0) {
249          jMax = symsPerElement;
250        }
251        for(int j = 0; j < jMax; j++) {
252          int jj = j * packing.wordSize();
253          long p = packing.pack(symbols[ii + j]);
254          l |= (long) ((long) p << (long) jj);
255        }
256        syms[syms.length - 1] = l;
257      }
258    } catch (IllegalSymbolException ise) {
259      throw new AssertionFailure("Assertion Failure: Symbol got lost somewhere",ise);
260    }
261  }
262
263  public Symbol symbolAt(int indx) {
264    indx--;
265
266    int word;
267    int offset;
268    long l;
269
270    synchronized(this) {
271      if ((indx < currentMin) || (indx > currentMax)) {
272        word = indx / symsPerElement;
273        offset = indx % symsPerElement;
274
275        currentMin = indx - offset;
276        currentMax = currentMin + symsPerElement - 1;
277        currentWord = syms[word];
278      }
279      else {
280        offset = indx - currentMin;
281      }
282
283      l = currentWord;
284    }
285
286    int jj = offset * wordsize;
287    try {
288      return packing.unpack((byte) ((l >> (long) jj) & mask));
289    } catch (IllegalSymbolException ise) {
290      throw new AssertionFailure("Could not unpack " + indx + " at " + "word" + "," + offset, ise);
291    }
292  }
293
294
295  private static byte calcMask(Packing packing) {
296    byte mask = 0;
297    for(int i = 0; i < packing.wordSize(); i++) {
298      mask |= 1 << i;
299    }
300    return mask;
301  }
302
303  /**
304   * <p>
305   * Return the long array within which the symbols are bit-packed.
306   * </p>
307   *
308   * <p>
309   * <em>Warning:</em> This is a risky developer method.
310   * This is the actual array that this object uses to store the bits
311   * representing symbols. You should not modify this in any way. If you do,
312   * you will modify the symbols returned by symbolAt(). This methd is
313   * provided primarily as an easy way for developers to extract the
314   * bit pattern for storage in such a way as it could be fetched later and
315   * fed into the appropriate constructor.
316   * </p>
317   *
318   * @return the actual long array used to store bit-packed symbols
319   */
320  public long[] getSyms() {
321    return syms;
322  }
323}