001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.bio.symbol; 022 023import org.biojava.utils.AssertionFailure; 024 025/** 026 * <p> 027 * A SymbolList that stores symbols as bit-patterns in an array of longs. 028 * </p> 029 * 030 * <p> 031 * Bit-packed symbol lists are space efficient compared to the usual pointer 032 * storage model employed by implementations like SimpleSymbolList. This 033 * comes at the cost of encoding/decoding symbols from the storage. In 034 * practice, the decrease in memory when storing large sequences makes 035 * applications go quicker because of issues like page swapping. 036 * </p> 037 * 038 * <p> 039 * Symbols can be mapped to and from bit-patterns. The Pattern interface 040 * encapsulates this. A SymbolList can then be stored by writing these 041 * bit-patterns into memory. This implementation stores the bits 042 * in the long elements of an array. The first symbol will be packed into 043 * bits 0 through packing.wordLength()-1 of the long at index 0. 044 * <p> 045 * 046 * <h2>Example Usage</h2> 047 * <pre> 048 * SymbolList symL = ...; 049 * SymbolList packed = new PackedSymbolList( 050 * PackingFactory.getPacking(symL.getAlphabet(), true), 051 * symL 052 * ); 053 * </pre> 054 * 055 * @author Matthew Pocock 056 * @author David Huen (new constructor for Symbol arrays and some speedups) 057 */ 058public class PackedSymbolList 059 extends 060 AbstractSymbolList 061 implements 062 java.io.Serializable 063{ 064 private static final byte BITS_PER_ELEMENT = 64; 065 066 private final Packing packing; 067 private final long[] syms; 068 private final int length; 069 private final byte symsPerElement; 070 private final byte mask; 071 072 // scratch area for optimisations 073 // WARNING: these variables constitute an opportunity 074 // for things to go wrong when doing multithreaded access 075 // via symbolAt(). Keep SymbolAt() synchronized so they 076 // don't get changed during a lookup! Naaasssty. 077 private int currentMin = Integer.MAX_VALUE; 078 private int currentMax = Integer.MIN_VALUE; 079 private long currentWord; 080 private int wordsize; 081 082 public Alphabet getAlphabet() { 083 return packing.getAlphabet(); 084 } 085 086 public int length() { 087 return length; 088 } 089 090 /** 091 * <p> 092 * Create a new PackedSymbolList directly from a bit pattern. 093 * </p> 094 * 095 * <p> 096 * <em>Warning:</em> This is a risky developer method. 097 * You must be sure that the syms array is packed in a 098 * way that is consistent with the packing. Also, it is your 099 * responsibility to ensure that the length is sensible.</em> 100 * </p> 101 * 102 * @param packing the Packing used 103 * @param syms a long array containing already packed symbols 104 * @param length the length of the sequence packed in symbols 105 */ 106 public PackedSymbolList(Packing packing, long[] syms, int length) { 107 this.symsPerElement = (byte) (BITS_PER_ELEMENT / packing.wordSize()); 108 this.packing = packing; 109 this.syms = syms; 110 this.length = length; 111 this.mask = calcMask(packing); 112 wordsize = packing.wordSize(); 113 } 114 115 /** 116 * <p> 117 * Create a new PackedSymbolList as a packed copy of another symbol list. 118 * </p> 119 * 120 * <p> 121 * This will create a new and independent symbol list that is a copy of 122 * the symbols in symList. Both lists can be modified independently. 123 * </p> 124 * 125 * @param packing the way to bit-pack symbols 126 * @param symList the SymbolList to copy 127 */ 128 public PackedSymbolList(Packing packing, SymbolList symList) 129 throws IllegalAlphabetException { 130 if(packing.getAlphabet() != symList.getAlphabet()) { 131 throw new IllegalAlphabetException( 132 "Can't pack with alphabet " + packing.getAlphabet() + 133 " and symbol list " + symList.getAlphabet() 134 ); 135 } 136 137 try { 138 this.symsPerElement = (byte) (BITS_PER_ELEMENT / packing.wordSize()); 139 this.packing = packing; 140 this.length = symList.length(); 141 this.syms = new long[ 142 length / symsPerElement + 143 ((length % symsPerElement == 0) ? 0 : 1) 144 ]; 145 this.mask = calcMask(packing); 146 wordsize = packing.wordSize(); 147 148 // pack the body of the sequence 149 int ii = 0; 150 for(int i = 0; i < (syms.length - 1); i++) { 151// int ii = i * symsPerElement; 152 long l = 0; 153 int jj = 0; 154 for(int j = 0; j < symsPerElement; j++) { 155// int jj = j * packing.wordSize(); 156 long p = packing.pack(symList.symbolAt(ii + j + 1)); 157 l |= (long) ((long) p << (long) jj); 158 jj += wordsize; 159 } 160 syms[i] = l; 161 ii += symsPerElement; 162 } 163 164 // pack the final word 165 if(syms.length > 0) { 166 long l = 0; 167 ii = (syms.length - 1) * symsPerElement; 168 int jMax = symList.length() % symsPerElement; 169 if(jMax == 0) { 170 jMax = symsPerElement; 171 } 172 for(int j = 0; j < jMax; j++) { 173 int jj = j * packing.wordSize(); 174 long p = packing.pack(symList.symbolAt(ii + j + 1)); 175 l |= (long) ((long) p << (long) jj); 176 } 177 syms[syms.length - 1] = l; 178 } 179 } catch (IllegalSymbolException ise) { 180 throw new AssertionFailure("Assertion Failure: Symbol got lost somewhere", ise); 181 } 182 } 183 184 /** 185 * <p> 186 * Create a new PackedSymbolList from an array of Symbols. 187 * </p> 188 * 189 * <p> 190 * This will create a new and independent SymbolList formed from the 191 * the symbol array. 192 * </p> 193 * 194 * @param packing the way to bit-pack symbols 195 * @param symbols an array of Symbols 196 * @param length the number of Symbols to process from symbols 197 * @param alfa the alphabet from which the Symbols are drawn 198 */ 199 public PackedSymbolList(Packing packing, Symbol [] symbols, int length, Alphabet alfa) 200 throws IllegalAlphabetException,IllegalArgumentException { 201 202 // verify that the alphabet is one I can deal with. 203 if(packing.getAlphabet() != alfa) { 204 throw new IllegalAlphabetException( 205 "Can't pack with alphabet " + packing.getAlphabet() + 206 " and symbol list " + alfa 207 ); 208 } 209 210 // check that array length makes sense 211 if (symbols.length < length) { 212 throw new IllegalArgumentException( 213 "Symbol array size is too small to get " + length + 214 "symbols from." 215 ); 216 } 217 218 try { 219 this.symsPerElement = (byte) (BITS_PER_ELEMENT / packing.wordSize()); 220 this.packing = packing; 221 this.length = length; 222 this.syms = new long[ 223 length / symsPerElement + 224 ((length % symsPerElement == 0) ? 0 : 1) 225 ]; 226 this.mask = calcMask(packing); 227 wordsize = packing.wordSize(); 228 229 // pack the body of the sequence 230 int ii = 0; 231 for(int i = 0; i < (syms.length - 1); i++) { 232 long l = 0; 233 int jj = 0; 234 for(int j = 0; j < symsPerElement; j++) { 235 long p = packing.pack(symbols[ii + j]); 236 l |= (long) ((long) p << (long) jj); 237 jj += wordsize; 238 } 239 syms[i] = l; 240 ii += symsPerElement; 241 } 242 243 // pack the final word 244 if(syms.length > 0) { 245 long l = 0; 246 ii = (syms.length - 1) * symsPerElement; 247 int jMax = length % symsPerElement; 248 if(jMax == 0) { 249 jMax = symsPerElement; 250 } 251 for(int j = 0; j < jMax; j++) { 252 int jj = j * packing.wordSize(); 253 long p = packing.pack(symbols[ii + j]); 254 l |= (long) ((long) p << (long) jj); 255 } 256 syms[syms.length - 1] = l; 257 } 258 } catch (IllegalSymbolException ise) { 259 throw new AssertionFailure("Assertion Failure: Symbol got lost somewhere",ise); 260 } 261 } 262 263 public Symbol symbolAt(int indx) { 264 indx--; 265 266 int word; 267 int offset; 268 long l; 269 270 synchronized(this) { 271 if ((indx < currentMin) || (indx > currentMax)) { 272 word = indx / symsPerElement; 273 offset = indx % symsPerElement; 274 275 currentMin = indx - offset; 276 currentMax = currentMin + symsPerElement - 1; 277 currentWord = syms[word]; 278 } 279 else { 280 offset = indx - currentMin; 281 } 282 283 l = currentWord; 284 } 285 286 int jj = offset * wordsize; 287 try { 288 return packing.unpack((byte) ((l >> (long) jj) & mask)); 289 } catch (IllegalSymbolException ise) { 290 throw new AssertionFailure("Could not unpack " + indx + " at " + "word" + "," + offset, ise); 291 } 292 } 293 294 295 private static byte calcMask(Packing packing) { 296 byte mask = 0; 297 for(int i = 0; i < packing.wordSize(); i++) { 298 mask |= 1 << i; 299 } 300 return mask; 301 } 302 303 /** 304 * <p> 305 * Return the long array within which the symbols are bit-packed. 306 * </p> 307 * 308 * <p> 309 * <em>Warning:</em> This is a risky developer method. 310 * This is the actual array that this object uses to store the bits 311 * representing symbols. You should not modify this in any way. If you do, 312 * you will modify the symbols returned by symbolAt(). This methd is 313 * provided primarily as an easy way for developers to extract the 314 * bit pattern for storage in such a way as it could be fetched later and 315 * fed into the appropriate constructor. 316 * </p> 317 * 318 * @return the actual long array used to store bit-packed symbols 319 */ 320 public long[] getSyms() { 321 return syms; 322 } 323}