001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.seq.io; 023 024import java.io.IOException; 025import java.util.ArrayList; 026import java.util.List; 027 028import org.biojava.bio.BioException; 029import org.biojava.bio.symbol.Alphabet; 030import org.biojava.bio.symbol.ChunkedSymbolList; 031import org.biojava.bio.symbol.IllegalAlphabetException; 032import org.biojava.bio.symbol.IllegalSymbolException; 033import org.biojava.bio.symbol.SimpleSymbolList; 034import org.biojava.bio.symbol.SimpleSymbolListFactory; 035import org.biojava.bio.symbol.Symbol; 036import org.biojava.bio.symbol.SymbolList; 037import org.biojava.bio.symbol.SymbolListFactory; 038import org.biojava.bio.symbol.SymbolListViews; 039 040/** 041 * class that makes ChunkedSymbolLists with the chunks 042 * implemented as SymbolLists themselves. 043 * <p> 044 * The advantage is that those SymbolLists can be packed 045 * implementations. 046 * <p> 047 * You can build a SequenceBuilderFactory to create a packed chunked sequence from 048 * an input file without making an intermediate symbol list with:- 049 * <pre> 050 * public class PackedChunkedListFactory implements SequenceBuilderFactory 051 * { 052 * public SequenceBuilder makeSequenceBuilder() 053 * { 054 * return new SequenceBuilderBase() { 055 * private ChunkedSymbolListFactory chunker = new ChunkedSymbolListFactory(new PackedSymbolListFactory(true)); 056 * 057 * // deal with symbols 058 * public void addSymbols(Alphabet alpha, Symbol[] syms, int pos, int len) 059 * throws IllegalAlphabetException 060 * { 061 * chunker.addSymbols(alpha, syms, pos, len); 062 * } 063 * 064 * // make the sequence 065 * public Sequence makeSequence() 066 * { 067 * try { 068 * // make the SymbolList 069 * SymbolList symbols = chunker.makeSymbolList(); 070 * seq = new SimpleSequence(symbols, uri, name, annotation); 071 * 072 * // call superclass method 073 * return super.makeSequence(); 074 * } 075 * catch (IllegalAlphabetException iae) { 076 * throw new BioError("couldn't create symbol list"); 077 * } 078 * } 079 * }; 080 * } 081 * } 082 * </pre> 083 * 084 * <p> 085 * Then reading in FASTA files can be done with something like:- 086 * <p> 087 * <pre> 088 * SequenceIterator seqI = new StreamReader(br, new FastaFormat(), 089 * DNATools.getDNA().getTokenization("token"), 090 * new PackedChunkedListFactory() ); 091 * </pre> 092 * <p> 093 * Blend to suit taste. 094 * <p> 095 * Alternatively, you can input Symbols to the factory with addSymbols 096 * make the sequence eventually with makeSymbolList. 097 * <p> 098 * <b>NOTE:</b> An improvement has been introduced where an internal 099 * default SymbolList factory is used for small sequences. This 100 * implementation allows for faster SymbolList creation and access 101 * for small sequences while allowing a more space-efficient 102 * implementation to be selected for large sequences. 103 * <p> 104 * <b>NOTE:</b> This class is inherantly not threadsafe. You should create one 105 * instance for each symbol list you wish to manufacture, and then you should 106 * throw that instance away. 107 * 108 * @author David Huen 109 */ 110public class ChunkedSymbolListFactory 111{ 112 /** 113 * operating mode 114 */ 115 private final static int AUTO_SELECT = 1; 116 private final static int SUPPLIED_FACTORY = 2; 117 private final static int CHUNK_SIZE = 1<<14; 118 119 private final SymbolListFactory userSymListFactory; 120 private SymbolListFactory currSymListFactory = null; 121 private Alphabet alfa; 122 123 // management variables for chunks 124 private Symbol [] headChunk; 125 private int headChunkPos = 0; 126 private List chunkL = new ArrayList(); 127 128 private int symCount = 0; 129 private int opMode = SUPPLIED_FACTORY; 130 private int threshold = 1<<20; 131 132 // interlocks 133 // you can only use symbolAt() or make(), not both. 134 private boolean canDoMake = true; 135 136 /** 137 * @param symListFactory class which produces the SymbolLists that are used 138 * to store the chunked symbols. 139 */ 140 public ChunkedSymbolListFactory(SymbolListFactory symListFactory) 141 { 142 opMode = SUPPLIED_FACTORY; 143 this.userSymListFactory = symListFactory; 144 currSymListFactory = symListFactory; 145 } 146 147 /** 148 * @param userSymListFactory User-supplied class which produces the SymbolLists 149 * that are used to store the chunked symbols (only used when the chunked list 150 * to be created is larger than threshold. 151 * @param threshold the size of the SymbolList beyond which the userSymListFactory 152 * is used. Below that, the internal default SymbolList factory is used. 153 */ 154 public ChunkedSymbolListFactory(SymbolListFactory userSymListFactory, int threshold) 155 { 156 opMode = AUTO_SELECT; 157 158 // set threshold to the user-specified value if that's what's requested 159 if (threshold > 0) this.threshold = threshold; 160 161 this.userSymListFactory = userSymListFactory; 162 163 // the default factory is the SimpleSymbolList 164 currSymListFactory = new SimpleSymbolListFactory(); 165 } 166 167 /** 168 * tool to construct the SymbolList by adding Symbols. 169 * Note that this class is not thread-safe. Also, it 170 * can only assemble one SymbolList at a time. And the 171 * composite formed by adding Symbols must not have interstitial 172 * missing Symbols. 173 */ 174 public void addSymbols(Alphabet alfa, Symbol[] syms, int pos, int len) 175 throws IllegalArgumentException, IllegalAlphabetException 176 { 177 // lock out make() 178 canDoMake = false; 179 180 // check alphabet first 181 if (this.alfa == null) { 182 this.alfa = alfa; 183 } 184 else { 185 if (this.alfa != alfa) { 186 throw new IllegalAlphabetException("Alphabet changed!"); 187 } 188 } 189 190 // increment count 191 symCount += len; 192 193 // if count reaches threshold, initiate conversion but do it 194 // once only and only if we are in AUTO_SELECT mode 195 if ((opMode == AUTO_SELECT) 196 && (currSymListFactory != userSymListFactory) 197 && (symCount > threshold)) useSuppliedSymListFactory(); 198 199 // see if I need to create the first chunk 200 if (headChunk == null) { 201 headChunk = new Symbol[CHUNK_SIZE]; 202 headChunkPos = 0; 203 } 204 205 // transfer over new symbols 206 int ipos = 0; 207 208 while (ipos < len) { 209 // is the current chunk full? 210 if (headChunkPos == CHUNK_SIZE) { 211 // yes. Verify there are no surprises! 212 for (int i= 0; i < CHUNK_SIZE; i++) { 213 if (headChunk[i] == null) { 214 // there's an interstitial null Symbol! 215 throw new IllegalArgumentException("symbols supplied are not tiling contiguously."); 216 } 217 } 218 219 // the chunk is alright, lets stash it. 220 chunkL.add(currSymListFactory.makeSymbolList(headChunk, CHUNK_SIZE, alfa)); 221 headChunkPos = 0; 222 headChunk = new Symbol[CHUNK_SIZE]; 223 } 224 225 int read = Math.min(len - ipos, CHUNK_SIZE - headChunkPos); 226 System.arraycopy(syms, pos + ipos, headChunk, headChunkPos, read); 227 ipos += read; 228 headChunkPos += read; 229 } 230 } 231 232 private void clearState() 233 { 234 this.canDoMake = true; 235 headChunk = null; 236 headChunkPos = 0; 237 symCount = 0; 238 239 chunkL = new ArrayList(); 240 alfa = null; 241 242 // if auto-select return to default symbol list factory 243 if (opMode == AUTO_SELECT) { 244 // the default factory is the SimpleSymbolList 245 currSymListFactory = new SimpleSymbolListFactory(); 246 } 247 } 248 249 /** 250 * Call this to convert from default SymbolList implementation 251 * to user-supplied implementation. 252 */ 253 public void useSuppliedSymListFactory() 254 { 255 // set the symbolListFactory to the user-supplied one 256 currSymListFactory = userSymListFactory; 257 258 // go thru' converting all accumulated chunks to 259 // user supplied implementation 260 int size = chunkL.size(); 261 ArrayList temp = new ArrayList(size); 262 263 for (int i = 0; i < size; i++) { 264 // get sequence length then transfer contents 265 // into a Symbol[] for reencoding. 266 SymbolList symList = (SymbolList) chunkL.get(i); 267 int symListLen = symList.length(); 268 Symbol [] symArray; 269 270 if (symList instanceof org.biojava.bio.symbol.SimpleSymbolList) { 271 symArray = ((SimpleSymbolList) symList).getSymbolArray(); 272 } 273 else { 274 symArray = new Symbol[symListLen]; 275 for (int j =1; j <= symList.length(); j++) { 276 symArray[j-1] = symList.symbolAt(j); 277 } 278 } 279 280 // reencode Symbol array: symListLen and CHUNK_SIZE should match except for final chunk. 281 // put it into new List 282 try { 283 temp.add(currSymListFactory.makeSymbolList(symArray, symListLen, alfa)); 284 } 285 catch (IllegalAlphabetException iae) { 286 // this should be impossible!!!!! 287 } 288 } 289 290 // swap over to the new symbol list array 291 chunkL = temp; 292 } 293 294 /** 295 * Converts accumulated Symbols to a SymbolList 296 */ 297 public SymbolList makeSymbolList() 298 throws IllegalAlphabetException 299 { 300 try { 301 // we have now acquired all symbols 302 // the situation can be;- 303 // i) there are no Symbols; then chunkL.size() == 0. 304 // ii) one chunk worth, not full: chunkL.size() == 0. headChunkPos != 0; 305 // iii) one chunk worth, full: chunkL.size() == 0. headChunkPos == CHUNK_SIZE; 306 // iv) multiple chunks, all full: there'll be an unpacked chunk at headChunk; 307 // v) multiple chunks, last part filled: chunkL.size() != 0. Unpacked chunk at headChunk; 308 309 // so basically, if chunkL.size() != 0, there will be an unpacked chunk to pack 310 // unless headChunkPos == 0; 311 312 // do we have any chunks stashed? 313 if (chunkL.size() == 0) { 314 // no. The only chunk if any is at headChunk. 315 if (headChunkPos == 0) { 316 // no symbols! 317 return SymbolListViews.emptyList(alfa); 318 } 319 else { 320 // we do have ONE chunk to deal with. 321 // reduce its size as necessary. 322 if (headChunkPos < CHUNK_SIZE) { 323 Symbol[] oldChunk = headChunk; 324 headChunk = new Symbol[headChunkPos]; 325 System.arraycopy(oldChunk, 0, headChunk, 0, headChunkPos); 326 } 327 328 // now return a SubArraySymbolList 329 return new SimpleSymbolList(headChunk, headChunkPos, alfa); 330 } 331 } 332 else { 333 // we have multiple chunks. 334 335 // do we have an unstashed chunk? 336 if (headChunkPos != 0) { 337 // yes, let's stash it 338 chunkL.add(currSymListFactory.makeSymbolList(headChunk, headChunkPos, alfa)); 339 } 340 341 // everything we want to put away is now in chunkL 342 // create an array to stash the SymbolLists into 343 SymbolList [] symListArray = new SymbolList[chunkL.size()]; 344 345 // and fill it with the contents of our List 346 for (int cnum=0; cnum < chunkL.size(); ++cnum) { 347 symListArray[cnum] = (SymbolList) chunkL.get(cnum); 348 } 349 350 // now let's return a SymbolList to the user 351 int length = (chunkL.size() - 1) * CHUNK_SIZE + headChunkPos; 352 return new ChunkedSymbolList(symListArray, CHUNK_SIZE, length, alfa); 353 } 354 } 355 finally { 356 clearState(); 357 } 358 } 359 360 /** 361 * Method to create a Sequence with a SymbolReader. (does anyone use this???> 362 * 363 * readme: As of 12/11/03, there are no references to this method in the codebase 364 */ 365 public SymbolList make(SymbolReader sr) 366 throws IOException, IllegalSymbolException, IllegalAlphabetException, BioException 367 { 368 // check interlock 369 if(!canDoMake) throw new BioException("you can't use make() and addSymbol() simultaneously."); 370 371 chunkL = new ArrayList(); 372 headChunk = new Symbol[CHUNK_SIZE]; 373 headChunkPos = 0; 374 alfa = sr.getAlphabet(); 375 376 while (sr.hasMoreSymbols()) { 377 // is chunk full? 378 if (headChunkPos == CHUNK_SIZE) { 379 // yes. Stash away this chunk in packed form. 380 chunkL.add(currSymListFactory.makeSymbolList(headChunk, CHUNK_SIZE, sr.getAlphabet())); 381 headChunkPos = 0; 382 } 383 384 // grab up all available Symbols up to end of chunk. 385 int read = sr.readSymbols(headChunk, headChunkPos, CHUNK_SIZE - headChunkPos); 386 headChunkPos += read; 387 } 388 389 clearState(); 390 return makeSymbolList(); 391 } 392}