001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.seq.io;
023
024import java.io.IOException;
025import java.util.ArrayList;
026import java.util.List;
027
028import org.biojava.bio.BioException;
029import org.biojava.bio.symbol.Alphabet;
030import org.biojava.bio.symbol.ChunkedSymbolList;
031import org.biojava.bio.symbol.IllegalAlphabetException;
032import org.biojava.bio.symbol.IllegalSymbolException;
033import org.biojava.bio.symbol.SimpleSymbolList;
034import org.biojava.bio.symbol.SimpleSymbolListFactory;
035import org.biojava.bio.symbol.Symbol;
036import org.biojava.bio.symbol.SymbolList;
037import org.biojava.bio.symbol.SymbolListFactory;
038import org.biojava.bio.symbol.SymbolListViews;
039
040/**
041 * class that makes ChunkedSymbolLists with the chunks
042 * implemented as SymbolLists themselves.
043 * <p>
044 * The advantage is that those SymbolLists can be packed
045 * implementations.
046 * <p>
047 * You can build a SequenceBuilderFactory to create a packed chunked sequence from
048 * an input file without making an intermediate symbol list with:-
049 * <pre>
050 * public class PackedChunkedListFactory implements SequenceBuilderFactory
051 * {
052 *   public SequenceBuilder makeSequenceBuilder()
053 *   {
054 *     return new SequenceBuilderBase() {
055 *       private ChunkedSymbolListFactory chunker = new ChunkedSymbolListFactory(new PackedSymbolListFactory(true));
056 *
057 *       // deal with symbols
058 *       public void addSymbols(Alphabet alpha, Symbol[] syms, int pos, int len)
059 *         throws IllegalAlphabetException
060 *       {
061 *         chunker.addSymbols(alpha, syms, pos, len);
062 *       }
063 *
064 *       // make the sequence
065 *       public Sequence makeSequence()
066 *       {
067 *         try {
068 *           // make the SymbolList
069 *           SymbolList symbols = chunker.makeSymbolList();
070 *           seq = new SimpleSequence(symbols, uri, name, annotation);
071 *
072 *           // call superclass method
073 *           return super.makeSequence();
074 *         }
075 *         catch (IllegalAlphabetException iae) {
076 *           throw new BioError("couldn't create symbol list");
077 *         }
078 *       }
079 *     };
080 *   }
081 * }
082 * </pre>
083 *
084 * <p>
085 * Then reading in FASTA files can be done with something like:-
086 * <p>
087 * <pre>
088 * SequenceIterator seqI = new StreamReader(br, new FastaFormat(),
089 *     DNATools.getDNA().getTokenization("token"),
090 *     new PackedChunkedListFactory() );
091 * </pre>
092 * <p>
093 * Blend to suit taste.
094 * <p>
095 * Alternatively, you can input Symbols to the factory with addSymbols
096 * make the sequence eventually with makeSymbolList.
097 * <p>
098 * <b>NOTE:</b> An improvement has been introduced where an internal
099 * default SymbolList factory is used for small sequences.  This
100 * implementation allows for faster SymbolList creation and access
101 * for small sequences while allowing a more space-efficient
102 * implementation to be selected for large sequences.
103 * <p>
104 * <b>NOTE:</b> This class is inherantly not threadsafe. You should create one
105 * instance for each symbol list you wish to manufacture, and then you should
106 * throw that instance away.
107 *
108 * @author David Huen
109 */
110public class ChunkedSymbolListFactory
111{
112    /**
113     * operating mode
114     */
115    private final static int AUTO_SELECT = 1;
116    private final static int SUPPLIED_FACTORY = 2;
117    private final static int CHUNK_SIZE = 1<<14;
118
119    private final SymbolListFactory userSymListFactory;
120    private SymbolListFactory currSymListFactory = null;
121    private Alphabet alfa;
122
123    // management variables for chunks
124    private Symbol [] headChunk;
125    private int headChunkPos = 0;
126    private List chunkL = new ArrayList();
127
128    private int symCount = 0;
129    private int opMode = SUPPLIED_FACTORY;
130    private int threshold = 1<<20;
131
132    // interlocks
133    // you can only use symbolAt() or make(), not both.
134    private boolean canDoMake = true;
135
136  /**
137     * @param symListFactory class which produces the SymbolLists that are used
138     *        to store the chunked symbols.
139     */
140    public ChunkedSymbolListFactory(SymbolListFactory symListFactory)
141    {
142        opMode = SUPPLIED_FACTORY;
143        this.userSymListFactory = symListFactory;
144        currSymListFactory = symListFactory;
145    }
146
147    /**
148     * @param userSymListFactory User-supplied class which produces the SymbolLists
149     * that are used to store the chunked symbols (only used when the chunked list
150     * to be created is larger than threshold.
151     * @param threshold the size of the SymbolList beyond which the userSymListFactory
152     * is used.  Below that, the internal default SymbolList factory is used.
153     */
154    public ChunkedSymbolListFactory(SymbolListFactory userSymListFactory, int threshold)
155    {
156        opMode = AUTO_SELECT;
157
158        // set threshold to the user-specified value if that's what's requested
159        if (threshold > 0) this.threshold = threshold;
160
161        this.userSymListFactory = userSymListFactory;
162
163        // the default factory is the SimpleSymbolList
164        currSymListFactory = new SimpleSymbolListFactory();
165    }
166
167    /**
168     * tool to construct the SymbolList by adding Symbols.
169     * Note that this class is not thread-safe.  Also, it
170     * can only assemble one SymbolList at a time.  And the
171     * composite formed by adding Symbols must not have interstitial
172     * missing Symbols.
173     */
174    public void addSymbols(Alphabet alfa, Symbol[] syms, int pos, int len)
175        throws IllegalArgumentException, IllegalAlphabetException
176    {
177        // lock out make()
178        canDoMake = false;
179
180        // check alphabet first
181        if (this.alfa == null) {
182            this.alfa = alfa;
183        }
184        else {
185            if (this.alfa != alfa) {
186                throw new IllegalAlphabetException("Alphabet changed!");
187            }
188        }
189
190        // increment count
191        symCount += len;
192
193        // if count reaches threshold, initiate conversion but do it
194        // once only and only if we are in AUTO_SELECT mode
195        if ((opMode == AUTO_SELECT)
196            && (currSymListFactory != userSymListFactory)
197            && (symCount > threshold)) useSuppliedSymListFactory();
198
199        // see if I need to create the first chunk
200        if (headChunk == null) {
201            headChunk = new Symbol[CHUNK_SIZE];
202            headChunkPos = 0;
203        }
204
205        // transfer over new symbols
206        int ipos = 0;
207
208        while (ipos < len) {
209            // is the current chunk full?
210            if (headChunkPos == CHUNK_SIZE) {
211                // yes.  Verify there are no surprises!
212                for (int i= 0; i < CHUNK_SIZE; i++) {
213                    if (headChunk[i] == null) {
214                        // there's an interstitial null Symbol!
215                        throw new IllegalArgumentException("symbols supplied are not tiling contiguously.");
216                    }
217                }
218
219                // the chunk is alright, lets stash it.
220                chunkL.add(currSymListFactory.makeSymbolList(headChunk, CHUNK_SIZE, alfa));
221                headChunkPos = 0;
222                headChunk = new Symbol[CHUNK_SIZE];
223            }
224
225            int read = Math.min(len - ipos, CHUNK_SIZE - headChunkPos);
226            System.arraycopy(syms, pos + ipos, headChunk, headChunkPos, read);
227            ipos += read;
228            headChunkPos += read;
229        }
230    }
231
232    private void clearState()
233    {
234        this.canDoMake = true;
235        headChunk = null;
236        headChunkPos = 0;
237        symCount = 0;
238
239        chunkL = new ArrayList();
240        alfa = null;
241
242        // if auto-select return to default symbol list factory
243        if (opMode == AUTO_SELECT) {
244            // the default factory is the SimpleSymbolList
245            currSymListFactory = new SimpleSymbolListFactory();
246        }
247    }
248
249    /**
250     * Call this to convert from default SymbolList implementation
251     * to user-supplied implementation.
252     */
253    public void useSuppliedSymListFactory()
254    {
255        // set the symbolListFactory to the user-supplied one
256        currSymListFactory = userSymListFactory;
257
258        // go thru' converting all accumulated chunks to
259        // user supplied implementation
260        int size = chunkL.size();
261        ArrayList temp = new ArrayList(size);
262
263        for (int i = 0; i < size; i++) {
264            // get sequence length then transfer contents
265            // into a Symbol[] for reencoding.
266            SymbolList symList = (SymbolList) chunkL.get(i);
267            int symListLen = symList.length();
268            Symbol [] symArray;
269
270            if (symList instanceof org.biojava.bio.symbol.SimpleSymbolList) {
271                symArray = ((SimpleSymbolList) symList).getSymbolArray();
272            }
273            else {
274                symArray = new Symbol[symListLen];
275                for (int j =1; j <= symList.length(); j++) {
276                    symArray[j-1] = symList.symbolAt(j);
277                }
278            }
279
280            // reencode Symbol array: symListLen and CHUNK_SIZE should match except for final chunk.
281            // put it into new List
282            try {
283                temp.add(currSymListFactory.makeSymbolList(symArray, symListLen, alfa));
284            }
285            catch (IllegalAlphabetException iae) {
286                // this should be impossible!!!!!
287            }
288        }
289
290        // swap over to the new symbol list array
291        chunkL = temp;
292    }
293
294    /**
295     * Converts accumulated Symbols to a SymbolList
296     */
297    public SymbolList makeSymbolList()
298        throws IllegalAlphabetException
299    {
300        try {
301        // we have now acquired all symbols
302        // the situation can be;-
303        // i) there are no Symbols; then chunkL.size() == 0.
304        // ii) one chunk worth, not full: chunkL.size() == 0. headChunkPos != 0;
305        // iii) one chunk worth, full: chunkL.size() == 0. headChunkPos == CHUNK_SIZE;
306        // iv) multiple chunks, all full: there'll be an unpacked chunk at headChunk;
307        // v) multiple chunks, last part filled: chunkL.size() != 0. Unpacked chunk at headChunk;
308
309        // so basically, if chunkL.size() != 0, there will be an unpacked chunk to pack
310        // unless headChunkPos == 0;
311
312        // do we have any chunks stashed?
313        if (chunkL.size() == 0) {
314            // no.  The only chunk if any is at headChunk.
315            if (headChunkPos == 0) {
316                // no symbols!
317                return SymbolListViews.emptyList(alfa);
318            }
319            else {
320                // we do have ONE chunk to deal with.
321                // reduce its size as necessary.
322                if (headChunkPos < CHUNK_SIZE) {
323                    Symbol[] oldChunk = headChunk;
324                    headChunk = new Symbol[headChunkPos];
325                    System.arraycopy(oldChunk, 0, headChunk, 0, headChunkPos);
326                }
327
328                // now return a SubArraySymbolList
329                return new SimpleSymbolList(headChunk, headChunkPos, alfa);
330            }
331        }
332        else {
333            // we have multiple chunks.
334
335            // do we have an unstashed chunk?
336            if (headChunkPos != 0) {
337                // yes, let's stash it
338                chunkL.add(currSymListFactory.makeSymbolList(headChunk, headChunkPos, alfa));
339            }
340
341            // everything we want to put away is now in chunkL
342            // create an array to stash the SymbolLists into
343            SymbolList [] symListArray = new SymbolList[chunkL.size()];
344
345            // and fill it with the contents of our List
346            for (int cnum=0; cnum < chunkL.size(); ++cnum) {
347                symListArray[cnum] = (SymbolList) chunkL.get(cnum);
348            }
349
350            // now let's return a SymbolList to the user
351            int length = (chunkL.size() - 1) * CHUNK_SIZE + headChunkPos;
352            return new ChunkedSymbolList(symListArray, CHUNK_SIZE, length, alfa);
353        }
354        }
355        finally {
356            clearState();
357        }
358    }
359
360    /**
361     * Method to create a Sequence with a SymbolReader. (does anyone use this???>
362     *
363     * readme: As of 12/11/03, there are no references to this method in the codebase
364     */
365    public SymbolList make(SymbolReader sr)
366        throws IOException, IllegalSymbolException, IllegalAlphabetException, BioException
367    {
368        // check interlock
369        if(!canDoMake) throw new BioException("you can't use make() and addSymbol() simultaneously.");
370
371        chunkL = new ArrayList();
372        headChunk = new Symbol[CHUNK_SIZE];
373        headChunkPos = 0;
374        alfa = sr.getAlphabet();
375
376        while (sr.hasMoreSymbols()) {
377            // is chunk full?
378            if (headChunkPos == CHUNK_SIZE) {
379                // yes.  Stash away this chunk in packed form.
380                chunkL.add(currSymListFactory.makeSymbolList(headChunk, CHUNK_SIZE, sr.getAlphabet()));
381                headChunkPos = 0;
382            }
383
384            // grab up all available Symbols up to end of chunk.
385            int read = sr.readSymbols(headChunk, headChunkPos, CHUNK_SIZE - headChunkPos);
386            headChunkPos += read;
387        }
388
389        clearState();
390        return makeSymbolList();
391    }
392}