001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.seq.db;
023
024import java.io.BufferedReader;
025import java.io.File;
026import java.io.FileReader;
027import java.io.IOException;
028import java.io.RandomAccessFile;
029import java.io.Serializable;
030import java.util.Iterator;
031import java.util.Set;
032import java.util.Vector;
033
034import org.biojava.bio.BioException;
035import org.biojava.bio.seq.Sequence;
036import org.biojava.bio.seq.SequenceIterator;
037import org.biojava.bio.seq.io.SequenceBuilder;
038import org.biojava.bio.seq.io.SequenceBuilderFactory;
039import org.biojava.bio.seq.io.SequenceBuilderFilter;
040import org.biojava.bio.seq.io.SequenceFormat;
041import org.biojava.bio.seq.io.SymbolTokenization;
042import org.biojava.bio.symbol.Alphabet;
043import org.biojava.bio.symbol.Symbol;
044import org.biojava.utils.ChangeEvent;
045import org.biojava.utils.ChangeSupport;
046import org.biojava.utils.ChangeVetoException;
047import org.biojava.utils.ParseErrorListener;
048import org.biojava.utils.ParseErrorSource;
049import org.biojava.utils.io.CountedBufferedReader;
050import org.biojava.utils.io.RandomAccessReader;
051
052/**
053 * <p>
054 * This class implements SequenceDB on top of a set of sequence files
055 * and sequence offsets within these files.
056 * </p>
057 *
 * <p> This class is primarily responsible for managing the sequence
 * IO, such as calculating the sequence file offsets, and parsing
 * individual sequences based upon file offsets. The actual persistent
 * storage of all this information is delegated to an instance of
 * <code>IndexStore</code>, such as TabIndexStore.
 * </p>
064 *
065 * <pre>
066 * // create a new index store and populate it
067 * // this may take some time
068 * TabIndexStore indexStore = new TabIndexStore(
069 *   storeFile, indexFile, dbName,
070 *   format, sbFactory, symbolParser );
071 * IndexedSequenceDB seqDB = new IndexedSequenceDB(indexStore);
072 *
 * for(int i = 0; i &lt; files.length; i++) {
 *   seqDB.addFile(files[i]);
 * }
076 *
077 * // load an existing index store and fetch a sequence
078 * // this should be quite quick
079 * TabIndexStore indexStore = TabIndexStore.open(storeFile);
080 * SequenceDB seqDB = new IndexedSequenceDB(indexStore);
081 * Sequence seq = seqDB.getSequence(id);
082 * </pre>
083 * 
 * <p>
 * Note: We may be able to improve the indexing speed further by
 * discarding all feature creation &amp; annotation requests during index
 * parsing.
 * </p>
089 *
090 * @author Matthew Pocock
091 * @author Thomas Down
092 * @author Keith James
093 * @see org.biojava.bio.seq.db.TabIndexStore
094 */
095
096public final class IndexedSequenceDB extends AbstractSequenceDB
097    implements SequenceDB, Serializable
098{
099    private final IDMaker idMaker;
100    private final IndexStore indexStore;
101
102    /**
103     * Create an IndexedSequenceDB by specifying both the IDMaker and
104     * IndexStore used.
105     *
106     * <p>
107     * The IDMaker will be used to calculate the ID for each
108     * Sequence. It will delegate the storage and retrieval of the
109     * sequence offsets to the IndexStore.
110     *
111     * @param idMaker  the IDMaker used to calculate Sequence IDs
112     * @param indexStore the IndexStore delegate
113     */
114    public IndexedSequenceDB(IDMaker idMaker, IndexStore indexStore) {
115      this.idMaker = idMaker;
116      this.indexStore = indexStore;
117    }
118
119    /**
120     * Create an IndexedSequenceDB by specifying IndexStore used.
121     *
122     * <p>
123     * IDMaker.byName will be used to calculate the ID for each
124     * Sequence. It will delegate the storage and retrieval of the
125     * sequence offsets to the IndexStore.
126     *
127     * @param indexStore the IndexStore delegate
128     */
129    public IndexedSequenceDB(IndexStore indexStore) {
130      this(IDMaker.byName, indexStore);
131    }
132
133    /**
134     * Retrieve the IndexStore.
135     *
136     * @return the IndexStore delegate
137     */
138    public IndexStore getIndexStore() {
139      return indexStore;
140    }
141
142    /**
143     * Add sequences from a file to the sequence database. This method
144     * works on an "all or nothing" principle. If it can successfully
145     * interpret the entire file, all the sequences will be read
146     * in. However, if it encounters any problems, it will abandon the
147     * whole file; an IOException will be thrown.  Multiple files may
148     * be indexed into a single database. A BioException will be
149     * thrown if it has problems understanding the sequences.
150     *
151     * @param seqFile the file containing the sequence or set of sequences
152     * @throws BioException if for any reason the sequences can't be read
153     *         correctly
154     * @throws ChangeVetoException if there is a listener that vetoes adding
155     *         the files
156     */
157
158    public void addFile(File seqFile)
159        throws IllegalIDException, BioException, ChangeVetoException
160    {
161      boolean completed = false; // initially assume that we will fail
162      try {
163        seqFile = seqFile.getAbsoluteFile();
164        CountedBufferedReader bReader = new CountedBufferedReader(new FileReader(seqFile));
165        SequenceFormat format = indexStore.getFormat();
166        SymbolTokenization symParser = indexStore.getSymbolParser();
167        SequenceBuilderFactory sbFact = indexStore.getSBFactory();
168        long pos = bReader.getFilePointer();
169        boolean hasNextSequence = true;
170        while(hasNextSequence) {
171          SequenceBuilder sb = new ElideSymbolsSequenceBuilder(sbFact.makeSequenceBuilder());
172          hasNextSequence = format.readSequence(bReader, symParser, sb);
173          Sequence seq = sb.makeSequence();
174          String id = idMaker.calcID(seq);
175          long oldPos = pos;
176          pos = bReader.getFilePointer();
177          indexStore.store(new SimpleIndex(seqFile, oldPos, (int) (pos - oldPos), id));
178        }
179
180        if(!hasListeners()) {
181          indexStore.commit();
182        } else {
183            ChangeEvent ce = new ChangeEvent(
184                this,
185                SequenceDB.SEQUENCES
186            );
187            ChangeSupport changeSupport = getChangeSupport(SequenceDB.SEQUENCES);
188            synchronized(changeSupport) {
189              changeSupport.firePreChangeEvent(ce);
190              indexStore.commit();
191              changeSupport.firePostChangeEvent(ce);
192            }
193        }
194        completed = true; // we completed succesfuly
195      } catch (IOException ioe) {
196        throw new BioException("Failed to read sequence file",ioe);
197      } finally {
198        if(!completed) { // if there was a failure, discard changes
199          indexStore.rollback();
200        }
201      }
202    }
203
204    /**
205     * SequenceBuilder implementation that explicitly discards the Symbols
206     *
207     * @author Thomas Down
208     * @author Matthew Pocock
209     * @author Mark Schreiber
210     */
211    private static class ElideSymbolsSequenceBuilder
212        extends SequenceBuilderFilter
213        implements ParseErrorSource {
214
215        private Vector listeners = new Vector();
216
217        public synchronized void removeParseErrorListener(ParseErrorListener p){
218          if(listeners.contains(p))
219            listeners.remove(p);
220        }
221
222        public synchronized void addParseErrorListener(ParseErrorListener p){
223          if(! listeners.contains(p)){
224            listeners.add(p);
225          }
226        }
227
228        public ElideSymbolsSequenceBuilder(SequenceBuilder delegate) {
229            super(delegate);
230        }
231
232        /**
233         * Just ignore the symbols
234         */
235        public void addSymbols(Alphabet alpha, Symbol[] syms, int start, int length) {
236        }
237    }
238
239    /**
240     * Get the name of this sequence database. The name is retrieved
241     * from the IndexStore delegate.
242     *
243     * @return the name of the sequence database, which may be null.
244     */
245    public String getName() {
246        return indexStore.getName();
247    }
248
249    public Sequence getSequence(String id)
250    throws IllegalIDException, BioException
251    {
252        try
253        {
254            Index indx = indexStore.fetch(id);
255
256            RandomAccessReader rar =
257                new RandomAccessReader(new RandomAccessFile(indx.getFile(), "r"));
258
259            long toSkip = indx.getStart();
260            if (toSkip > rar.length())
261                throw new BioException("Reached end of file");
262            rar.seek(toSkip);
263
264            SequenceBuilder sb =
265                indexStore.getSBFactory().makeSequenceBuilder();
266
267            indexStore.getFormat().readSequence(new BufferedReader(rar),
268                                                indexStore.getSymbolParser(),
269                                                sb);
270            Sequence seq = sb.makeSequence();
271
272            rar.close();
273            return seq;
274        }
275        catch (IOException ioe)
276        {
277            throw new BioException("Couldn't grab region of file",ioe);
278        }
279    }
280
281    public SequenceIterator sequenceIterator() {
282        return new SequenceIterator() {
283            private Iterator idI = indexStore.getIDs().iterator();
284
285            public boolean hasNext() {
286                return idI.hasNext();
287            }
288
289            public Sequence nextSequence() throws BioException {
290                return getSequence((String) idI.next());
291            }
292        };
293    }
294
295    public Set ids() {
296        return indexStore.getIDs();
297    }
298
299
300}