001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.seq.db;
023
024import java.io.BufferedInputStream;
025import java.io.File;
026import java.io.FileInputStream;
027import java.io.FileNotFoundException;
028import java.io.IOException;
029import java.util.Collections;
030import java.util.HashMap;
031import java.util.HashSet;
032import java.util.Map;
033import java.util.Set;
034
035import org.biojava.bio.BioException;
036import org.biojava.bio.seq.db.emblcd.DivisionLkpReader;
037import org.biojava.bio.seq.db.emblcd.EmblCDROMIndexReader;
038import org.biojava.bio.seq.db.emblcd.EmblCDROMRandomAccess;
039import org.biojava.bio.seq.db.emblcd.EntryNamIdxReader;
040import org.biojava.bio.seq.db.emblcd.EntryNamRandomAccess;
041import org.biojava.bio.seq.io.SequenceBuilderFactory;
042import org.biojava.bio.seq.io.SequenceFormat;
043import org.biojava.bio.seq.io.SymbolTokenization;
044
045/**
046 * <p><code>EmblCDROMIndexStore</code>s implement a read-only
047 * <code>IndexStore</code> backed by EMBL CD-ROM format binary
048 * indices. The required index files are typically named
049 * "division.lkp" and "entrynam.idx". As an <code>IndexStore</code>
050 * performs lookups by sequence ID, the index files "acnum.trg" and
051 * "acnum.hit" (which store additional accession number data) are not
052 * used.</p>
053 *
054 * <p>The sequence IDs are found using a binary search via a pointer
055 * into the index file. The whole file is not read unless a request
056 * for all the IDs is made using the getIDs() method. The set of IDs
057 * is then cached after the first pass. This class also has a
058 * <code>close()</code> method to free resources associated with the
059 * underlying <code>RandomAccessFile</code>.</p>
060 *
061 * <p>The binary index files may be created using the EMBOSS programs
062 * dbifasta, dbiblast, dbiflat or dbigcg. The least useful from the
063 * BioJava perspective is dbigcg because we do not have a
064 * <code>SequenceFormat</code> implementation for GCG format
065 * files.</p>
066 *
067 * <p>The <code>Index</code> instances returned by this class do not
068 * have the record length set because this information is not
069 * available in the binary index. The value -1 is used instead, as
070 * described in the <code>Index</code> interface.</p>
071 *
072 * @author Keith James
073 * @since 1.2
074 */
075public class EmblCDROMIndexStore implements IndexStore 
076{
077    private File divisionLkp;
078    private File entryNamIdx;
079
080    // Optional PATH prefix to append to the filename(s) extracted
081    // from the binary indices
082    private File pathPrefix;
083
084    private SequenceFormat         format;
085    private SequenceBuilderFactory factory;
086    private SymbolTokenization     parser;
087
088    // Maps the file numbers used in the indices to the real file names
089    private Map seqFiles;
090    // Set view of file names
091    private Set fileSet;
092    // Lazily instantiated if someone asks for all the IDs at once
093    private Set seqIds;
094    // The database name defined in the index header
095    private String name;
096
097    // Details of the master index records
098    private long divRecordCount;
099    // Details of the ID/offset records
100    private int entryRecordLength;
101    private long entryRecordCount;
102
103    // The random access file containing ID/offset records
104    private EmblCDROMRandomAccess entryRandomAccess;
105
106    /**
107     * Creates a new <code>EmblCDROMIndexStore</code> backed by a
108     * random access binary index.
109     *
110     * @param divisionLkp a <code>File</code> containing the master
111     * index.
112     * @param entryNamIdx a <code>File</code> containing the sequence
113     * IDs and offsets.
114     * @param format a <code>SequenceFormat</code>.
115     * @param factory a <code>SequenceBuilderFactory</code>.
116     * @param parser a <code>SymbolTokenization</code>.
117     *
118     * @exception IOException if an error occurs.
119     */
120    public EmblCDROMIndexStore(File                   divisionLkp,
121                               File                   entryNamIdx,
122                               SequenceFormat         format,
123                               SequenceBuilderFactory factory,
124                               SymbolTokenization     parser)
125        throws IOException
126    {
127        // Set to the empty abstract path
128        this(new File(""), divisionLkp, entryNamIdx,
129             format, factory, parser);
130    }
131
132    /**
133     * Creates a new <code>EmblCDROMIndexStore</code> backed by a
134     * random access binary index.
135     *
136     * @param pathPrefix a <code>File</code> containing the abstract
137     * path to be appended to sequence database filenames retrieved
138     * from the binary index.
139     * @param divisionLkp a <code>File</code> containing the master
140     * index.
141     * @param entryNamIdx a <code>File</code> containing the sequence
142     * IDs and offsets.
143     * @param format a <code>SequenceFormat</code>.
144     * @param factory a <code>SequenceBuilderFactory</code>.
145     * @param parser a <code>SymbolTokenization</code>.
146     *
147     * @exception IOException if an error occurs.
148     */
149    public EmblCDROMIndexStore(File                   pathPrefix,
150                               File                   divisionLkp,
151                               File                   entryNamIdx,
152                               SequenceFormat         format,
153                               SequenceBuilderFactory factory,
154                               SymbolTokenization     parser)
155        throws IOException
156    {
157        this.divisionLkp = divisionLkp;
158        this.entryNamIdx = entryNamIdx;
159        this.format      = format;
160        this.factory     = factory;
161        this.parser      = parser;
162        this.pathPrefix  = pathPrefix;
163
164        initialise();
165    }
166
167    /**
168     * <code>getPathPrefix</code> returns the abstract path currently
169     * being appended to the raw sequence database filenames extracted
170     * from the binary index. This value defaults to the empty
171     * abstract path.
172     *
173     * @return a <code>File</code>.
174     */
175    public File getPathPrefix()
176    {
177        return pathPrefix;
178    }
179
180    /**
181     * <code>setPathPrefix</code> sets the abstract path to be
182     * appended to sequence database filenames retrieved from the
183     * binary index. E.g. if the binary index refers to the database
184     * as 'SWALL' and the <code>pathPrefix</code> is set to
185     * "/usr/local/share/data/seq/", then the <code>IndexStore</code>
186     * will know the database path as
187     * "/usr/local/share/data/seq/swall" and any <code>Index</code>
188     * instances produced by the store will return the latter path
189     * when their getFile() method is called. This value defaults to
190     * the empty abstract path.
191     *
192     * @param pathPrefix a <code>File</code> prefix specifying the
193     * abstract path to append.
194     */
195    public void setPathPrefix(File pathPrefix)
196    {
197        this.pathPrefix = pathPrefix;
198    }
199
200    /**
201     * <code>getName</code> returns the database name as defined
202     * within the EMBL CD-ROM index.
203     *
204     * @return a <code>String</code> value.
205     */
206    public String getName()
207    {
208        return name;
209    }
210
211    /**
212     * <code>store</code> adds an <code>Index</code> to the store. As
213     * EMBL CD-ROM indices are read-only, this implementation throws a
214     * <code>BioException</code>.
215     *
216     * @param index an <code>Index</code>.
217     *
218     * @exception IllegalIDException if an error occurs.
219     * @exception BioException if an error occurs.
220     */
221    public void store(Index index)
222        throws IllegalIDException, BioException
223    {
224        throw new BioException("Failed to add Index: store is read-only."
225                               + " To add sequences use the dbi programs"
226                               + " supplied in EMBOSS");
227    }
228
229    /**
230     * <code>commit</code> commits changes. As EMBL CD-ROM indices are
231     * read-only, this implementation throws a
232     * <code>BioException</code>.
233     *
234     * @exception BioException if an error occurs.
235     */
236    public void commit() throws BioException
237    {
238        throw new BioException("Failed to commit: store is read-only."
239                               + " To add sequences use the dbi programs"
240                               + " supplied in EMBOSS");
241    }
242
243    /**
244     * <code>rollback</code> rolls back changes made since the last
245     * <code>commit</code>. As EMBL CD-ROM indices are read-only, this
246     * implementation does nothing.
247     */
248    public void rollback() { }
249
250    public Index fetch(String id) throws IllegalIDException, BioException
251    {
252        Index index = null;
253
254        try
255        {
256            Object [] enRecord = entryRandomAccess.findRecord(id);
257
258            if (enRecord.length == 0)
259                throw new IllegalIDException("Failed to find ID: " + id);
260
261            // Append current pathPrefix
262            index =
263                new SimpleIndex(new File(pathPrefix,
264                                         (String) seqFiles.get((Integer)
265                                                               enRecord[3])),
266                                ((Long) enRecord[1]).longValue(), -1, id);
267        }
268        catch (IOException ioe)
269        {
270            throw new BioException("Failed to retrieve index for ID: " + id);
271        }
272
273        return index;
274    }
275
276    public Set getIDs()
277    {
278        if (seqIds == null)
279        {
280            seqIds = new HashSet((int) entryRecordCount);
281
282            BufferedInputStream bis = null;
283
284            try
285            {
286                bis =
287                    new BufferedInputStream(new FileInputStream(entryNamIdx));
288                EmblCDROMIndexReader ent = new EntryNamIdxReader(bis);
289
290                for (long i = 0; i < entryRecordCount; i++)
291                {
292                    Object [] enRecord = ent.readRecord();
293                    seqIds.add((String) enRecord[0]);
294                }
295
296                bis.close();
297            }
298            // File was not found, so don't try to close it
299            catch (FileNotFoundException fnfe)
300            {
301                System.err.println("Failed to find file "
302                                   + entryNamIdx.getName());
303                fnfe.printStackTrace();
304            }
305            // File was opened, so try to close it
306            catch (IOException ioe)
307            {
308                try
309                {
310                    bis.close();
311                }
312                catch (IOException ioe2)
313                {
314                    System.err.println("Failed to close input stream from file "
315                                       + entryNamIdx.getName());
316                }
317
318                System.err.println("Failed to read file "
319                                   + entryNamIdx.getName());
320                ioe.printStackTrace();
321            }
322        }
323
324        return Collections.unmodifiableSet(seqIds);
325    }
326
327    public Set getFiles()
328    {
329        return Collections.unmodifiableSet(fileSet);
330    }
331
332    public SequenceFormat getFormat()
333    {
334        return format;
335    }
336
337    public SequenceBuilderFactory getSBFactory()
338    {
339        return factory;
340    }
341
342    public SymbolTokenization getSymbolParser()
343    {
344        return parser;
345    }
346
347    /**
348     * <code>close</code> closes the underlying
349     * <code>EntryNamRandomAccess</code> which in turn closes the
350     * lower level <code>RandomAccessFile</code>. This frees the
351     * resources associated with the file.
352     *
353     * @exception IOException if an error occurs.
354     */
355    public void close() throws IOException
356    {
357        entryRandomAccess.close();
358    }
359
360    /**
361     * <code>initialise</code> reads the headers of the index files to
362     * obtain data about the record sizes and counts, database name
363     * and sequence filenames. It then opens a random access file to
364     * the ID index for lookups.
365     *
366     * @exception IOException if an error occurs.
367     */
368    private void initialise() throws IOException
369    {
370        BufferedInputStream bis = null;
371
372        // First try to get details of file names and numbers from
373        // master index file.
374        try
375        {
376            bis = new BufferedInputStream(new FileInputStream(divisionLkp));
377            EmblCDROMIndexReader div = new DivisionLkpReader(bis);
378
379            divRecordCount  = div.readRecordCount();
380
381            // The database name is the same in all the index headers
382            name = div.readDBName();
383
384            seqFiles = new HashMap((int) divRecordCount);
385
386            // Store the file number->name mapping
387            for (long i = divRecordCount; --i >= 0;)
388            {
389                Object [] divRecord = div.readRecord();
390
391                Integer fileNumber = (Integer) divRecord[0];
392                String    fileName = (String)  divRecord[1];
393
394                seqFiles.put(fileNumber, fileName);
395            }
396
397            // Keep a Set view
398            fileSet = new HashSet((int) divRecordCount);
399            fileSet.addAll(seqFiles.values());
400
401            bis.close();
402        }
403        // File was not found, so don't try to close it
404        catch (FileNotFoundException fnfe)
405        {
406            System.err.println("Failed to find file "
407                               + divisionLkp.getName());
408            // Rethrow
409            throw fnfe;
410        }
411        // File was opened, so try to close it
412        catch (IOException ioe)
413        {
414            try
415            {
416                bis.close();
417            }
418            catch (IOException ioe2)
419            {
420                System.err.println("Failed to close input stream from file "
421                                   + divisionLkp.getName());
422            }
423
424            System.err.println("Failed to read full set of sequence IDs file "
425                               + divisionLkp.getName());
426            // Rethrow
427            throw ioe;
428        }
429
430        // Now try to get details of sequence ID index file
431        try
432        {
433            bis = new BufferedInputStream(new FileInputStream(entryNamIdx));
434            EmblCDROMIndexReader ent = new EntryNamIdxReader(bis);
435
436            entryRecordLength = ent.readRecordLength();
437            entryRecordCount  = ent.readRecordCount();
438
439            bis.close();
440        }
441        // File was not found, so don't try to close it
442        catch (FileNotFoundException fnfe)
443        {
444            System.err.println("Failed to find file "
445                               + entryNamIdx.getName());
446            // Rethrow
447            throw fnfe;
448        }
449        // File was opened, so try to close it
450        catch (IOException ioe)
451        {
452            try
453            {
454                bis.close();
455            }
456            catch (IOException ioe2)
457            {
458                System.err.println("Failed to close input stream from file "
459                                   + entryNamIdx.getName());
460            }
461
462            System.err.println("Failed to read file "
463                               + entryNamIdx.getName());
464            // Rethrow
465            throw ioe;
466        }
467
468        // Try to set up random access file
469        try
470        {
471            entryRandomAccess = new EntryNamRandomAccess(entryNamIdx,
472                                                         300,
473                                                         entryRecordLength,
474                                                         entryRecordCount);
475        }
476        // File was not found, so don't try to close it
477        catch (FileNotFoundException fnfe)
478        {
479            System.err.println("Failed to find file "
480                               + entryNamIdx.getName());
481            try
482            {
483                bis.close();
484            }
485            catch (IOException ioe2)
486            {
487                System.err.println("Failed to close random access file "
488                                   + entryNamIdx.getName());
489            }
490            // Rethrow
491            throw fnfe;
492        }
493    }
494}