001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.seq.db.emblcd;
023
024import java.io.IOException;
025import java.io.InputStream;
026
027/**
028 * <p><code>EmblCDROMIndexReader</code> is an abstract class whose
029 * concrete subclasses read EMBL CD-ROM format indices from an
030 * underlying <code>InputStream</code>. This format is used by the
031 * EMBOSS package for database indexing (see programs dbiblast,
032 * dbifasta, dbiflat and dbigcg). Indexing produces four binary files
033 * with a simple format:</p>
034 * 
035 * <ul>
036 *   <li>division.lkp : master index</li>
037 *   <li>entrynam.idx : sequence ID index</li>
038 *   <li>   acnum.trg : accession number index</li>
039 *   <li>   acnum.hit : accession number auxiliary index</li>
040 * </ul>
041 *
042 * <p>Internally EMBOSS checks for Big-endian architechtures and
043 * switches the byte order to Little-endian. This means trouble if you
044 * try to read the file using <code>DataInputStream</code>, but at
045 * least the binaries are consistent across architechtures. This class
046 * carries out the necessary conversion.</p>
047 *
048 * <p>The EMBL CD-ROM format stores the date in 4 bytes. One byte is
049 * unused (the first one), leaving one byte for the day, one for the
050 * month and one (!) for the year.</p>
051 *
052 * <p> For further information see the EMBOSS documentation, or for a
053 * full description, the source code of the dbi programs and the Ajax
054 * library.</p>
055 *
056 * @author Keith James
057 * @since 1.2
058 */
059public abstract class EmblCDROMIndexReader
060{
061    protected InputStream  input;
062    protected StringBuffer sb;
063    protected RecordParser recParser;
064
065    // Header fields
066    private byte []      int4 = new byte [4];
067    private byte []      int2 = new byte [2];
068    private byte []    dbName = new byte [20];
069    private byte [] dbRelease = new byte [10];
070    private byte []    dbDate = new byte [4];
071    // Record field
072    private byte []    record;
073
074    private long   fileLength;
075    private long   recordCount;
076    private int    recordLength;
077    private String name;
078    private String release;
079    private String date;
080
081    /**
082     * Creates a new <code>EmblCDROMIndexReader</code> instance. A
083     * <code>BufferedInputStream</code> is probably the most suitable.
084     *
085     * @param input an <code>InputStream</code>.
086     *
087     * @exception IOException if an error occurs.
088     */
089    public EmblCDROMIndexReader(InputStream input)
090        throws IOException
091    {
092        this.input = input;
093        sb = new StringBuffer(512);
094        recParser = new RecordParser();
095
096        parseHeader();
097    }
098
099    /**
100     * <code>readFileLength</code> returns the file length in bytes
101     * (stored within the file's header by the indexing program). This
102     * may be called more than once as the value is cached.
103     *
104     * @return a <code>long</code>.
105     */
106    public long readFileLength()
107    {
108        return fileLength;
109    }
110
111    /**
112     * <code>readRecordCount</code> returns the number of records in
113     * the file. This may be called more than once as the value is
114     * cached.
115     *
116     * @return a <code>long</code>.
117     */
118    public long readRecordCount()
119    {
120        return recordCount;
121    }
122
123    /**
124     * <code>readRecordLength</code> returns the record length
125     * (bytes). This may be called more than once as the value is
126     * cached.
127     *
128     * @return an <code>int</code>.
129     */
130    public int readRecordLength()
131    {
132        return recordLength;
133    }
134
135    /**
136     * <code>readDBName</code> returns the database name from the
137     * index header. This may be called more than once as the value is
138     * cached.
139     *
140     * @return a <code>String</code>.
141     */
142    public String readDBName()
143    {
144        return name;
145    }
146
147    /**
148     * <code>readDBRelease</code> returns the database release from
149     * the index header. This may be called more than once as the
150     * value is cached.
151     *
152     * @return a <code>String</code>.
153     */
154    public String readDBRelease()
155    {
156        return release;
157    }
158
159    /**
160     * <code>readDBDate</code> reads the date from the index
161     * header. The date is stored in 4 bytes: 0, unused; 1, year; 2,
162     * month; 3, day. With a 1 byte year it's not very much use and
163     * I'm not sure that the EMBOSS programs set the value correctly
164     * anyway.
165     *
166     * @return a <code>String</code>.
167     */
168    public String readDBDate()
169    
170    {
171        return date;
172    }
173
174    /**
175     * <code>readRecord</code> returns an array of objects parsed from
176     * a single record. Its content will depend on the type of index
177     * file. Concrete subclasses must provide an implementation of
178     * this method.
179     *
180     * @return an <code>Object []</code> array.
181     *
182     * @exception IOException if an error occurs.
183     */
184    public abstract Object [] readRecord() throws IOException;
185
186    /**
187     * <code>readRawRecord</code> returns the raw bytes of a single
188     * record from the index.
189     *
190     * @return a <code>byte []</code> array.
191     *
192     * @exception IOException if an error occurs.
193     */
194    public byte [] readRawRecord() throws IOException
195    {
196        int eof = input.read(record);
197        if (eof == -1)
198            input.close();
199
200        return record;
201    }
202
203    /**
204     * <code>close</code> closes the underlying
205     * <code>InputStream</code>.
206     *
207     * @exception IOException if an error occurs.
208     */
209    public void close() throws IOException
210    {
211        input.close();
212    }
213
214    /**
215     * <code>parseHeader</code> carries out a full parse of the 300
216     * byte header (common to all the index types) when first
217     * encountered.
218     *
219     * @exception IOException if an error occurs.
220     */
221    private void parseHeader() throws IOException
222    {
223        int eof = 0;
224
225        eof = input.read(int4);
226        if (eof == -1)
227            input.close();
228
229        fileLength = recParser.parseInt4(int4);
230
231        eof = input.read(int4);
232        if (eof == -1)
233            input.close();
234
235        recordCount = recParser.parseInt4(int4);
236
237        eof = input.read(int2);
238        if (eof == -1)
239            input.close();
240
241        recordLength = recParser.parseInt2(int2);
242
243        // Set up array for reading records now that we know their
244        // length
245        record = new byte [recordLength];
246
247        eof = input.read(dbName);
248        if (eof == -1)
249            input.close();
250
251        sb.setLength(0);
252        name = recParser.parseString(sb, dbName);
253
254        eof = input.read(dbRelease);
255        if (eof == -1)
256            input.close();
257
258        sb.setLength(0);
259        release = recParser.parseString(sb, dbRelease);
260
261        eof = input.read(dbDate);
262        if (eof == -1)
263            input.close();
264
265        sb.setLength(0);
266        date = recParser.parseDate(sb, dbDate);
267
268        // Skip the remainder of the header (padding)
269        input.skip(256);
270    }
271}