001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.seq.db.emblcd;
023
024import java.io.File;
025import java.io.FileNotFoundException;
026import java.io.IOException;
027import java.io.RandomAccessFile;
028
/**
 * <p><code>EmblCDROMRandomAccess</code> is an abstract class whose
 * concrete subclasses can perform fast lookups in EMBL CD-ROM format
 * index files. As the format of the records varies between file
 * types, subclasses should implement two methods:
 * <code>readRecord()</code>, which should parse the record into an
 * array of objects, and <code>getRecordKey()</code>, which should
 * retrieve the field from the parsed record on which the records
 * were sorted in the index. This is used during the binary search in
 * the <code>findRecord()</code> method.</p>
 *
 * <p>Implementing <code>readRecord()</code> is easy because it simply
 * means delegating to the supplied <code>RecordParser</code> and
 * calling the appropriate method on it.</p>
 *
 * @author Keith James
 * @since 1.2
 */
047public abstract class EmblCDROMRandomAccess
048{
049    private   File             indexFile;
050    protected RandomAccessFile raIndexFile;
051
052    private int headerLength;
053    private int recordLength;
054    private long recordCount;
055
056    /**
057     * A <code>recParser</code> for implementing
058     * <code>readRecord()</code> specific to each concrete subclass.
059     */
060    protected RecordParser recParser;
061
062    protected byte [] recBytes;
063
064    /**
065     * Creates a new <code>EmblCDROMRandomAccess</code> object.
066     *
067     * @param indexFile a <code>File</code> to wrap.
068     * @param headerLength an <code>int</code> (normally 300 bytes).
069     * @param recordLength an <code>int</code> indicating the length
070     * of a single record.
071     * @param recordCount an <code>long</code> indicating the total
072     * number of records.
073     *
074     * @exception FileNotFoundException if indexFile cannot be found.
075     */
076    public EmblCDROMRandomAccess(File indexFile,
077                                 int  headerLength,
078                                 int  recordLength,
079                                 long recordCount)
080        throws FileNotFoundException
081    {
082        this.indexFile = indexFile;
083        raIndexFile = new RandomAccessFile(indexFile, "r");
084
085        this.headerLength = headerLength;
086        this.recordLength = recordLength;
087        this.recordCount  = recordCount;
088
089        recBytes  = new byte [recordLength];
090        recParser = new RecordParser();
091    }
092
093    /**
094     * <code>getFile</code> returns the <code>File</code> wrapped.
095     *
096     * @return a <code>File</code>.
097     */
098    public File getFile()
099    {
100        return indexFile;
101    }
102
103    /**
104     * <code>findRecord</code> performs a binary search within the
105     * file for a record specified by an identifier String.
106     *
107     * @param identifier a <code>String</code> identifier (sequence ID
108     * or accession number).
109     *
110     * @return an <code>Object []</code> array containing the
111     * record. If there is no such record an empty array is returned.
112     *
113     * @exception IOException if an error occurs.
114     */
115    public Object [] findRecord(String identifier)
116        throws IOException
117    {
118        long startRecord = 0;
119        long  endRecord  = recordCount - 1;
120
121        while (startRecord <= endRecord)
122        {
123            long midPoint = (startRecord + endRecord) / 2;
124            raIndexFile.seek(headerLength + (midPoint * recordLength));
125
126            Object [] record = readRecord();
127            String recordKey = getRecordKey(record).trim();
128
129            if (recordKey.equals(identifier))
130                return record;
131            else if (recordKey.compareTo(identifier) < 0)
132                startRecord = midPoint + 1;
133            else
134                endRecord = midPoint - 1;
135        }
136
137        // No such record
138        return new Object [0];
139    }
140
141    /**
142     * <code>close</code> closes the underlying
143     * <code>RandomAccessFile</code>.
144     *
145     * @exception IOException if an error occurs.
146     */
147    public void close() throws IOException
148    {
149        raIndexFile.close();
150    }
151
152    /**
153     * <code>readRecord</code> returns an array of objects parsed from
154     * a single record. Its content will depend on the type of index
155     * file. Concrete subclasses must provide an implementation of
156     * this method.
157     *
158     * @return an <code>Object []</code> array.
159     *
160     * @exception IOException if an error occurs.
161     */
162    protected abstract Object [] readRecord() throws IOException;
163
164    /**
165     * <code>getRecordKey</code> returns the field from the record on
166     * which the records were sorted in the index. (i.e. sequence ID
167     * or accession number).
168     *
169     * @return a <code>String</code>.
170     */
171    protected abstract String getRecordKey(Object [] record);
172}