Source code

001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.program.abi;
023
024import java.io.DataInput;
025import java.io.DataInputStream;
026import java.io.File;
027import java.io.FileNotFoundException;
028import java.io.IOException;
029import java.io.InputStream;
030import java.util.Collections;
031import java.util.HashMap;
032import java.util.Iterator;
033import java.util.Map;
034
035import org.biojava.bio.seq.DNATools;
036import org.biojava.bio.symbol.IllegalSymbolException;
037import org.biojava.bio.symbol.Symbol;
038import org.biojava.utils.io.CachingInputStream;
039import org.biojava.utils.io.Seekable;
040
041/**
042 * A general base parser for files produced by ABI software.  This includes
043 * chromatograms derived from ABI sequencers and potentially other data files
044 * as well. The format was described by Clark Tibbetts in his paper "Raw Data
045 * File Formats, and the Digital and Analog Raw Data Streams of the ABI PRISM
046 * 377 DNA Sequencer."  Available online
047 * <kbd><a href="http://www-2.cs.cmu.edu/afs/cs/project/genome/WWW/Papers/clark.html">
048 * http://www-2.cs.cmu.edu/afs/cs/project/genome/WWW/Papers/clark.html</a></kbd>
049 * <p>
050 * Briefly, the format consists of a set of named fixed-length "tagged data
051 * records" which may contain data themselves, or pointers to data elsewhere
052 * in the file.  This class reads these records and exposes them to subclasses
053 * through the {@link #getDataRecord} method.  The attributes of the records as
054 * described in Tibbets' paper are exposed through public (final) fields of
055 * {@link TaggedDataRecord} instances.
056 * </p>
057 * <p>
058 * If a record only contains a pointer to the desired data (see 
059 * {@link TaggedDataRecord#hasOffsetData}, subclasses may get
060 * at the raw data by using {@link TaggedDataRecord#offsetData}:
061 * </p>
062 * <p>
063 * This parser provides methods and classes for dealing with the files as
064 * streams or local files (local files being more memory-efficient).
065 * </p>
066 *
067 * @author Rhett Sutphin (<a href="http://genome.uiowa.edu/">UI CBCB</a>)
068 * @author Richard Holland
069 */
070public class ABIFParser {
071    private ABIFParser.DataAccess din;
072    private boolean parsed = false;
073    private Map records;
074
075    private final int RECORD_COUNT_OFFSET  = 18;
076    private final int RECORD_OFFSET_OFFSET = 26;
077
078    /** Creates a new ABIFParser for a file. */
079    public ABIFParser(File f) throws IOException {
080        this(new ABIFParser.RandomAccessFile(f));
081    }
082
083    /**
084     * Creates a new ABIFParser for an input stream.  Note that the stream
085     * will be wrapped in a {@link CachingInputStream} if it isn't one already.
086     * If it is, it will be seeked to 0.
087     */
088    public ABIFParser(InputStream in) throws IOException {
089        this(new ABIFParser.DataStream(in));
090    }
091
092    /**
093     * Creates a new ABIFParser for the specified {@link DataAccess} object.
094     * If you need to read from something other than a file or a stream, you'll
095     * have to implement a {@link DataAccess}-implementing class wrapping your
096     * source and then pass an instance to this constructor.
097     */
098    public ABIFParser(ABIFParser.DataAccess toParse) throws IOException {
099        din = toParse;
100        readDataRecords();
101    }
102
103    /**
104     * Returns the accessor for the raw data being parsed by this parser.
105     */
106    public final ABIFParser.DataAccess getDataAccess() {
107        return din;
108    }
109
110    private final void readDataRecords() throws IOException {
111        parsed = false;
112        din.seek(RECORD_COUNT_OFFSET);
113        long recordCount  = 0xffffffff & din.readInt();
114        din.seek(RECORD_OFFSET_OFFSET);
115        long recordOffset = 0xffffffff & din.readInt();
116        din.seek(recordOffset);
117        TaggedDataRecord tdr;
118        StringBuffer label;
119        records = new HashMap();
120        for (int i = 0 ; i < recordCount ; i++) {
121            tdr = new TaggedDataRecord(din);
122            label = new StringBuffer(6).append(tdr.tagName).append(tdr.tagNumber);
123            records.put(label.substring(0), tdr);
124        }
125        for (Iterator i = records.values().iterator(); i.hasNext(); ) {
126                TaggedDataRecord record = (TaggedDataRecord)i.next();
127                if (record.hasOffsetData) {
128                        din.seek(record.dataRecord);
129                        din.readFully(record.offsetData);
130                }
131        }
132        parsed = true;
133        din.finishedReading();
134    }
135
136    /**
137     * Decodes a character into a {@link Symbol} in the DNA alphabet.
138     * Uses a definition of characters that is compatible with the ABI format.
139     * @param token the character to decode
140     * @throws IllegalSymbolException when token isn't in
141     *         <code>{ a, A, c, C, g, G, t, T, n, N, - }</code>
142     */
143    public static Symbol decodeDNAToken(char token) throws IllegalSymbolException {
144        switch (token) {
145            case 'a': case 'A':
146                return DNATools.a();
147            case 'c': case 'C':
148                return DNATools.c();
149            case 'g': case 'G':
150                return DNATools.g();
151            case 't': case 'T':
152                return DNATools.t();
153            case 'n': case 'N':
154                return DNATools.n();
155            case '-':
156                return DNATools.getDNA().getGapSymbol();
157            default:
158                throw new IllegalSymbolException("Can't decode token " + token + " into DNA");
159        }
160    }
161
162    /**
163     * Get the entry from the file TOC with the given name and tag number.
164     * @param tagName the four-character string name of the desired data record
165     * @param tagNumber which one of the tags with this name to return (must be positive)
166     * @throws IllegalArgumentException if tagName is the wrong length or tagNumber
167     *         is 0 or negative
168     * @throws IllegalStateException if the initial parsing is not complete
169     * @return the requested data record, or null if no such record exists
170     */
171    public ABIFParser.TaggedDataRecord getDataRecord(String tagName, int tagNumber)
172    throws IllegalArgumentException, IllegalStateException {
173        if (!parsed)
174            throw new IllegalStateException("parsing is not complete");
175        if (tagNumber < 1)
176            throw new IllegalArgumentException("tagNumber must be positive");
177        if (tagName.length() != 4)
178            throw new IllegalArgumentException("tagName must be 4 characters long");
179        return (ABIFParser.TaggedDataRecord) records.get(tagName + tagNumber);
180    }
181    
182    /**
183     * Obtain all data records. Keys of the map are strings consisting of
184     * tag names with tag numbers concatenated immediately afterwards. Values
185     * are TaggedDataRecord objects. The map has no particular order and so 
186     * cannot be relied on to iterate over records in the same order they
187     * were read from the file.
188     * @return the map of all data records.
189     */
190    public Map getAllDataRecords() {
191        return Collections.unmodifiableMap(records);
192    }
193
194    /**
195     * An aggregate immutable type for an ABIF tagged data record.  See the
196     * Tibbets paper (referenced in the javadoc for {@link ABIFParser}) for
197     * more information.
198     */
199    public static class TaggedDataRecord {
200        public static final int DATA_TYPE_ASCII_ARRAY = 2;
201        public static final int DATA_TYPE_INTEGER = 4;
202        public static final int DATA_TYPE_FLOAT   = 7;
203        public static final int DATA_TYPE_DATE    = 10;
204        public static final int DATA_TYPE_TIME    = 11;
205        public static final int DATA_TYPE_PSTRING = 18;
206
207        public final char[] tagName;
208        public final long   tagNumber;
209        public final int    dataType;
210        public final int    elementLength;
211        public final long   numberOfElements;
212        public final long   recordLength;
213        public final long   dataRecord;
214        public final long   crypticVariable;
215        public final boolean hasOffsetData;
216        public final byte[] offsetData;
217        
218        /**
219         * Creates a new TaggedDataRecord from the next 28 bytes of
220         * <code>din</code>.
221         * @param din the source of the raw data to be parsed
222         * @throws IOException if there's a problem with <code>din</code>
223         */
224        public TaggedDataRecord(ABIFParser.DataAccess din) throws IOException {
225            tagName = new char[4];
226            tagName[0] = (char) din.readByte();
227            tagName[1] = (char) din.readByte();
228            tagName[2] = (char) din.readByte();
229            tagName[3] = (char) din.readByte();
230
231            tagNumber        = 0xffffffff & din.readInt();
232            dataType         = 0xffff & din.readShort();
233            elementLength    = 0xffff & din.readShort();
234            numberOfElements = 0xffffffff & din.readInt();
235            recordLength     = 0xffffffff & din.readInt();
236            dataRecord       = 0xffffffff & din.readInt();
237            crypticVariable  = 0xffffffff & din.readInt();
238            
239            hasOffsetData = recordLength>4L;
240            if (hasOffsetData)
241                offsetData = new byte[(int)recordLength];
242            else 
243                offsetData = new byte[0];
244        }
245
246        /**
247         * A very verbose <code>toString</code> that dumps all of the
248         * data in this record in a human-readable format.
249         */
250        public String toString() {
251            StringBuffer sb = new StringBuffer(super.toString()).append("[\n");
252            sb.append("  tagName         = ").append(tagName).append('\n');
253            sb.append("  tagNumber       = ").append(tagNumber).append('\n');
254            sb.append("  dataType        = ");
255            switch (dataType) {
256                case DATA_TYPE_ASCII_ARRAY: sb.append("ASCII"); break;
257                case DATA_TYPE_INTEGER: sb.append("INTEGER"); break;
258                case DATA_TYPE_FLOAT:   sb.append("FLOAT");   break;
259                case DATA_TYPE_DATE:    sb.append("DATE");    break;
260                case DATA_TYPE_TIME:    sb.append("TIME");    break;
261                case DATA_TYPE_PSTRING: sb.append("PSTRING"); break;
262                default: sb.append(dataType);
263            }
264            sb.append('\n');
265            sb.append("  elementLength   = ").append(elementLength).append('\n');
266            sb.append("  numberOfElements= ").append(numberOfElements).append('\n');
267            sb.append("  recordLength    = ").append(recordLength).append('\n');
268            sb.append("  dataRecord      = ");
269            if (recordLength <= 4) {
270                switch (dataType) {
271                case DATA_TYPE_ASCII_ARRAY:
272                    if (recordLength > 3)
273                        sb.append((char) ((dataRecord >>> 24) & 0xFF));
274                    if (recordLength > 2)
275                        sb.append((char) ((dataRecord >>> 16) & 0xFF));
276                    if (recordLength > 1)
277                        sb.append((char) ((dataRecord >>> 8 ) & 0xFF));
278                    sb.append((char) ((dataRecord) & 0xFF));
279                    break;
280                case DATA_TYPE_DATE:
281                    sb.append((dataRecord >>> 16) & 0xffff).append('/');
282                    sb.append((dataRecord >>> 8 ) & 0xff).append('/');
283                    sb.append((dataRecord) & 0xff);
284                    break;
285                case DATA_TYPE_TIME:
286                    sb.append((dataRecord >>> 24) & 0xff).append(':');
287                    sb.append((dataRecord >>> 16) & 0xff).append(':');
288                    sb.append((dataRecord >>> 8 ) & 0xff);
289                    break;
290                case DATA_TYPE_INTEGER:
291                    sb.append(dataRecord >>> (4 - recordLength)*8);
292                    break;
293                default:
294                    hexStringify((int)dataRecord, sb);
295                }
296            }
297            else {
298                hexStringify((int)dataRecord, sb);
299            }
300            sb.append("  hasOffsetData   = ").append(hasOffsetData).append('\n');
301            sb.append('\n');
302            sb.append("  crypticVariable = ").append(crypticVariable).append('\n');
303            sb.append(']');
304            return sb.toString();
305        }
306
307        private void hexStringify(int l, StringBuffer sb) {
308            sb.append("0x");
309            String hex = Integer.toHexString(l).toUpperCase();
310            for (int i = 8 ; i > hex.length() ; i--)
311                sb.append('0');
312            sb.append(hex);
313        }
314    }
315
316    /**
317     * Concatenation of the {@link Seekable} and {@link DataInput} interfaces.
318     */
319    public static interface DataAccess extends Seekable, DataInput { 
320        /**
321         * Called when the parser has finished reading. The access
322         * may choose to close itself at this point, e.g. if it is
323         * using a RandomAccessFile.
324         * @throws IOException if it could not do what it needs to.
325         */
326        public void finishedReading() throws IOException;
327    }
328
329    private static class RandomAccessFile
330    extends java.io.RandomAccessFile implements DataAccess {
331        public RandomAccessFile(File f) throws FileNotFoundException {
332            super(f, "r");
333        }
334        public void finishedReading() throws IOException {
335                this.close();
336        }
337    }
338
339    /** Implements DataAccess by delegating to a CachingStream and a
340     *  DataInputStream */
341    private static class DataStream implements DataAccess {
342        CachingInputStream cin;
343        DataInputStream din;
344
345        public DataStream(InputStream src) throws IOException {
346            if (src instanceof CachingInputStream)
347                cin = (CachingInputStream) src;
348            else
349                cin = new CachingInputStream(src);
350            cin.seek(0);
351            din = new DataInputStream(cin);
352        }
353
354        public DataStream(CachingInputStream cin) throws IOException {
355            this((InputStream) cin);
356        }
357        
358        public void finishedReading() throws IOException {
359                // We don't care.
360        }
361
362        public boolean readBoolean() throws IOException { return din.readBoolean(); }
363        public byte    readByte()    throws IOException { return din.readByte();    }
364        public char    readChar()    throws IOException { return din.readChar();    }
365        public short   readShort()   throws IOException { return din.readShort();   }
366        public int     readInt()     throws IOException { return din.readInt();     }
367        public long    readLong()    throws IOException { return din.readLong();    }
368        public float   readFloat()   throws IOException { return din.readFloat();   }
369        public double  readDouble()  throws IOException { return din.readDouble();  }
370        public String  readUTF()     throws IOException { return din.readUTF();     }
371
372        public int readUnsignedByte()  throws IOException { return din.readUnsignedByte();  }
373        public int readUnsignedShort() throws IOException { return din.readUnsignedShort(); }
374
375        public void readFully(byte[] values) throws IOException {
376            din.readFully(values);
377        }
378
379        public void readFully(byte[] values, int start, int len) throws IOException {
380            din.readFully(values, start, len);
381        }
382
383        public String readLine() throws IOException {
384            throw new UnsupportedOperationException("DataInputStream#readLine is deprecated.  Use readUTF instead");
385        }
386
387        public int skipBytes(int count) throws IOException { return din.skipBytes(count); }
388
389        public void seek(long pos) throws IOException {
390            cin.seek(pos);
391        }
392    }
393}