001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.program.abi;
023
024import java.io.File;
025import java.io.IOException;
026import java.io.InputStream;
027import java.io.Serializable;
028import java.util.ArrayList;
029import java.util.List;
030import java.util.Map;
031
032import org.biojava.bio.BioError;
033import org.biojava.bio.chromatogram.AbstractChromatogram;
034import org.biojava.bio.chromatogram.Chromatogram;
035import org.biojava.bio.chromatogram.UnsupportedChromatogramFormatException;
036import org.biojava.bio.seq.DNATools;
037import org.biojava.bio.symbol.AtomicSymbol;
038import org.biojava.bio.symbol.IllegalAlphabetException;
039import org.biojava.bio.symbol.IllegalSymbolException;
040import org.biojava.bio.symbol.IntegerAlphabet;
041import org.biojava.bio.symbol.Symbol;
042import org.biojava.utils.SmallMap;
043
044
045/**
046 * An implementation of {@link org.biojava.bio.chromatogram.Chromatogram} to
047 * encapulsulate chromatogram data extracted from the files produced by ABI
048 * sequencers, such as the the 377 and the 3700.  The format was described by
049 * Clark Tibbetts in his paper "Raw Data File Formats, and the Digital and
050 * Analog Raw Data Streams of the ABI PRISM 377 DNA Sequencer."  Available
051 * online <kbd><a href="http://www-2.cs.cmu.edu/afs/cs/project/genome/WWW/Papers/clark.html">
052 * http://www-2.cs.cmu.edu/afs/cs/project/genome/WWW/Papers/clark.html</a></kbd>
053 *
054 * @author Rhett Sutphin (<a href="http://genome.uiowa.edu/">UI CBCB</a>)
055 * @author Richard Holland
056 * @see ABIFParser
057 */
058public class ABIFChromatogram extends AbstractChromatogram implements Serializable {
059    public ABIFChromatogram() {
060        super();
061    }
062
063    /** Create a new ABIF object from a file.
064     *  <p>
065     *  This method is more efficent than {@link #create(InputStream)}.
066     *  </p>
067     */
068    public static ABIFChromatogram create(File f)
069    throws IOException, UnsupportedChromatogramFormatException {
070        ABIFChromatogram newOne = new ABIFChromatogram();
071        newOne.load(f);
072        return newOne;
073    }
074
075    /**
076     * Create a new ABIF object from a stream of bytes.
077     * <p>
078     * Due to the non-single-pass design of the ABI format, this method will
079     * wrap the InputStream in an {@link org.biojava.utils.io.CachingInputStream}.
080     * For this reason, {@link #create(File)} should be preferred.
081     * </p>
082     * @param in the stream from which to read
083     * @return a new ABIFChromatogram object
084     * @throws IOException if there is a problem with the underlying stream
085     */
086    public static ABIFChromatogram create(InputStream in)
087    throws IOException, UnsupportedChromatogramFormatException {
088        ABIFChromatogram newOne = new ABIFChromatogram();
089        newOne.load(in);
090        return newOne;
091    }
092
093    protected ABIFChromatogram load(File f)
094    throws IOException, UnsupportedChromatogramFormatException {
095        new Parser(f);
096        return this;
097    }
098
099    protected ABIFChromatogram load(InputStream in)
100    throws IOException, UnsupportedChromatogramFormatException {
101        new Parser(in);
102        return this;
103    }
104
105    protected AbstractChromatogram reverseComplementInstance() {
106        return new ABIFChromatogram();
107    }
108
109    /**
110     * An extension of {@link ABIFParser} that reads the particular fields from
111     * the ABIF that contain the chromatogram data and initializes the fields
112     * in its enclosing <code>ABIFChromatogram</code> instance.
113     */
114    protected class Parser extends ABIFParser {
115        public Parser(InputStream in)
116        throws IOException, UnsupportedChromatogramFormatException {
117            super(in);
118            parse();
119        }
120
121        public Parser(File f)
122        throws IOException, UnsupportedChromatogramFormatException {
123            super(f);
124            parse();
125        }
126
127        private final void parse()
128        throws IOException, UnsupportedChromatogramFormatException {
129            // read filter-wheel-order tag
130            char[] fwo_ = new char[4];
131            ABIFParser.TaggedDataRecord fwoRec = getDataRecord("FWO_", 1);
132            if (fwoRec == null)
133                throw new UnsupportedChromatogramFormatException("No FWO_ (1) record in ABIF file, therefore no trace data");
134            fwo_[0] = (char) ( (fwoRec.dataRecord >>> 24) & 0xff );
135            fwo_[1] = (char) ( (fwoRec.dataRecord >>> 16) & 0xff );
136            fwo_[2] = (char) ( (fwoRec.dataRecord >>> 8 ) & 0xff );
137            fwo_[3] = (char) ( (fwoRec.dataRecord       ) & 0xff );
138
139            Symbol sym;
140            clearTraces();
141            for (int i = 0 ; i < 4 ; i++) {
142                
143                try {
144                    sym = ABIFParser.decodeDNAToken(fwo_[i]);
145                } catch (IllegalSymbolException ise) {
146                    throw new UnsupportedChromatogramFormatException("An unexpected character (" + fwo_[i] +") was found in the FWO_ tag.  Parsing cannot continue.");
147                }
148                if (!(sym instanceof AtomicSymbol)) {
149                    throw new UnsupportedChromatogramFormatException("An unexpected character (" + fwo_[i] +") was found in the FWO_ tag.  Parsing cannot continue.");
150                }
151                parseTrace((AtomicSymbol) sym, i+9);
152            }
153            parseBaseCalls();
154            getDataAccess().finishedReading();
155        }
156
157        private void parseTrace(AtomicSymbol sym, int whichData) throws IOException, UnsupportedChromatogramFormatException {
158            TaggedDataRecord dataPtr = getDataRecord("DATA", whichData);
159            if (dataPtr.numberOfElements > Integer.MAX_VALUE)
160                throw new UnsupportedChromatogramFormatException("Chromatogram has more than " + Integer.MAX_VALUE + " trace samples -- can't handle it");
161            int count = (int) dataPtr.numberOfElements;
162            int[] trace = new int[count];
163            int max = -1;
164            setBits(8*dataPtr.elementLength);
165            if (dataPtr.elementLength == 2) {
166                byte[] shortArray = dataPtr.offsetData;
167                int i = 0;
168                for (int s = 0; s < shortArray.length; s += 2) {
169                    trace[i] =  ((short)((shortArray[s] << 8) | (shortArray[s + 1] & 0xff))) & 0xffff;
170                    max = Math.max(trace[i++], max);
171                }
172            }
173            else if (dataPtr.elementLength == 1) {
174                byte[] byteArray = dataPtr.offsetData;
175                for (int i = 0; i < byteArray.length; i++) {
176                    trace[i] = byteArray[i] & 0xff;
177                    max = Math.max(trace[i], max);
178                }
179            }
180            else {
181                throw new UnsupportedChromatogramFormatException("Only 8- and 16-bit trace samples are supported");
182            }
183            
184            try {
185                setTrace(sym, trace, max);
186            } catch (IllegalSymbolException ise) {
187                throw new BioError("Can't happen", ise);
188            }
189        }
190
191        private void parseBaseCalls() throws IOException, UnsupportedChromatogramFormatException {
192            // do offsets, then call letters
193            // offsets are in PLOC1 (we'll use the possibly-edited stream)
194            TaggedDataRecord offsetsPtr = getDataRecord("PLOC", 1);
195            // call letters are int PBAS1
196            TaggedDataRecord basesPtr = getDataRecord("PBAS", 1);
197            // these should be equal, but just in case...
198            if (offsetsPtr.numberOfElements != basesPtr.numberOfElements)
199                throw new BioError("PLOC and PBAS are different lengths.  Can't proceed.");
200            if (offsetsPtr.numberOfElements > Integer.MAX_VALUE)
201                throw new UnsupportedChromatogramFormatException("Chromatogram has more than " + Integer.MAX_VALUE + " base calls -- can't handle it");
202            int count = (int) offsetsPtr.numberOfElements;
203            // the list of called bases
204            List dna = new ArrayList(count);
205            // the list of offsets
206            List offsets = new ArrayList(count);
207            // start reading offsets, creating SimpleBaseCalls along the way
208            if (offsetsPtr.elementLength == 2) {
209                byte[] shortArray = offsetsPtr.offsetData;
210                IntegerAlphabet integerAlphabet = IntegerAlphabet.getInstance();
211                for (int s = 0; s < shortArray.length; s += 2) {
212                    offsets.add(integerAlphabet.getSymbol(((short)((shortArray[s] << 8) | (shortArray[s + 1] & 0xff))) & 0xffff));
213                }
214            }
215            else if (offsetsPtr.elementLength == 1) {
216                byte[] byteArray = offsetsPtr.offsetData;
217                IntegerAlphabet integerAlphabet = IntegerAlphabet.getInstance();
218                for (int i = 0 ; i < byteArray.length; i++) {
219                    offsets.add(integerAlphabet.getSymbol(byteArray[i] & 0xff));
220                }
221            }
222            else {
223                throw new IllegalStateException("Only 8- and 16-bit trace samples are supported");
224            }
225
226            // then read the base calls
227            try {
228                byte[] byteArray = basesPtr.offsetData;
229                for (int i = 0; i < byteArray.length; i++) {
230                    dna.add(ABIFParser.decodeDNAToken((char) byteArray[i]));
231                }
232            } catch (IllegalSymbolException ise) {
233                throw new BioError("Can't happen", ise);
234            }
235            // create the base call alignment and set it
236            try {
237                Map baseCalls = new SmallMap(2);
238                baseCalls.put(Chromatogram.DNA, createImmutableSymbolList(DNATools.getDNA(), dna));
239                baseCalls.put(Chromatogram.OFFSETS, createImmutableSymbolList(IntegerAlphabet.getInstance(), offsets));
240                setBaseCallAlignment(createImmutableAlignment(baseCalls));
241            } catch (IllegalAlphabetException iae) {
242                throw new BioError("Can't happen", iae);
243            } catch (IllegalSymbolException ise) {
244                throw new BioError("Can't happen", ise);
245            }
246        }
247    }
248}