001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.program.scf;
023
024import java.io.BufferedReader;
025import java.io.ByteArrayInputStream;
026import java.io.DataInputStream;
027import java.io.File;
028import java.io.FileInputStream;
029import java.io.IOException;
030import java.io.InputStream;
031import java.io.InputStreamReader;
032import java.util.ArrayList;
033import java.util.Iterator;
034import java.util.List;
035import java.util.Map;
036import java.util.Properties;
037import java.util.TreeMap;
038
039import org.biojava.bio.BioError;
040import org.biojava.bio.chromatogram.AbstractChromatogram;
041import org.biojava.bio.chromatogram.Chromatogram;
042import org.biojava.bio.chromatogram.UnsupportedChromatogramFormatException;
043import org.biojava.bio.seq.DNATools;
044import org.biojava.bio.symbol.IllegalAlphabetException;
045import org.biojava.bio.symbol.IllegalSymbolException;
046import org.biojava.bio.symbol.IntegerAlphabet;
047import org.biojava.bio.symbol.Symbol;
048import org.biojava.bio.symbol.SymbolList;
049import org.biojava.bio.symbol.SymbolListViews;
050import org.biojava.utils.SmallMap;
051
052
053
054/**
055 * A {@link org.biojava.bio.chromatogram.Chromatogram} as loaded from an
056 * SCF v2 or v3 file.  Also loads and exposes the SCF format's "private data"
057 * and "comments" sections.  The quality values from the SCF are stored as
058 * additional sequences on the base call alignment. The labels are the
059 * <code>PROB_</code>* constants in this class.
060 * The values are {@link org.biojava.bio.symbol.IntegerAlphabet.IntegerSymbol}
061 * objects in the range 0 to 255.
062 *
063 *
064 * @author Rhett Sutphin (<a href="http://genome.uiowa.edu/">UI CBCB</a>)
065 */
066public class SCF extends AbstractChromatogram {
067    private byte[] privateData;
068    private Properties comments;
069    
070    private static final IntegerAlphabet.SubIntegerAlphabet
071            PROBABILITY_ALPHABET =
072            IntegerAlphabet.getSubAlphabet(0, 255);
073    
074    /** Represents the maximum unsigned value
075     * of a byte for wrapping purposes */
076    public static final int BYTE_MAX_VALUE =
077            256;
078    
079    /** Represents the maximum unsigned value
080     * of a short for wrapping purposes */
081    public static final int SHORT_MAX_VALUE =
082            65536;
083    
084    /** Base call alignment sequence label for the probability that call
085     * should be A. */
086    public static final String PROB_NUC_A = "quality-a";
087    /** Base call alignment sequence label for the probability that call
088     * should be C. */
089    public static final String PROB_NUC_C = "quality-c";
090    /** Base call alignment sequence label for the probability that call
091     * should be G. */
092    public static final String PROB_NUC_G = "quality-g";
093    /** Base call alignment sequence label for the probability that call
094     * should be T. */
095    public static final String PROB_NUC_T = "quality-t";
096    /**
097     * Base call alignment sequence label for the substitution
098     * probability. In versions of the SCF spec before 3.10, this is called
099     * spareQual[0].
100     */
101    public static final Object PROB_SUBSTITUTION =
102            "substitution-probability";
103    /**
104     * Base call alignment sequence label for the overcall probability.
105     * In versions of the SCF spec before 3.10, this is called
106     * spareQual[1].
107     */
108    public static final Object PROB_OVERCALL     = "overcall-probability";
109    /**
110     * Base call alignment sequence label for the undercall probability.
111     * In versions of the SCF spec before 3.10, this is called
112     * spareQual[2].
113     */
114    public static final Object PROB_UNDERCALL    =
115            "undercall-probability";
116    
117    /** Creates a new, completely empty SCF. */
118    protected SCF() {
119        super();
120        comments = new Properties();
121    }
122    
123    public static SCF create(File f)
124    throws IOException, UnsupportedChromatogramFormatException {
125        SCF out = new SCF();
126        out.load(f);
127        return out;
128    }
129    
130    public static SCF create(InputStream in, long alreadyRead)
131    throws IOException, UnsupportedChromatogramFormatException {
132        SCF out = new SCF();
133        out.load(in, alreadyRead);
134        return out;
135    }
136    
137    protected void load(File f) throws IOException,
138            UnsupportedChromatogramFormatException {
139        FileInputStream fin = new FileInputStream(f);
140        try {
141                load(fin, 0);
142        } finally {
143                fin.close();
144        }
145    }
146    
147    protected void load(InputStream in, long initOffset)
148    throws IOException, UnsupportedChromatogramFormatException {
149        SCF.ParserFactory.parse(in, this, initOffset);
150    }
151    
152    /**
153     * Returns the comments fields as a {@link Properties} mapping.
154     */
155    public Properties getComments() { return comments; }
156    
157    protected AbstractChromatogram reverseComplementInstance() { return
158            new SCF(); }
159    
160    public static IntegerAlphabet.SubIntegerAlphabet
161            getProbabilityAlphabet() { return PROBABILITY_ALPHABET; }
162    
163    /**
164     * Overrides {@link
165     * AbstractChromatogram#reverseComplementBaseCallList} to
166     * support the 7 quality values from the SCF.  These are handled thus:
167     * <ul>
168     *   <li><code>PROB_SUBSTITUTION</code>, <code>PROB_OVERCALL</code>, and
169     *       <code>PROB_UNDERCALL</code> are just reversed &returned.</li>
170     *   <li><code>PROB_NUC_</code>* returns the reverse of the quality
171     *       sequence for the complement base.</li>
172     * </ul>
173     */
174    protected SymbolList reverseComplementBaseCallList(String label) {
175        if (label == PROB_SUBSTITUTION ||
176                label == PROB_OVERCALL ||
177                label == PROB_UNDERCALL) {
178            return
179                    SymbolListViews.reverse(this.getBaseCalls().symbolListForLabel(label));
180        } else if (label.equals( PROB_NUC_A)) {
181            return
182                    SymbolListViews.reverse(this.getBaseCalls().symbolListForLabel(PROB_NUC_T));
183        } else if (label.equals(PROB_NUC_C)) {
184            return
185                    SymbolListViews.reverse(this.getBaseCalls().symbolListForLabel(PROB_NUC_G));
186        } else if (label.equals(PROB_NUC_G)) {
187            return
188                    SymbolListViews.reverse(this.getBaseCalls().symbolListForLabel(PROB_NUC_C));
189        } else if (label.equals( PROB_NUC_T)) {
190            return
191                    SymbolListViews.reverse(this.getBaseCalls().symbolListForLabel(PROB_NUC_A));
192        } else {
193            return super.reverseComplementBaseCallList(label);
194        }
195    }
196    
197    /*** INNER CLASSES FOR PARSER ***/
198    
199    /** Factory class to create the appropriate parser for the given
200     * stream.  This decision is based on the version field in the file's header. */
201    private static class ParserFactory {
202        public static void parse(InputStream in, SCF out, long initOffset)
203        throws IOException, UnsupportedChromatogramFormatException {
204            DataInputStream din = new DataInputStream(in);
205            SCF.Parser parser = createParser(din, out, initOffset);
206            parser.parse();
207        }
208        
209        public static SCF.Parser createParser(DataInputStream din, SCF
210                out, long initOffset)
211                throws UnsupportedChromatogramFormatException, IOException {
212            // read the header to find out the version
213            long offset = initOffset;
214            SCF.Parser.HeaderStruct header =
215                    SCF.Parser.HeaderStruct.create(din, initOffset);
216            offset = SCF.Parser.HeaderStruct.HEADER_LENGTH;
217            out.setBits((int)header.sample_size * 8);
218            SCF.Parser parser;
219            float version;
220            try {
221                version = Float.parseFloat(new String(header.version));
222            } catch (NumberFormatException e) {
223                throw new UnsupportedChromatogramFormatException(
224                        "The SCF's version (" + new String(header.version) +
225                        ") is not a number");
226            }
227            if (version < 3.0f && version >= 2.0f) {
228                parser = new SCF.V2Parser(din, out, header, offset);
229            } else if (version >= 3.0f) {
230                parser = new SCF.V3Parser(din, out, header, offset);
231            } else {
232                throw new UnsupportedChromatogramFormatException(
233                        "Only version 2 and version 3 SCFs are supported (not "
234                        + new String(header.version));
235            }
236            return parser;
237        }
238    }
239    
240    static interface BaseCallUncertaintyDecoder {
241        /**
242         * Returns an appropriate Symbol from the DNA alphabet for
243         * an encoded byte.
244         */
245        public Symbol decode(byte call) throws IllegalSymbolException;
246    }
247    
248    /**
249     * A BaseCallUncertaintyDecoder that works for type 0 (default) and
250     * type 4 (ABI)
251     * code sets.
252     */
253    static class DefaultUncertaintyDecoder implements
254            BaseCallUncertaintyDecoder {
255        public DefaultUncertaintyDecoder() { }
256        public Symbol decode(byte call) throws IllegalSymbolException {
257            char c = (char) call;
258            switch (c) {
259            case 'a':
260            case 'A':
261                return DNATools.a();
262            case 'c':
263            case 'C':
264                return DNATools.c();
265            case 'g':
266            case 'G':
267                return DNATools.g();
268            case 't':
269            case 'T':
270                return DNATools.t();
271            case 'n':
272            case 'N':
273                return DNATools.n();
274            case 'm':
275            case 'M':
276                return DNATools.m();
277            case 'r':
278            case 'R':
279                return DNATools.r();
280            case 'w':
281            case 'W':
282                return DNATools.w();
283            case 's':
284            case 'S':
285                return DNATools.s();
286            case 'y':
287            case 'Y':
288                return DNATools.y();
289            case 'k':
290            case 'K':
291                return DNATools.k();
292            case 'v':
293            case 'V':
294                return DNATools.v();
295            case 'h':
296            case 'H':
297                return DNATools.h();
298            case 'd':
299            case 'D':
300                return DNATools.d();
301            case 'b':
302            case 'B':
303                return DNATools.b();
304            case '-':
305            case '.':
306                return DNATools.getDNA().getGapSymbol();
307            default:
308                throw new IllegalSymbolException("No Symbol for " + c);
309            }
310        }
311    }
312    
313    abstract static class Parser {
314        protected long offset = 0;
315        protected DataInputStream din = null;
316        protected HeaderStruct header;
317        protected BaseCallUncertaintyDecoder decoder;
318        protected SCF out = null;
319        protected boolean parsed = false;
320        
321        Parser(DataInputStream din, SCF out,
322                SCF.Parser.HeaderStruct header, long initOffset)
323                throws UnsupportedChromatogramFormatException {
324            if (din == null)
325                throw new IllegalArgumentException(
326                        "Can't parse a null inputstream");
327            this.din = din;
328            
329            if (out == null)
330                this.out = new SCF();
331            else
332                this.out = out;
333            
334            if (header.samples > Integer.MAX_VALUE)
335                throw new UnsupportedChromatogramFormatException(
336                        "Can't parse an SCF with more than " + Integer.MAX_VALUE + " trace samples");
337            
338            if (header.bases > Integer.MAX_VALUE)
339                throw new UnsupportedChromatogramFormatException(
340                        "Can't parse an SCF with more than " + Integer.MAX_VALUE + " called bases");
341            
342            this.header = header;
343            this.decoder = createDecoder(header.code_set);
344            this.offset = initOffset;
345        }
346        
347        /**
348         * Factory method to create an approriate decoder for the code
349         * set. Currently, only a direct interpretation of the encoded byte
350         * as an ASCII character is supported (see DefaultUncertaintyDecoder).
351         * This interpretation will be used for all code sets, but a
352         * warning will be printed if the code set is not known to work with this
353         * interpretation.
354         */
355        private static BaseCallUncertaintyDecoder createDecoder(long
356                codeSet) {
357            if (codeSet != 0 && codeSet != 2 && codeSet != 4)
358                System.err.println("Warning: the code set (" + codeSet +
359                        ") is not specifically supported.  (It may still work, though.)");
360            return new DefaultUncertaintyDecoder();
361        }
362        
363        public SCF getParsed() {
364            if (parsed) return out;
365            else        return null;
366        }
367        
368        public void parse() throws IOException,
369                UnsupportedChromatogramFormatException {
370            parsed = false;
371            // sort the sections of the file by ascending offset
372            Integer SAMPLES  = new Integer(0),
373                    BASES    = new Integer(1),
374                    COMMENTS = new Integer(2),
375                    PRIVATE  = new Integer(3);
376            TreeMap<Long, Integer> sectionOrder = new TreeMap<Long, Integer>();
377            sectionOrder.put(new Long(header.samples_offset),  SAMPLES);
378            sectionOrder.put(new Long(header.bases_offset),    BASES);
379            if (header.comments_size > 0) {
380                sectionOrder.put(new Long(header.comments_offset), COMMENTS);
381            }
382            if (header.private_size > 0) {
383                sectionOrder.put(new Long(header.private_offset), PRIVATE);
384            }
385            
386            for (Iterator<Long> it = sectionOrder.keySet().iterator() ;
387            it.hasNext() ;) {
388                Integer sect = sectionOrder.get(it.next());
389                if      (sect == SAMPLES)  parseSamples();
390                else if (sect == BASES)    parseBases();
391                else if (sect == COMMENTS) parseComments();
392                else if (sect == PRIVATE)  parsePrivate();
393            }
394            parsed = true;
395        }
396        
397        protected abstract void parseSamples()  throws IOException,
398                UnsupportedChromatogramFormatException;
399        protected abstract void parseBases()    throws IOException,
400                UnsupportedChromatogramFormatException;
401        
402        protected void parseComments() throws IOException {
403            skipTo(header.comments_offset);
404            byte[] raw = new byte[(int)header.comments_size - 1];
405            int read = din.read(raw, 0, raw.length);
406            offset += read;
407            BufferedReader r = new BufferedReader(
408                    new InputStreamReader(
409                    new ByteArrayInputStream(raw),
410                    "ISO-8859-1"
411                    )
412                    );
413            String line, key, value;
414            int eqIdx = -1;
415            while ((line = r.readLine()) != null) {
416                eqIdx = line.indexOf('=');
417                //added line below to skip any truncated comment fields
418                if( eqIdx == -1 ) {
419                    continue;
420                    
421                }
422                key = line.substring(0, eqIdx);
423                value = line.substring(eqIdx+1);
424                out.comments.setProperty(key, value);
425            }
426        }
427        
428        protected void parsePrivate() throws IOException {
429            skipTo(header.private_offset);
430            out.privateData = new byte[(int)header.private_size];
431            int privRead = 0;
432            int thisRead = 0;
433            while (privRead < out.privateData.length && thisRead >= 0) {
434                thisRead = din.read(out.privateData, privRead,
435                        out.privateData.length - privRead);
436                offset += thisRead;
437                privRead += thisRead;
438            }
439        }
440        
441        protected final void skipTo(long newOffset) throws IOException {
442            if (newOffset < offset)
443                throw new IllegalArgumentException("Can't skip backwards: (newOffset==" + newOffset + ") < (offset==" + offset + ")");
444            long skip = newOffset - offset;
445            while (skip > 0) {
446                int actualSkip = din.skipBytes((int)skip);
447                offset += actualSkip;
448                skip   -= actualSkip;
449            }
450        }
451        
452        /**
453         * Does the grunt work of creating the base call alignment from
454         * the given lists of bases, offsets, and probabilities.
455         * Catches and "Can't happens" all exceptions.
456         */
457        protected final void createAndSetBaseCallAlignment(List dna, List
458                offsets, List[] probs) {
459            try {
460                Map<Object, SymbolList> baseCalls = new SmallMap(9);
461                baseCalls.put(Chromatogram.DNA,
462                        out.createImmutableSymbolList(DNATools.getDNA(), dna));
463                baseCalls.put(Chromatogram.OFFSETS,
464                        out.createImmutableSymbolList(IntegerAlphabet.getInstance(), offsets));
465                baseCalls.put(PROB_NUC_A,
466                        out.createImmutableSymbolList(getProbabilityAlphabet(), probs[0]));
467                baseCalls.put(PROB_NUC_C,
468                        out.createImmutableSymbolList(getProbabilityAlphabet(), probs[1]));
469                baseCalls.put(PROB_NUC_G,
470                        out.createImmutableSymbolList(getProbabilityAlphabet(), probs[2]));
471                baseCalls.put(PROB_NUC_T,
472                        out.createImmutableSymbolList(getProbabilityAlphabet(), probs[3]));
473                baseCalls.put(PROB_SUBSTITUTION,
474                        out.createImmutableSymbolList(getProbabilityAlphabet(), probs[4]));
475                baseCalls.put(PROB_OVERCALL,
476                        out.createImmutableSymbolList(getProbabilityAlphabet(), probs[5]));
477                baseCalls.put(PROB_UNDERCALL,
478                        out.createImmutableSymbolList(getProbabilityAlphabet(), probs[6]));
479                out.setBaseCallAlignment(out.createImmutableAlignment(baseCalls));
480            } catch (IllegalSymbolException ise) {
481                throw new BioError(ise,"Can't happen unless the decoder is returning non-DNA symbols");
482            } catch (IllegalAlphabetException iae) {
483                throw new BioError(iae,"Can't happen");
484            }
485        }
486        
487        private static class HeaderStruct {
488            public static final int HEADER_LENGTH = 128;
489            
490            // SCF spec uses unsigned 32-bit ints, so we'll use longs for simplicity
491            public long magic_number;
492            public long samples;
493            public long samples_offset;
494            public long bases;
495            public long bases_left_clip;
496            public long bases_right_clip;
497            public long bases_offset;
498            public long comments_size;
499            public long comments_offset;
500            public char[] version;
501            public long sample_size;
502            public long code_set;
503            public long private_size;
504            public long private_offset;
505            public long[] spare;
506            
507            private HeaderStruct() {
508                version = new char[4];
509                spare = new long[18];
510            }
511            
512            public static HeaderStruct create(DataInputStream din, long
513                    initOffset) throws IOException {
514                HeaderStruct hs = new HeaderStruct();
515                if (initOffset > 4) {
516                    throw new IllegalStateException("Can't skip more than four bytes and still have enough info to read header");
517                } else if (initOffset == 0) {
518                    hs.magic_number = 0xFFFFFFFF & din.readInt();
519                } else {
520                    hs.magic_number = 0;
521                    // skip to the four-byte boundary
522                    for (int i = 0 ; i < 4 - initOffset ; i++)
523                        din.read();
524                }
525                hs.samples          = 0xFFFFFFFF & din.readInt();
526                hs.samples_offset   = 0xFFFFFFFF & din.readInt();
527                hs.bases            = 0xFFFFFFFF & din.readInt();
528                hs.bases_left_clip  = 0xFFFFFFFF & din.readInt();
529                hs.bases_right_clip = 0xFFFFFFFF & din.readInt();
530                hs.bases_offset     = 0xFFFFFFFF & din.readInt();
531                hs.comments_size    = 0xFFFFFFFF & din.readInt();
532                hs.comments_offset  = 0xFFFFFFFF & din.readInt();
533                hs.version[0]       = (char) din.readByte();
534                hs.version[1]       = (char) din.readByte();
535                hs.version[2]       = (char) din.readByte();
536                hs.version[3]       = (char) din.readByte();
537                hs.sample_size      = 0xFFFFFFFF & din.readInt();
538                hs.code_set         = 0xFFFFFFFF & din.readInt();
539                hs.private_size     = 0xFFFFFFFF & din.readInt();
540                hs.private_offset   = 0xFFFFFFFF & din.readInt();
541                for (int i = 0 ; i < hs.spare.length ; i++) {
542                    hs.spare[i] = 0xFFFFFFFF & din.readInt();
543                }
544                return hs;
545            }
546        }
547    }
548    
549    private static class V3Parser extends Parser {
550        V3Parser(DataInputStream din, SCF out,
551                SCF.Parser.HeaderStruct header, long initOffset)
552                throws IOException, UnsupportedChromatogramFormatException {
553            super(din, out, header, initOffset);
554        }
555        
556        protected void parseSamples() throws IOException,
557                UnsupportedChromatogramFormatException {
558            skipTo(header.samples_offset);
559            
560            // load values from file
561            int count = (int)header.samples;
562            int maxValAllowed = 0;
563            if(header.sample_size == 1) {
564                maxValAllowed = BYTE_MAX_VALUE;
565            } else if(header.sample_size == 2) {
566                maxValAllowed = SHORT_MAX_VALUE;
567            }
568            
569            int[][] trace = new int[4][count];
570            int[] maxVal = new int[] { Integer.MIN_VALUE,
571                    Integer.MIN_VALUE,
572                    Integer.MIN_VALUE,
573                    Integer.MIN_VALUE };
574                    for (int n = 0 ; n < 4 ; n++)
575                        readSamplesInto(trace[n]);
576                    
577                    // stored values are delta-delta values; reprocess into actual values
578                    // algorithm cribbed from io_lib's delta_samples function in misc_scf.c
579                    int[] p_sample1 = new int[] { 0, 0, 0, 0 };
580                    int[] p_sample2 = new int[] { 0, 0, 0, 0 };
581                    for (int n = 0 ; n < 4 ; n++) {
582                        for (int i = 0 ; i < trace[n].length ; i++) {
583                            //New version of the code which takes into account what the
584                            //underlying value i.e. is it a byte or is it a short
585                            //and slightly rejigged for speed and sanity sakes
586                            p_sample1[n] += trace[n][i];
587                            if(p_sample1[n] >= maxValAllowed) p_sample1[n] = p_sample1[n] -
588                                    maxValAllowed;
589                            p_sample2[n] = p_sample1[n] + p_sample2[n];
590                            if(p_sample2[n] >= maxValAllowed) p_sample2[n] = p_sample2[n] -
591                                    maxValAllowed;
592                            trace[n][i] = p_sample2[n];
593                            
594                            maxVal[n] = Math.max(maxVal[n], trace[n][i]);
595                        }
596                    }
597                    // set into output SCF chromat object
598                    try {
599                        out.setTrace(DNATools.a(), trace[0], maxVal[0]);
600                        out.setTrace(DNATools.c(), trace[1], maxVal[1]);
601                        out.setTrace(DNATools.g(), trace[2], maxVal[2]);
602                        out.setTrace(DNATools.t(), trace[3], maxVal[3]);
603                    } catch (IllegalSymbolException ise) {
604                        throw new BioError(ise,"Can't happen");
605                    }
606        }
607        
608        protected void readSamplesInto(int[] samps) throws IOException {
609            if (header.sample_size == 1) {
610                for (int i = 0 ; i < samps.length ; i++) {
611                    samps[i] = din.readUnsignedByte();
612                    offset += 1;
613                }
614            } else if (header.sample_size == 2) {
615                for (int i = 0 ; i < samps.length ; i++) {
616                    samps[i] = din.readUnsignedShort();
617                    offset += 2;
618                }
619            }
620        }
621        
622        protected void parseBases() throws IOException,
623                UnsupportedChromatogramFormatException {
624            skipTo(header.bases_offset);
625            int count = (int) header.bases;
626            
627            List<IntegerAlphabet.IntegerSymbol> offsets = new ArrayList<IntegerAlphabet.IntegerSymbol>(count);
628            @SuppressWarnings("unchecked")
629            List<IntegerAlphabet.IntegerSymbol>[] probs = new ArrayList[7];
630            for (int i = 0; i < 7; i++)
631                probs[i] = new ArrayList<IntegerAlphabet.IntegerSymbol>(count);
632            List<Symbol> dna = new ArrayList<Symbol>(count);
633            
634            long tmp;
635            try {
636                for (int i = 0 ; i < count ; i++) {
637                    // read the first set of sequential data (the trace peak offsets)
638                    tmp = 0xFFFFFFFF & din.readInt();
639                    offset += 4;
640                    if (tmp > Integer.MAX_VALUE)
641                        throw new
642                                UnsupportedChromatogramFormatException(
643                                "SCF contains a base with peak offset > " + Integer.MAX_VALUE);
644                    offsets.add(IntegerAlphabet.getInstance().getSymbol((int) tmp));
645                }
646                // read sets of probs for A, C, G, T
647                for (int j = 0 ; j < 4 ; j++) {
648                    for (int i = 0 ; i < count ; i++) {
649                        probs[j].add(getProbabilityAlphabet().getSymbol(din.readByte() & 0xFF));
650                        offset++;
651                    }
652                }
653                // read bases
654                try {
655                    for (int i = 0 ; i < count ; i++) {
656                        dna.add(decoder.decode(din.readByte()));
657                        offset++;
658                    }
659                } catch (IllegalSymbolException ise) {
660                        UnsupportedChromatogramFormatException ue = new UnsupportedChromatogramFormatException("Base call decoding failure");
661                        ue.initCause(ise);
662                        throw ue;
663                }
664                // read 'spare' probs
665                for (int j = 4 ; j < 7 ; j++) {
666                    for (int i = 0 ; i < count ; i++) {
667                        probs[j].add(getProbabilityAlphabet().getSymbol(din.readByte() & 0xFF));
668                        offset++;
669                    }
670                }
671            } catch (IllegalSymbolException ise) {
672                throw new BioError(ise,"Can't happen unless there's a misdefinition of getProbabilityAlphabet() or IntegerAlphabet");
673            }
674            // create/set base call list
675            createAndSetBaseCallAlignment(dna, offsets, probs);
676        }
677    } // end SCFv3Parser
678    
679    private static class V2Parser extends Parser {
680        V2Parser(DataInputStream din, SCF out,
681                SCF.Parser.HeaderStruct header, long initOffset)
682                throws IOException, UnsupportedChromatogramFormatException {
683            super(din, out, header, initOffset);
684        }
685        
686        protected void parseSamples() throws IOException {
687            int count = (int) header.samples;
688            int[][] trace = new int[4][count];
689            int[] maxVal = new int[] { Integer.MIN_VALUE,
690                    Integer.MIN_VALUE,
691                    Integer.MIN_VALUE,
692                    Integer.MIN_VALUE };
693                    
694                    if (header.sample_size == 1) {
695                        for (int i = 0 ; i < count ; i++) {
696                            for (int n = 0 ; n < 4 ; n++) {
697                                trace[n][i] = din.readUnsignedByte();
698                                maxVal[n] = Math.max(trace[n][i], maxVal[n]);
699                                offset++;
700                            }
701                        }
702                    } else if (header.sample_size == 2) {
703                        for (int i = 0 ; i < count ; i++) {
704                            for (int n = 0 ; n < 4 ; n++) {
705                                trace[n][i] = din.readUnsignedShort();
706                                maxVal[n] = Math.max(trace[n][i], maxVal[n]);
707                                offset += 2;
708                            }
709                        }
710                    }
711                    // set into output SCF chromat object
712                    try {
713                        out.setTrace(DNATools.a(), trace[0], maxVal[0]);
714                        out.setTrace(DNATools.c(), trace[1], maxVal[1]);
715                        out.setTrace(DNATools.g(), trace[2], maxVal[2]);
716                        out.setTrace(DNATools.t(), trace[3], maxVal[3]);
717                    } catch (IllegalSymbolException ise) {
718                        throw new BioError(ise,"Can't happen");
719                    }
720        }
721        
722        protected void parseBases() throws IOException,
723                UnsupportedChromatogramFormatException {
724            skipTo(header.bases_offset);
725            
726            int count = (int) header.bases;
727            List[] probs = new ArrayList[7];
728            for (int i = 0 ; i < probs.length ; i++) probs[i] = new
729                    ArrayList(count);
730            List dna = new ArrayList(count);
731            List offsets = new ArrayList(count);
732            long tmp;
733            byte[] probTmp = new byte[7];
734            for (int i = 0 ; i < count ; i++) {
735                // read the peak index
736                tmp = 0xFFFFFFFF & din.readInt();
737                offset += 4;
738                if (tmp > Integer.MAX_VALUE)
739                    throw new UnsupportedChromatogramFormatException(
740                            "SCF contains a base with peak offset > " + Integer.MAX_VALUE);
741                offsets.add(IntegerAlphabet.getInstance().getSymbol((int)
742                tmp));
743                
744                // read the per-base probabilities
745                din.read(probTmp, 0, 4);
746                offset += 4;
747                // read the actual base called
748                try {
749                    dna.add(decoder.decode(din.readByte()));
750                    offset += 1;
751                } catch (IllegalSymbolException ise) {
752                        UnsupportedChromatogramFormatException ue = new UnsupportedChromatogramFormatException(
753                            "Base call decoding failure");
754                        ue.initCause(ise);
755                        throw ue;
756                }
757                // read the spare probability fields
758                din.read(probTmp, 4, 3);
759                offset += 3;
760                try {
761                    for (int p = 0 ; p < 7 ; p++)
762                        probs[p].add(getProbabilityAlphabet().getSymbol(0xFF & probTmp[p]));
763                } catch (IllegalSymbolException ise) {
764                    throw new BioError(ise,"Can't happen unless getProbabilityAlphabet() has been misdefined.");
765                }
766            }
767            createAndSetBaseCallAlignment(dna, offsets, probs);
768        }
769    }
770}
771
772