001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.program.abi; 023 024import java.io.File; 025import java.io.IOException; 026import java.io.InputStream; 027import java.io.Serializable; 028import java.util.ArrayList; 029import java.util.List; 030import java.util.Map; 031 032import org.biojava.bio.BioError; 033import org.biojava.bio.chromatogram.AbstractChromatogram; 034import org.biojava.bio.chromatogram.Chromatogram; 035import org.biojava.bio.chromatogram.UnsupportedChromatogramFormatException; 036import org.biojava.bio.seq.DNATools; 037import org.biojava.bio.symbol.AtomicSymbol; 038import org.biojava.bio.symbol.IllegalAlphabetException; 039import org.biojava.bio.symbol.IllegalSymbolException; 040import org.biojava.bio.symbol.IntegerAlphabet; 041import org.biojava.bio.symbol.Symbol; 042import org.biojava.utils.SmallMap; 043 044 045/** 046 * An implementation of {@link org.biojava.bio.chromatogram.Chromatogram} to 047 * encapulsulate chromatogram data extracted from the files produced by ABI 048 * sequencers, such as the the 377 and the 3700. The format was described by 049 * Clark Tibbetts in his paper "Raw Data File Formats, and the Digital and 050 * Analog Raw Data Streams of the ABI PRISM 377 DNA Sequencer." Available 051 * online <kbd><a href="http://www-2.cs.cmu.edu/afs/cs/project/genome/WWW/Papers/clark.html"> 052 * http://www-2.cs.cmu.edu/afs/cs/project/genome/WWW/Papers/clark.html</a></kbd> 053 * 054 * @author Rhett Sutphin (<a href="http://genome.uiowa.edu/">UI CBCB</a>) 055 * @author Richard Holland 056 * @see ABIFParser 057 */ 058public class ABIFChromatogram extends AbstractChromatogram implements Serializable { 059 public ABIFChromatogram() { 060 super(); 061 } 062 063 /** Create a new ABIF object from a file. 064 * <p> 065 * This method is more efficent than {@link #create(InputStream)}. 066 * </p> 067 */ 068 public static ABIFChromatogram create(File f) 069 throws IOException, UnsupportedChromatogramFormatException { 070 ABIFChromatogram newOne = new ABIFChromatogram(); 071 newOne.load(f); 072 return newOne; 073 } 074 075 /** 076 * Create a new ABIF object from a stream of bytes. 077 * <p> 078 * Due to the non-single-pass design of the ABI format, this method will 079 * wrap the InputStream in an {@link org.biojava.utils.io.CachingInputStream}. 080 * For this reason, {@link #create(File)} should be preferred. 081 * </p> 082 * @param in the stream from which to read 083 * @return a new ABIFChromatogram object 084 * @throws IOException if there is a problem with the underlying stream 085 */ 086 public static ABIFChromatogram create(InputStream in) 087 throws IOException, UnsupportedChromatogramFormatException { 088 ABIFChromatogram newOne = new ABIFChromatogram(); 089 newOne.load(in); 090 return newOne; 091 } 092 093 protected ABIFChromatogram load(File f) 094 throws IOException, UnsupportedChromatogramFormatException { 095 new Parser(f); 096 return this; 097 } 098 099 protected ABIFChromatogram load(InputStream in) 100 throws IOException, UnsupportedChromatogramFormatException { 101 new Parser(in); 102 return this; 103 } 104 105 protected AbstractChromatogram reverseComplementInstance() { 106 return new ABIFChromatogram(); 107 } 108 109 /** 110 * An extension of {@link ABIFParser} that reads the particular fields from 111 * the ABIF that contain the chromatogram data and initializes the fields 112 * in its enclosing <code>ABIFChromatogram</code> instance. 113 */ 114 protected class Parser extends ABIFParser { 115 public Parser(InputStream in) 116 throws IOException, UnsupportedChromatogramFormatException { 117 super(in); 118 parse(); 119 } 120 121 public Parser(File f) 122 throws IOException, UnsupportedChromatogramFormatException { 123 super(f); 124 parse(); 125 } 126 127 private final void parse() 128 throws IOException, UnsupportedChromatogramFormatException { 129 // read filter-wheel-order tag 130 char[] fwo_ = new char[4]; 131 ABIFParser.TaggedDataRecord fwoRec = getDataRecord("FWO_", 1); 132 if (fwoRec == null) 133 throw new UnsupportedChromatogramFormatException("No FWO_ (1) record in ABIF file, therefore no trace data"); 134 fwo_[0] = (char) ( (fwoRec.dataRecord >>> 24) & 0xff ); 135 fwo_[1] = (char) ( (fwoRec.dataRecord >>> 16) & 0xff ); 136 fwo_[2] = (char) ( (fwoRec.dataRecord >>> 8 ) & 0xff ); 137 fwo_[3] = (char) ( (fwoRec.dataRecord ) & 0xff ); 138 139 Symbol sym; 140 clearTraces(); 141 for (int i = 0 ; i < 4 ; i++) { 142 143 try { 144 sym = ABIFParser.decodeDNAToken(fwo_[i]); 145 } catch (IllegalSymbolException ise) { 146 throw new UnsupportedChromatogramFormatException("An unexpected character (" + fwo_[i] +") was found in the FWO_ tag. Parsing cannot continue."); 147 } 148 if (!(sym instanceof AtomicSymbol)) { 149 throw new UnsupportedChromatogramFormatException("An unexpected character (" + fwo_[i] +") was found in the FWO_ tag. Parsing cannot continue."); 150 } 151 parseTrace((AtomicSymbol) sym, i+9); 152 } 153 parseBaseCalls(); 154 getDataAccess().finishedReading(); 155 } 156 157 private void parseTrace(AtomicSymbol sym, int whichData) throws IOException, UnsupportedChromatogramFormatException { 158 TaggedDataRecord dataPtr = getDataRecord("DATA", whichData); 159 if (dataPtr.numberOfElements > Integer.MAX_VALUE) 160 throw new UnsupportedChromatogramFormatException("Chromatogram has more than " + Integer.MAX_VALUE + " trace samples -- can't handle it"); 161 int count = (int) dataPtr.numberOfElements; 162 int[] trace = new int[count]; 163 int max = -1; 164 setBits(8*dataPtr.elementLength); 165 if (dataPtr.elementLength == 2) { 166 byte[] shortArray = dataPtr.offsetData; 167 int i = 0; 168 for (int s = 0; s < shortArray.length; s += 2) { 169 trace[i] = ((short)((shortArray[s] << 8) | (shortArray[s + 1] & 0xff))) & 0xffff; 170 max = Math.max(trace[i++], max); 171 } 172 } 173 else if (dataPtr.elementLength == 1) { 174 byte[] byteArray = dataPtr.offsetData; 175 for (int i = 0; i < byteArray.length; i++) { 176 trace[i] = byteArray[i] & 0xff; 177 max = Math.max(trace[i], max); 178 } 179 } 180 else { 181 throw new UnsupportedChromatogramFormatException("Only 8- and 16-bit trace samples are supported"); 182 } 183 184 try { 185 setTrace(sym, trace, max); 186 } catch (IllegalSymbolException ise) { 187 throw new BioError("Can't happen", ise); 188 } 189 } 190 191 private void parseBaseCalls() throws IOException, UnsupportedChromatogramFormatException { 192 // do offsets, then call letters 193 // offsets are in PLOC1 (we'll use the possibly-edited stream) 194 TaggedDataRecord offsetsPtr = getDataRecord("PLOC", 1); 195 // call letters are int PBAS1 196 TaggedDataRecord basesPtr = getDataRecord("PBAS", 1); 197 // these should be equal, but just in case... 198 if (offsetsPtr.numberOfElements != basesPtr.numberOfElements) 199 throw new BioError("PLOC and PBAS are different lengths. Can't proceed."); 200 if (offsetsPtr.numberOfElements > Integer.MAX_VALUE) 201 throw new UnsupportedChromatogramFormatException("Chromatogram has more than " + Integer.MAX_VALUE + " base calls -- can't handle it"); 202 int count = (int) offsetsPtr.numberOfElements; 203 // the list of called bases 204 List dna = new ArrayList(count); 205 // the list of offsets 206 List offsets = new ArrayList(count); 207 // start reading offsets, creating SimpleBaseCalls along the way 208 if (offsetsPtr.elementLength == 2) { 209 byte[] shortArray = offsetsPtr.offsetData; 210 IntegerAlphabet integerAlphabet = IntegerAlphabet.getInstance(); 211 for (int s = 0; s < shortArray.length; s += 2) { 212 offsets.add(integerAlphabet.getSymbol(((short)((shortArray[s] << 8) | (shortArray[s + 1] & 0xff))) & 0xffff)); 213 } 214 } 215 else if (offsetsPtr.elementLength == 1) { 216 byte[] byteArray = offsetsPtr.offsetData; 217 IntegerAlphabet integerAlphabet = IntegerAlphabet.getInstance(); 218 for (int i = 0 ; i < byteArray.length; i++) { 219 offsets.add(integerAlphabet.getSymbol(byteArray[i] & 0xff)); 220 } 221 } 222 else { 223 throw new IllegalStateException("Only 8- and 16-bit trace samples are supported"); 224 } 225 226 // then read the base calls 227 try { 228 byte[] byteArray = basesPtr.offsetData; 229 for (int i = 0; i < byteArray.length; i++) { 230 dna.add(ABIFParser.decodeDNAToken((char) byteArray[i])); 231 } 232 } catch (IllegalSymbolException ise) { 233 throw new BioError("Can't happen", ise); 234 } 235 // create the base call alignment and set it 236 try { 237 Map baseCalls = new SmallMap(2); 238 baseCalls.put(Chromatogram.DNA, createImmutableSymbolList(DNATools.getDNA(), dna)); 239 baseCalls.put(Chromatogram.OFFSETS, createImmutableSymbolList(IntegerAlphabet.getInstance(), offsets)); 240 setBaseCallAlignment(createImmutableAlignment(baseCalls)); 241 } catch (IllegalAlphabetException iae) { 242 throw new BioError("Can't happen", iae); 243 } catch (IllegalSymbolException ise) { 244 throw new BioError("Can't happen", ise); 245 } 246 } 247 } 248}