001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.program.abi; 023 024import java.io.DataInput; 025import java.io.DataInputStream; 026import java.io.File; 027import java.io.FileNotFoundException; 028import java.io.IOException; 029import java.io.InputStream; 030import java.util.Collections; 031import java.util.HashMap; 032import java.util.Iterator; 033import java.util.Map; 034 035import org.biojava.bio.seq.DNATools; 036import org.biojava.bio.symbol.IllegalSymbolException; 037import org.biojava.bio.symbol.Symbol; 038import org.biojava.utils.io.CachingInputStream; 039import org.biojava.utils.io.Seekable; 040 041/** 042 * A general base parser for files produced by ABI software. This includes 043 * chromatograms derived from ABI sequencers and potentially other data files 044 * as well. The format was described by Clark Tibbetts in his paper "Raw Data 045 * File Formats, and the Digital and Analog Raw Data Streams of the ABI PRISM 046 * 377 DNA Sequencer." Available online 047 * <kbd><a href="http://www-2.cs.cmu.edu/afs/cs/project/genome/WWW/Papers/clark.html"> 048 * http://www-2.cs.cmu.edu/afs/cs/project/genome/WWW/Papers/clark.html</a></kbd> 049 * <p> 050 * Briefly, the format consists of a set of named fixed-length "tagged data 051 * records" which may contain data themselves, or pointers to data elsewhere 052 * in the file. This class reads these records and exposes them to subclasses 053 * through the {@link #getDataRecord} method. The attributes of the records as 054 * described in Tibbets' paper are exposed through public (final) fields of 055 * {@link TaggedDataRecord} instances. 056 * </p> 057 * <p> 058 * If a record only contains a pointer to the desired data (see 059 * {@link TaggedDataRecord#hasOffsetData}, subclasses may get 060 * at the raw data by using {@link TaggedDataRecord#offsetData}: 061 * </p> 062 * <p> 063 * This parser provides methods and classes for dealing with the files as 064 * streams or local files (local files being more memory-efficient). 065 * </p> 066 * 067 * @author Rhett Sutphin (<a href="http://genome.uiowa.edu/">UI CBCB</a>) 068 * @author Richard Holland 069 */ 070public class ABIFParser { 071 private ABIFParser.DataAccess din; 072 private boolean parsed = false; 073 private Map records; 074 075 private final int RECORD_COUNT_OFFSET = 18; 076 private final int RECORD_OFFSET_OFFSET = 26; 077 078 /** Creates a new ABIFParser for a file. */ 079 public ABIFParser(File f) throws IOException { 080 this(new ABIFParser.RandomAccessFile(f)); 081 } 082 083 /** 084 * Creates a new ABIFParser for an input stream. Note that the stream 085 * will be wrapped in a {@link CachingInputStream} if it isn't one already. 086 * If it is, it will be seeked to 0. 087 */ 088 public ABIFParser(InputStream in) throws IOException { 089 this(new ABIFParser.DataStream(in)); 090 } 091 092 /** 093 * Creates a new ABIFParser for the specified {@link DataAccess} object. 094 * If you need to read from something other than a file or a stream, you'll 095 * have to implement a {@link DataAccess}-implementing class wrapping your 096 * source and then pass an instance to this constructor. 097 */ 098 public ABIFParser(ABIFParser.DataAccess toParse) throws IOException { 099 din = toParse; 100 readDataRecords(); 101 } 102 103 /** 104 * Returns the accessor for the raw data being parsed by this parser. 105 */ 106 public final ABIFParser.DataAccess getDataAccess() { 107 return din; 108 } 109 110 private final void readDataRecords() throws IOException { 111 parsed = false; 112 din.seek(RECORD_COUNT_OFFSET); 113 long recordCount = 0xffffffff & din.readInt(); 114 din.seek(RECORD_OFFSET_OFFSET); 115 long recordOffset = 0xffffffff & din.readInt(); 116 din.seek(recordOffset); 117 TaggedDataRecord tdr; 118 StringBuffer label; 119 records = new HashMap(); 120 for (int i = 0 ; i < recordCount ; i++) { 121 tdr = new TaggedDataRecord(din); 122 label = new StringBuffer(6).append(tdr.tagName).append(tdr.tagNumber); 123 records.put(label.substring(0), tdr); 124 } 125 for (Iterator i = records.values().iterator(); i.hasNext(); ) { 126 TaggedDataRecord record = (TaggedDataRecord)i.next(); 127 if (record.hasOffsetData) { 128 din.seek(record.dataRecord); 129 din.readFully(record.offsetData); 130 } 131 } 132 parsed = true; 133 din.finishedReading(); 134 } 135 136 /** 137 * Decodes a character into a {@link Symbol} in the DNA alphabet. 138 * Uses a definition of characters that is compatible with the ABI format. 139 * @param token the character to decode 140 * @throws IllegalSymbolException when token isn't in 141 * <code>{ a, A, c, C, g, G, t, T, n, N, - }</code> 142 */ 143 public static Symbol decodeDNAToken(char token) throws IllegalSymbolException { 144 switch (token) { 145 case 'a': case 'A': 146 return DNATools.a(); 147 case 'c': case 'C': 148 return DNATools.c(); 149 case 'g': case 'G': 150 return DNATools.g(); 151 case 't': case 'T': 152 return DNATools.t(); 153 case 'n': case 'N': 154 return DNATools.n(); 155 case '-': 156 return DNATools.getDNA().getGapSymbol(); 157 default: 158 throw new IllegalSymbolException("Can't decode token " + token + " into DNA"); 159 } 160 } 161 162 /** 163 * Get the entry from the file TOC with the given name and tag number. 164 * @param tagName the four-character string name of the desired data record 165 * @param tagNumber which one of the tags with this name to return (must be positive) 166 * @throws IllegalArgumentException if tagName is the wrong length or tagNumber 167 * is 0 or negative 168 * @throws IllegalStateException if the initial parsing is not complete 169 * @return the requested data record, or null if no such record exists 170 */ 171 public ABIFParser.TaggedDataRecord getDataRecord(String tagName, int tagNumber) 172 throws IllegalArgumentException, IllegalStateException { 173 if (!parsed) 174 throw new IllegalStateException("parsing is not complete"); 175 if (tagNumber < 1) 176 throw new IllegalArgumentException("tagNumber must be positive"); 177 if (tagName.length() != 4) 178 throw new IllegalArgumentException("tagName must be 4 characters long"); 179 return (ABIFParser.TaggedDataRecord) records.get(tagName + tagNumber); 180 } 181 182 /** 183 * Obtain all data records. Keys of the map are strings consisting of 184 * tag names with tag numbers concatenated immediately afterwards. Values 185 * are TaggedDataRecord objects. The map has no particular order and so 186 * cannot be relied on to iterate over records in the same order they 187 * were read from the file. 188 * @return the map of all data records. 189 */ 190 public Map getAllDataRecords() { 191 return Collections.unmodifiableMap(records); 192 } 193 194 /** 195 * An aggregate immutable type for an ABIF tagged data record. See the 196 * Tibbets paper (referenced in the javadoc for {@link ABIFParser}) for 197 * more information. 198 */ 199 public static class TaggedDataRecord { 200 public static final int DATA_TYPE_ASCII_ARRAY = 2; 201 public static final int DATA_TYPE_INTEGER = 4; 202 public static final int DATA_TYPE_FLOAT = 7; 203 public static final int DATA_TYPE_DATE = 10; 204 public static final int DATA_TYPE_TIME = 11; 205 public static final int DATA_TYPE_PSTRING = 18; 206 207 public final char[] tagName; 208 public final long tagNumber; 209 public final int dataType; 210 public final int elementLength; 211 public final long numberOfElements; 212 public final long recordLength; 213 public final long dataRecord; 214 public final long crypticVariable; 215 public final boolean hasOffsetData; 216 public final byte[] offsetData; 217 218 /** 219 * Creates a new TaggedDataRecord from the next 28 bytes of 220 * <code>din</code>. 221 * @param din the source of the raw data to be parsed 222 * @throws IOException if there's a problem with <code>din</code> 223 */ 224 public TaggedDataRecord(ABIFParser.DataAccess din) throws IOException { 225 tagName = new char[4]; 226 tagName[0] = (char) din.readByte(); 227 tagName[1] = (char) din.readByte(); 228 tagName[2] = (char) din.readByte(); 229 tagName[3] = (char) din.readByte(); 230 231 tagNumber = 0xffffffff & din.readInt(); 232 dataType = 0xffff & din.readShort(); 233 elementLength = 0xffff & din.readShort(); 234 numberOfElements = 0xffffffff & din.readInt(); 235 recordLength = 0xffffffff & din.readInt(); 236 dataRecord = 0xffffffff & din.readInt(); 237 crypticVariable = 0xffffffff & din.readInt(); 238 239 hasOffsetData = recordLength>4L; 240 if (hasOffsetData) 241 offsetData = new byte[(int)recordLength]; 242 else 243 offsetData = new byte[0]; 244 } 245 246 /** 247 * A very verbose <code>toString</code> that dumps all of the 248 * data in this record in a human-readable format. 249 */ 250 public String toString() { 251 StringBuffer sb = new StringBuffer(super.toString()).append("[\n"); 252 sb.append(" tagName = ").append(tagName).append('\n'); 253 sb.append(" tagNumber = ").append(tagNumber).append('\n'); 254 sb.append(" dataType = "); 255 switch (dataType) { 256 case DATA_TYPE_ASCII_ARRAY: sb.append("ASCII"); break; 257 case DATA_TYPE_INTEGER: sb.append("INTEGER"); break; 258 case DATA_TYPE_FLOAT: sb.append("FLOAT"); break; 259 case DATA_TYPE_DATE: sb.append("DATE"); break; 260 case DATA_TYPE_TIME: sb.append("TIME"); break; 261 case DATA_TYPE_PSTRING: sb.append("PSTRING"); break; 262 default: sb.append(dataType); 263 } 264 sb.append('\n'); 265 sb.append(" elementLength = ").append(elementLength).append('\n'); 266 sb.append(" numberOfElements= ").append(numberOfElements).append('\n'); 267 sb.append(" recordLength = ").append(recordLength).append('\n'); 268 sb.append(" dataRecord = "); 269 if (recordLength <= 4) { 270 switch (dataType) { 271 case DATA_TYPE_ASCII_ARRAY: 272 if (recordLength > 3) 273 sb.append((char) ((dataRecord >>> 24) & 0xFF)); 274 if (recordLength > 2) 275 sb.append((char) ((dataRecord >>> 16) & 0xFF)); 276 if (recordLength > 1) 277 sb.append((char) ((dataRecord >>> 8 ) & 0xFF)); 278 sb.append((char) ((dataRecord) & 0xFF)); 279 break; 280 case DATA_TYPE_DATE: 281 sb.append((dataRecord >>> 16) & 0xffff).append('/'); 282 sb.append((dataRecord >>> 8 ) & 0xff).append('/'); 283 sb.append((dataRecord) & 0xff); 284 break; 285 case DATA_TYPE_TIME: 286 sb.append((dataRecord >>> 24) & 0xff).append(':'); 287 sb.append((dataRecord >>> 16) & 0xff).append(':'); 288 sb.append((dataRecord >>> 8 ) & 0xff); 289 break; 290 case DATA_TYPE_INTEGER: 291 sb.append(dataRecord >>> (4 - recordLength)*8); 292 break; 293 default: 294 hexStringify((int)dataRecord, sb); 295 } 296 } 297 else { 298 hexStringify((int)dataRecord, sb); 299 } 300 sb.append(" hasOffsetData = ").append(hasOffsetData).append('\n'); 301 sb.append('\n'); 302 sb.append(" crypticVariable = ").append(crypticVariable).append('\n'); 303 sb.append(']'); 304 return sb.toString(); 305 } 306 307 private void hexStringify(int l, StringBuffer sb) { 308 sb.append("0x"); 309 String hex = Integer.toHexString(l).toUpperCase(); 310 for (int i = 8 ; i > hex.length() ; i--) 311 sb.append('0'); 312 sb.append(hex); 313 } 314 } 315 316 /** 317 * Concatenation of the {@link Seekable} and {@link DataInput} interfaces. 318 */ 319 public static interface DataAccess extends Seekable, DataInput { 320 /** 321 * Called when the parser has finished reading. The access 322 * may choose to close itself at this point, e.g. if it is 323 * using a RandomAccessFile. 324 * @throws IOException if it could not do what it needs to. 325 */ 326 public void finishedReading() throws IOException; 327 } 328 329 private static class RandomAccessFile 330 extends java.io.RandomAccessFile implements DataAccess { 331 public RandomAccessFile(File f) throws FileNotFoundException { 332 super(f, "r"); 333 } 334 public void finishedReading() throws IOException { 335 this.close(); 336 } 337 } 338 339 /** Implements DataAccess by delegating to a CachingStream and a 340 * DataInputStream */ 341 private static class DataStream implements DataAccess { 342 CachingInputStream cin; 343 DataInputStream din; 344 345 public DataStream(InputStream src) throws IOException { 346 if (src instanceof CachingInputStream) 347 cin = (CachingInputStream) src; 348 else 349 cin = new CachingInputStream(src); 350 cin.seek(0); 351 din = new DataInputStream(cin); 352 } 353 354 public DataStream(CachingInputStream cin) throws IOException { 355 this((InputStream) cin); 356 } 357 358 public void finishedReading() throws IOException { 359 // We don't care. 360 } 361 362 public boolean readBoolean() throws IOException { return din.readBoolean(); } 363 public byte readByte() throws IOException { return din.readByte(); } 364 public char readChar() throws IOException { return din.readChar(); } 365 public short readShort() throws IOException { return din.readShort(); } 366 public int readInt() throws IOException { return din.readInt(); } 367 public long readLong() throws IOException { return din.readLong(); } 368 public float readFloat() throws IOException { return din.readFloat(); } 369 public double readDouble() throws IOException { return din.readDouble(); } 370 public String readUTF() throws IOException { return din.readUTF(); } 371 372 public int readUnsignedByte() throws IOException { return din.readUnsignedByte(); } 373 public int readUnsignedShort() throws IOException { return din.readUnsignedShort(); } 374 375 public void readFully(byte[] values) throws IOException { 376 din.readFully(values); 377 } 378 379 public void readFully(byte[] values, int start, int len) throws IOException { 380 din.readFully(values, start, len); 381 } 382 383 public String readLine() throws IOException { 384 throw new UnsupportedOperationException("DataInputStream#readLine is deprecated. Use readUTF instead"); 385 } 386 387 public int skipBytes(int count) throws IOException { return din.skipBytes(count); } 388 389 public void seek(long pos) throws IOException { 390 cin.seek(pos); 391 } 392 } 393}