001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.program.scf; 023 024import java.io.BufferedReader; 025import java.io.ByteArrayInputStream; 026import java.io.DataInputStream; 027import java.io.File; 028import java.io.FileInputStream; 029import java.io.IOException; 030import java.io.InputStream; 031import java.io.InputStreamReader; 032import java.util.ArrayList; 033import java.util.Iterator; 034import java.util.List; 035import java.util.Map; 036import java.util.Properties; 037import java.util.TreeMap; 038 039import org.biojava.bio.BioError; 040import org.biojava.bio.chromatogram.AbstractChromatogram; 041import org.biojava.bio.chromatogram.Chromatogram; 042import org.biojava.bio.chromatogram.UnsupportedChromatogramFormatException; 043import org.biojava.bio.seq.DNATools; 044import org.biojava.bio.symbol.IllegalAlphabetException; 045import org.biojava.bio.symbol.IllegalSymbolException; 046import org.biojava.bio.symbol.IntegerAlphabet; 047import org.biojava.bio.symbol.Symbol; 048import org.biojava.bio.symbol.SymbolList; 049import org.biojava.bio.symbol.SymbolListViews; 050import org.biojava.utils.SmallMap; 051 052 053 054/** 055 * A {@link org.biojava.bio.chromatogram.Chromatogram} as loaded from an 056 * SCF v2 or v3 file. Also loads and exposes the SCF format's "private data" 057 * and "comments" sections. The quality values from the SCF are stored as 058 * additional sequences on the base call alignment. The labels are the 059 * <code>PROB_</code>* constants in this class. 060 * The values are {@link org.biojava.bio.symbol.IntegerAlphabet.IntegerSymbol} 061 * objects in the range 0 to 255. 062 * 063 * 064 * @author Rhett Sutphin (<a href="http://genome.uiowa.edu/">UI CBCB</a>) 065 */ 066public class SCF extends AbstractChromatogram { 067 private byte[] privateData; 068 private Properties comments; 069 070 private static final IntegerAlphabet.SubIntegerAlphabet 071 PROBABILITY_ALPHABET = 072 IntegerAlphabet.getSubAlphabet(0, 255); 073 074 /** Represents the maximum unsigned value 075 * of a byte for wrapping purposes */ 076 public static final int BYTE_MAX_VALUE = 077 256; 078 079 /** Represents the maximum unsigned value 080 * of a short for wrapping purposes */ 081 public static final int SHORT_MAX_VALUE = 082 65536; 083 084 /** Base call alignment sequence label for the probability that call 085 * should be A. */ 086 public static final String PROB_NUC_A = "quality-a"; 087 /** Base call alignment sequence label for the probability that call 088 * should be C. */ 089 public static final String PROB_NUC_C = "quality-c"; 090 /** Base call alignment sequence label for the probability that call 091 * should be G. */ 092 public static final String PROB_NUC_G = "quality-g"; 093 /** Base call alignment sequence label for the probability that call 094 * should be T. */ 095 public static final String PROB_NUC_T = "quality-t"; 096 /** 097 * Base call alignment sequence label for the substitution 098 * probability. In versions of the SCF spec before 3.10, this is called 099 * spareQual[0]. 100 */ 101 public static final Object PROB_SUBSTITUTION = 102 "substitution-probability"; 103 /** 104 * Base call alignment sequence label for the overcall probability. 105 * In versions of the SCF spec before 3.10, this is called 106 * spareQual[1]. 107 */ 108 public static final Object PROB_OVERCALL = "overcall-probability"; 109 /** 110 * Base call alignment sequence label for the undercall probability. 111 * In versions of the SCF spec before 3.10, this is called 112 * spareQual[2]. 113 */ 114 public static final Object PROB_UNDERCALL = 115 "undercall-probability"; 116 117 /** Creates a new, completely empty SCF. */ 118 protected SCF() { 119 super(); 120 comments = new Properties(); 121 } 122 123 public static SCF create(File f) 124 throws IOException, UnsupportedChromatogramFormatException { 125 SCF out = new SCF(); 126 out.load(f); 127 return out; 128 } 129 130 public static SCF create(InputStream in, long alreadyRead) 131 throws IOException, UnsupportedChromatogramFormatException { 132 SCF out = new SCF(); 133 out.load(in, alreadyRead); 134 return out; 135 } 136 137 protected void load(File f) throws IOException, 138 UnsupportedChromatogramFormatException { 139 FileInputStream fin = new FileInputStream(f); 140 try { 141 load(fin, 0); 142 } finally { 143 fin.close(); 144 } 145 } 146 147 protected void load(InputStream in, long initOffset) 148 throws IOException, UnsupportedChromatogramFormatException { 149 SCF.ParserFactory.parse(in, this, initOffset); 150 } 151 152 /** 153 * Returns the comments fields as a {@link Properties} mapping. 154 */ 155 public Properties getComments() { return comments; } 156 157 protected AbstractChromatogram reverseComplementInstance() { return 158 new SCF(); } 159 160 public static IntegerAlphabet.SubIntegerAlphabet 161 getProbabilityAlphabet() { return PROBABILITY_ALPHABET; } 162 163 /** 164 * Overrides {@link 165 * AbstractChromatogram#reverseComplementBaseCallList} to 166 * support the 7 quality values from the SCF. These are handled thus: 167 * <ul> 168 * <li><code>PROB_SUBSTITUTION</code>, <code>PROB_OVERCALL</code>, and 169 * <code>PROB_UNDERCALL</code> are just reversed &returned.</li> 170 * <li><code>PROB_NUC_</code>* returns the reverse of the quality 171 * sequence for the complement base.</li> 172 * </ul> 173 */ 174 protected SymbolList reverseComplementBaseCallList(String label) { 175 if (label == PROB_SUBSTITUTION || 176 label == PROB_OVERCALL || 177 label == PROB_UNDERCALL) { 178 return 179 SymbolListViews.reverse(this.getBaseCalls().symbolListForLabel(label)); 180 } else if (label.equals( PROB_NUC_A)) { 181 return 182 SymbolListViews.reverse(this.getBaseCalls().symbolListForLabel(PROB_NUC_T)); 183 } else if (label.equals(PROB_NUC_C)) { 184 return 185 SymbolListViews.reverse(this.getBaseCalls().symbolListForLabel(PROB_NUC_G)); 186 } else if (label.equals(PROB_NUC_G)) { 187 return 188 SymbolListViews.reverse(this.getBaseCalls().symbolListForLabel(PROB_NUC_C)); 189 } else if (label.equals( PROB_NUC_T)) { 190 return 191 SymbolListViews.reverse(this.getBaseCalls().symbolListForLabel(PROB_NUC_A)); 192 } else { 193 return super.reverseComplementBaseCallList(label); 194 } 195 } 196 197 /*** INNER CLASSES FOR PARSER ***/ 198 199 /** Factory class to create the appropriate parser for the given 200 * stream. This decision is based on the version field in the file's header. */ 201 private static class ParserFactory { 202 public static void parse(InputStream in, SCF out, long initOffset) 203 throws IOException, UnsupportedChromatogramFormatException { 204 DataInputStream din = new DataInputStream(in); 205 SCF.Parser parser = createParser(din, out, initOffset); 206 parser.parse(); 207 } 208 209 public static SCF.Parser createParser(DataInputStream din, SCF 210 out, long initOffset) 211 throws UnsupportedChromatogramFormatException, IOException { 212 // read the header to find out the version 213 long offset = initOffset; 214 SCF.Parser.HeaderStruct header = 215 SCF.Parser.HeaderStruct.create(din, initOffset); 216 offset = SCF.Parser.HeaderStruct.HEADER_LENGTH; 217 out.setBits((int)header.sample_size * 8); 218 SCF.Parser parser; 219 float version; 220 try { 221 version = Float.parseFloat(new String(header.version)); 222 } catch (NumberFormatException e) { 223 throw new UnsupportedChromatogramFormatException( 224 "The SCF's version (" + new String(header.version) + 225 ") is not a number"); 226 } 227 if (version < 3.0f && version >= 2.0f) { 228 parser = new SCF.V2Parser(din, out, header, offset); 229 } else if (version >= 3.0f) { 230 parser = new SCF.V3Parser(din, out, header, offset); 231 } else { 232 throw new UnsupportedChromatogramFormatException( 233 "Only version 2 and version 3 SCFs are supported (not " 234 + new String(header.version)); 235 } 236 return parser; 237 } 238 } 239 240 static interface BaseCallUncertaintyDecoder { 241 /** 242 * Returns an appropriate Symbol from the DNA alphabet for 243 * an encoded byte. 244 */ 245 public Symbol decode(byte call) throws IllegalSymbolException; 246 } 247 248 /** 249 * A BaseCallUncertaintyDecoder that works for type 0 (default) and 250 * type 4 (ABI) 251 * code sets. 252 */ 253 static class DefaultUncertaintyDecoder implements 254 BaseCallUncertaintyDecoder { 255 public DefaultUncertaintyDecoder() { } 256 public Symbol decode(byte call) throws IllegalSymbolException { 257 char c = (char) call; 258 switch (c) { 259 case 'a': 260 case 'A': 261 return DNATools.a(); 262 case 'c': 263 case 'C': 264 return DNATools.c(); 265 case 'g': 266 case 'G': 267 return DNATools.g(); 268 case 't': 269 case 'T': 270 return DNATools.t(); 271 case 'n': 272 case 'N': 273 return DNATools.n(); 274 case 'm': 275 case 'M': 276 return DNATools.m(); 277 case 'r': 278 case 'R': 279 return DNATools.r(); 280 case 'w': 281 case 'W': 282 return DNATools.w(); 283 case 's': 284 case 'S': 285 return DNATools.s(); 286 case 'y': 287 case 'Y': 288 return DNATools.y(); 289 case 'k': 290 case 'K': 291 return DNATools.k(); 292 case 'v': 293 case 'V': 294 return DNATools.v(); 295 case 'h': 296 case 'H': 297 return DNATools.h(); 298 case 'd': 299 case 'D': 300 return DNATools.d(); 301 case 'b': 302 case 'B': 303 return DNATools.b(); 304 case '-': 305 case '.': 306 return DNATools.getDNA().getGapSymbol(); 307 default: 308 throw new IllegalSymbolException("No Symbol for " + c); 309 } 310 } 311 } 312 313 abstract static class Parser { 314 protected long offset = 0; 315 protected DataInputStream din = null; 316 protected HeaderStruct header; 317 protected BaseCallUncertaintyDecoder decoder; 318 protected SCF out = null; 319 protected boolean parsed = false; 320 321 Parser(DataInputStream din, SCF out, 322 SCF.Parser.HeaderStruct header, long initOffset) 323 throws UnsupportedChromatogramFormatException { 324 if (din == null) 325 throw new IllegalArgumentException( 326 "Can't parse a null inputstream"); 327 this.din = din; 328 329 if (out == null) 330 this.out = new SCF(); 331 else 332 this.out = out; 333 334 if (header.samples > Integer.MAX_VALUE) 335 throw new UnsupportedChromatogramFormatException( 336 "Can't parse an SCF with more than " + Integer.MAX_VALUE + " trace samples"); 337 338 if (header.bases > Integer.MAX_VALUE) 339 throw new UnsupportedChromatogramFormatException( 340 "Can't parse an SCF with more than " + Integer.MAX_VALUE + " called bases"); 341 342 this.header = header; 343 this.decoder = createDecoder(header.code_set); 344 this.offset = initOffset; 345 } 346 347 /** 348 * Factory method to create an approriate decoder for the code 349 * set. Currently, only a direct interpretation of the encoded byte 350 * as an ASCII character is supported (see DefaultUncertaintyDecoder). 351 * This interpretation will be used for all code sets, but a 352 * warning will be printed if the code set is not known to work with this 353 * interpretation. 354 */ 355 private static BaseCallUncertaintyDecoder createDecoder(long 356 codeSet) { 357 if (codeSet != 0 && codeSet != 2 && codeSet != 4) 358 System.err.println("Warning: the code set (" + codeSet + 359 ") is not specifically supported. (It may still work, though.)"); 360 return new DefaultUncertaintyDecoder(); 361 } 362 363 public SCF getParsed() { 364 if (parsed) return out; 365 else return null; 366 } 367 368 public void parse() throws IOException, 369 UnsupportedChromatogramFormatException { 370 parsed = false; 371 // sort the sections of the file by ascending offset 372 Integer SAMPLES = new Integer(0), 373 BASES = new Integer(1), 374 COMMENTS = new Integer(2), 375 PRIVATE = new Integer(3); 376 TreeMap<Long, Integer> sectionOrder = new TreeMap<Long, Integer>(); 377 sectionOrder.put(new Long(header.samples_offset), SAMPLES); 378 sectionOrder.put(new Long(header.bases_offset), BASES); 379 if (header.comments_size > 0) { 380 sectionOrder.put(new Long(header.comments_offset), COMMENTS); 381 } 382 if (header.private_size > 0) { 383 sectionOrder.put(new Long(header.private_offset), PRIVATE); 384 } 385 386 for (Iterator<Long> it = sectionOrder.keySet().iterator() ; 387 it.hasNext() ;) { 388 Integer sect = sectionOrder.get(it.next()); 389 if (sect == SAMPLES) parseSamples(); 390 else if (sect == BASES) parseBases(); 391 else if (sect == COMMENTS) parseComments(); 392 else if (sect == PRIVATE) parsePrivate(); 393 } 394 parsed = true; 395 } 396 397 protected abstract void parseSamples() throws IOException, 398 UnsupportedChromatogramFormatException; 399 protected abstract void parseBases() throws IOException, 400 UnsupportedChromatogramFormatException; 401 402 protected void parseComments() throws IOException { 403 skipTo(header.comments_offset); 404 byte[] raw = new byte[(int)header.comments_size - 1]; 405 int read = din.read(raw, 0, raw.length); 406 offset += read; 407 BufferedReader r = new BufferedReader( 408 new InputStreamReader( 409 new ByteArrayInputStream(raw), 410 "ISO-8859-1" 411 ) 412 ); 413 String line, key, value; 414 int eqIdx = -1; 415 while ((line = r.readLine()) != null) { 416 eqIdx = line.indexOf('='); 417 //added line below to skip any truncated comment fields 418 if( eqIdx == -1 ) { 419 continue; 420 421 } 422 key = line.substring(0, eqIdx); 423 value = line.substring(eqIdx+1); 424 out.comments.setProperty(key, value); 425 } 426 } 427 428 protected void parsePrivate() throws IOException { 429 skipTo(header.private_offset); 430 out.privateData = new byte[(int)header.private_size]; 431 int privRead = 0; 432 int thisRead = 0; 433 while (privRead < out.privateData.length && thisRead >= 0) { 434 thisRead = din.read(out.privateData, privRead, 435 out.privateData.length - privRead); 436 offset += thisRead; 437 privRead += thisRead; 438 } 439 } 440 441 protected final void skipTo(long newOffset) throws IOException { 442 if (newOffset < offset) 443 throw new IllegalArgumentException("Can't skip backwards: (newOffset==" + newOffset + ") < (offset==" + offset + ")"); 444 long skip = newOffset - offset; 445 while (skip > 0) { 446 int actualSkip = din.skipBytes((int)skip); 447 offset += actualSkip; 448 skip -= actualSkip; 449 } 450 } 451 452 /** 453 * Does the grunt work of creating the base call alignment from 454 * the given lists of bases, offsets, and probabilities. 455 * Catches and "Can't happens" all exceptions. 456 */ 457 protected final void createAndSetBaseCallAlignment(List dna, List 458 offsets, List[] probs) { 459 try { 460 Map<Object, SymbolList> baseCalls = new SmallMap(9); 461 baseCalls.put(Chromatogram.DNA, 462 out.createImmutableSymbolList(DNATools.getDNA(), dna)); 463 baseCalls.put(Chromatogram.OFFSETS, 464 out.createImmutableSymbolList(IntegerAlphabet.getInstance(), offsets)); 465 baseCalls.put(PROB_NUC_A, 466 out.createImmutableSymbolList(getProbabilityAlphabet(), probs[0])); 467 baseCalls.put(PROB_NUC_C, 468 out.createImmutableSymbolList(getProbabilityAlphabet(), probs[1])); 469 baseCalls.put(PROB_NUC_G, 470 out.createImmutableSymbolList(getProbabilityAlphabet(), probs[2])); 471 baseCalls.put(PROB_NUC_T, 472 out.createImmutableSymbolList(getProbabilityAlphabet(), probs[3])); 473 baseCalls.put(PROB_SUBSTITUTION, 474 out.createImmutableSymbolList(getProbabilityAlphabet(), probs[4])); 475 baseCalls.put(PROB_OVERCALL, 476 out.createImmutableSymbolList(getProbabilityAlphabet(), probs[5])); 477 baseCalls.put(PROB_UNDERCALL, 478 out.createImmutableSymbolList(getProbabilityAlphabet(), probs[6])); 479 out.setBaseCallAlignment(out.createImmutableAlignment(baseCalls)); 480 } catch (IllegalSymbolException ise) { 481 throw new BioError(ise,"Can't happen unless the decoder is returning non-DNA symbols"); 482 } catch (IllegalAlphabetException iae) { 483 throw new BioError(iae,"Can't happen"); 484 } 485 } 486 487 private static class HeaderStruct { 488 public static final int HEADER_LENGTH = 128; 489 490 // SCF spec uses unsigned 32-bit ints, so we'll use longs for simplicity 491 public long magic_number; 492 public long samples; 493 public long samples_offset; 494 public long bases; 495 public long bases_left_clip; 496 public long bases_right_clip; 497 public long bases_offset; 498 public long comments_size; 499 public long comments_offset; 500 public char[] version; 501 public long sample_size; 502 public long code_set; 503 public long private_size; 504 public long private_offset; 505 public long[] spare; 506 507 private HeaderStruct() { 508 version = new char[4]; 509 spare = new long[18]; 510 } 511 512 public static HeaderStruct create(DataInputStream din, long 513 initOffset) throws IOException { 514 HeaderStruct hs = new HeaderStruct(); 515 if (initOffset > 4) { 516 throw new IllegalStateException("Can't skip more than four bytes and still have enough info to read header"); 517 } else if (initOffset == 0) { 518 hs.magic_number = 0xFFFFFFFF & din.readInt(); 519 } else { 520 hs.magic_number = 0; 521 // skip to the four-byte boundary 522 for (int i = 0 ; i < 4 - initOffset ; i++) 523 din.read(); 524 } 525 hs.samples = 0xFFFFFFFF & din.readInt(); 526 hs.samples_offset = 0xFFFFFFFF & din.readInt(); 527 hs.bases = 0xFFFFFFFF & din.readInt(); 528 hs.bases_left_clip = 0xFFFFFFFF & din.readInt(); 529 hs.bases_right_clip = 0xFFFFFFFF & din.readInt(); 530 hs.bases_offset = 0xFFFFFFFF & din.readInt(); 531 hs.comments_size = 0xFFFFFFFF & din.readInt(); 532 hs.comments_offset = 0xFFFFFFFF & din.readInt(); 533 hs.version[0] = (char) din.readByte(); 534 hs.version[1] = (char) din.readByte(); 535 hs.version[2] = (char) din.readByte(); 536 hs.version[3] = (char) din.readByte(); 537 hs.sample_size = 0xFFFFFFFF & din.readInt(); 538 hs.code_set = 0xFFFFFFFF & din.readInt(); 539 hs.private_size = 0xFFFFFFFF & din.readInt(); 540 hs.private_offset = 0xFFFFFFFF & din.readInt(); 541 for (int i = 0 ; i < hs.spare.length ; i++) { 542 hs.spare[i] = 0xFFFFFFFF & din.readInt(); 543 } 544 return hs; 545 } 546 } 547 } 548 549 private static class V3Parser extends Parser { 550 V3Parser(DataInputStream din, SCF out, 551 SCF.Parser.HeaderStruct header, long initOffset) 552 throws IOException, UnsupportedChromatogramFormatException { 553 super(din, out, header, initOffset); 554 } 555 556 protected void parseSamples() throws IOException, 557 UnsupportedChromatogramFormatException { 558 skipTo(header.samples_offset); 559 560 // load values from file 561 int count = (int)header.samples; 562 int maxValAllowed = 0; 563 if(header.sample_size == 1) { 564 maxValAllowed = BYTE_MAX_VALUE; 565 } else if(header.sample_size == 2) { 566 maxValAllowed = SHORT_MAX_VALUE; 567 } 568 569 int[][] trace = new int[4][count]; 570 int[] maxVal = new int[] { Integer.MIN_VALUE, 571 Integer.MIN_VALUE, 572 Integer.MIN_VALUE, 573 Integer.MIN_VALUE }; 574 for (int n = 0 ; n < 4 ; n++) 575 readSamplesInto(trace[n]); 576 577 // stored values are delta-delta values; reprocess into actual values 578 // algorithm cribbed from io_lib's delta_samples function in misc_scf.c 579 int[] p_sample1 = new int[] { 0, 0, 0, 0 }; 580 int[] p_sample2 = new int[] { 0, 0, 0, 0 }; 581 for (int n = 0 ; n < 4 ; n++) { 582 for (int i = 0 ; i < trace[n].length ; i++) { 583 //New version of the code which takes into account what the 584 //underlying value i.e. is it a byte or is it a short 585 //and slightly rejigged for speed and sanity sakes 586 p_sample1[n] += trace[n][i]; 587 if(p_sample1[n] >= maxValAllowed) p_sample1[n] = p_sample1[n] - 588 maxValAllowed; 589 p_sample2[n] = p_sample1[n] + p_sample2[n]; 590 if(p_sample2[n] >= maxValAllowed) p_sample2[n] = p_sample2[n] - 591 maxValAllowed; 592 trace[n][i] = p_sample2[n]; 593 594 maxVal[n] = Math.max(maxVal[n], trace[n][i]); 595 } 596 } 597 // set into output SCF chromat object 598 try { 599 out.setTrace(DNATools.a(), trace[0], maxVal[0]); 600 out.setTrace(DNATools.c(), trace[1], maxVal[1]); 601 out.setTrace(DNATools.g(), trace[2], maxVal[2]); 602 out.setTrace(DNATools.t(), trace[3], maxVal[3]); 603 } catch (IllegalSymbolException ise) { 604 throw new BioError(ise,"Can't happen"); 605 } 606 } 607 608 protected void readSamplesInto(int[] samps) throws IOException { 609 if (header.sample_size == 1) { 610 for (int i = 0 ; i < samps.length ; i++) { 611 samps[i] = din.readUnsignedByte(); 612 offset += 1; 613 } 614 } else if (header.sample_size == 2) { 615 for (int i = 0 ; i < samps.length ; i++) { 616 samps[i] = din.readUnsignedShort(); 617 offset += 2; 618 } 619 } 620 } 621 622 protected void parseBases() throws IOException, 623 UnsupportedChromatogramFormatException { 624 skipTo(header.bases_offset); 625 int count = (int) header.bases; 626 627 List<IntegerAlphabet.IntegerSymbol> offsets = new ArrayList<IntegerAlphabet.IntegerSymbol>(count); 628 @SuppressWarnings("unchecked") 629 List<IntegerAlphabet.IntegerSymbol>[] probs = new ArrayList[7]; 630 for (int i = 0; i < 7; i++) 631 probs[i] = new ArrayList<IntegerAlphabet.IntegerSymbol>(count); 632 List<Symbol> dna = new ArrayList<Symbol>(count); 633 634 long tmp; 635 try { 636 for (int i = 0 ; i < count ; i++) { 637 // read the first set of sequential data (the trace peak offsets) 638 tmp = 0xFFFFFFFF & din.readInt(); 639 offset += 4; 640 if (tmp > Integer.MAX_VALUE) 641 throw new 642 UnsupportedChromatogramFormatException( 643 "SCF contains a base with peak offset > " + Integer.MAX_VALUE); 644 offsets.add(IntegerAlphabet.getInstance().getSymbol((int) tmp)); 645 } 646 // read sets of probs for A, C, G, T 647 for (int j = 0 ; j < 4 ; j++) { 648 for (int i = 0 ; i < count ; i++) { 649 probs[j].add(getProbabilityAlphabet().getSymbol(din.readByte() & 0xFF)); 650 offset++; 651 } 652 } 653 // read bases 654 try { 655 for (int i = 0 ; i < count ; i++) { 656 dna.add(decoder.decode(din.readByte())); 657 offset++; 658 } 659 } catch (IllegalSymbolException ise) { 660 UnsupportedChromatogramFormatException ue = new UnsupportedChromatogramFormatException("Base call decoding failure"); 661 ue.initCause(ise); 662 throw ue; 663 } 664 // read 'spare' probs 665 for (int j = 4 ; j < 7 ; j++) { 666 for (int i = 0 ; i < count ; i++) { 667 probs[j].add(getProbabilityAlphabet().getSymbol(din.readByte() & 0xFF)); 668 offset++; 669 } 670 } 671 } catch (IllegalSymbolException ise) { 672 throw new BioError(ise,"Can't happen unless there's a misdefinition of getProbabilityAlphabet() or IntegerAlphabet"); 673 } 674 // create/set base call list 675 createAndSetBaseCallAlignment(dna, offsets, probs); 676 } 677 } // end SCFv3Parser 678 679 private static class V2Parser extends Parser { 680 V2Parser(DataInputStream din, SCF out, 681 SCF.Parser.HeaderStruct header, long initOffset) 682 throws IOException, UnsupportedChromatogramFormatException { 683 super(din, out, header, initOffset); 684 } 685 686 protected void parseSamples() throws IOException { 687 int count = (int) header.samples; 688 int[][] trace = new int[4][count]; 689 int[] maxVal = new int[] { Integer.MIN_VALUE, 690 Integer.MIN_VALUE, 691 Integer.MIN_VALUE, 692 Integer.MIN_VALUE }; 693 694 if (header.sample_size == 1) { 695 for (int i = 0 ; i < count ; i++) { 696 for (int n = 0 ; n < 4 ; n++) { 697 trace[n][i] = din.readUnsignedByte(); 698 maxVal[n] = Math.max(trace[n][i], maxVal[n]); 699 offset++; 700 } 701 } 702 } else if (header.sample_size == 2) { 703 for (int i = 0 ; i < count ; i++) { 704 for (int n = 0 ; n < 4 ; n++) { 705 trace[n][i] = din.readUnsignedShort(); 706 maxVal[n] = Math.max(trace[n][i], maxVal[n]); 707 offset += 2; 708 } 709 } 710 } 711 // set into output SCF chromat object 712 try { 713 out.setTrace(DNATools.a(), trace[0], maxVal[0]); 714 out.setTrace(DNATools.c(), trace[1], maxVal[1]); 715 out.setTrace(DNATools.g(), trace[2], maxVal[2]); 716 out.setTrace(DNATools.t(), trace[3], maxVal[3]); 717 } catch (IllegalSymbolException ise) { 718 throw new BioError(ise,"Can't happen"); 719 } 720 } 721 722 protected void parseBases() throws IOException, 723 UnsupportedChromatogramFormatException { 724 skipTo(header.bases_offset); 725 726 int count = (int) header.bases; 727 List[] probs = new ArrayList[7]; 728 for (int i = 0 ; i < probs.length ; i++) probs[i] = new 729 ArrayList(count); 730 List dna = new ArrayList(count); 731 List offsets = new ArrayList(count); 732 long tmp; 733 byte[] probTmp = new byte[7]; 734 for (int i = 0 ; i < count ; i++) { 735 // read the peak index 736 tmp = 0xFFFFFFFF & din.readInt(); 737 offset += 4; 738 if (tmp > Integer.MAX_VALUE) 739 throw new UnsupportedChromatogramFormatException( 740 "SCF contains a base with peak offset > " + Integer.MAX_VALUE); 741 offsets.add(IntegerAlphabet.getInstance().getSymbol((int) 742 tmp)); 743 744 // read the per-base probabilities 745 din.read(probTmp, 0, 4); 746 offset += 4; 747 // read the actual base called 748 try { 749 dna.add(decoder.decode(din.readByte())); 750 offset += 1; 751 } catch (IllegalSymbolException ise) { 752 UnsupportedChromatogramFormatException ue = new UnsupportedChromatogramFormatException( 753 "Base call decoding failure"); 754 ue.initCause(ise); 755 throw ue; 756 } 757 // read the spare probability fields 758 din.read(probTmp, 4, 3); 759 offset += 3; 760 try { 761 for (int p = 0 ; p < 7 ; p++) 762 probs[p].add(getProbabilityAlphabet().getSymbol(0xFF & probTmp[p])); 763 } catch (IllegalSymbolException ise) { 764 throw new BioError(ise,"Can't happen unless getProbabilityAlphabet() has been misdefined."); 765 } 766 } 767 createAndSetBaseCallAlignment(dna, offsets, probs); 768 } 769 } 770} 771 772