001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.program.phred; 023 024import java.io.BufferedReader; 025import java.io.IOException; 026import java.io.OutputStream; 027import java.util.ArrayList; 028import java.util.Iterator; 029import java.util.List; 030 031import org.biojava.bio.BioError; 032import org.biojava.bio.BioException; 033import org.biojava.bio.alignment.Alignment; 034import org.biojava.bio.dist.Distribution; 035import org.biojava.bio.dist.DistributionFactory; 036import org.biojava.bio.dist.DistributionTools; 037import org.biojava.bio.dist.DistributionTrainerContext; 038import org.biojava.bio.dist.SimpleDistributionTrainerContext; 039import org.biojava.bio.seq.DNATools; 040import org.biojava.bio.seq.Sequence; 041import org.biojava.bio.seq.SequenceIterator; 042import org.biojava.bio.seq.db.HashSequenceDB; 043import org.biojava.bio.seq.db.IDMaker; 044import org.biojava.bio.seq.db.SequenceDB; 045import org.biojava.bio.seq.impl.SimpleSequence; 046import org.biojava.bio.seq.io.FastaDescriptionLineParser; 047import org.biojava.bio.seq.io.FastaFormat; 048import org.biojava.bio.seq.io.SeqIOTools; 049import org.biojava.bio.seq.io.SequenceBuilderFactory; 050import org.biojava.bio.seq.io.SimpleSequenceBuilder; 051import org.biojava.bio.seq.io.StreamReader; 052import org.biojava.bio.seq.io.StreamWriter; 053import org.biojava.bio.seq.io.SymbolTokenization; 054import org.biojava.bio.symbol.Alphabet; 055import org.biojava.bio.symbol.AlphabetManager; 056import org.biojava.bio.symbol.BasisSymbol; 057import org.biojava.bio.symbol.FiniteAlphabet; 058import org.biojava.bio.symbol.IllegalAlphabetException; 059import org.biojava.bio.symbol.IllegalSymbolException; 060import org.biojava.bio.symbol.IntegerAlphabet; 061import org.biojava.bio.symbol.SimpleSymbolList; 062import org.biojava.bio.symbol.Symbol; 063import org.biojava.bio.symbol.SymbolList; 064import org.biojava.utils.AssertionFailure; 065import org.biojava.utils.ChangeVetoException; 066import org.biojava.utils.ListTools; 067 068/** 069 * <p>PhredTools contains static methods for working with phred 070 * quality data.</p> 071 * 072 * <p>Copyright (c) 2001</p> 073 * <p>Company: AgResearch</p> 074 * 075 * @author Mark Schreiber 076 * @author Matthew Pocock 077 * @since 1.1 078 * 079 * Note that Phred is a copyright of CodonCode Corporation. 080 */ 081 082public final class PhredTools { 083 084 static{ 085 try { 086 List l = new ArrayList(2); 087 l.add(DNATools.getDNA()); 088 l.add(IntegerAlphabet.getSubAlphabet(0,99)); 089 AlphabetManager.getCrossProductAlphabet(l, "PHRED"); 090 } catch (IllegalAlphabetException iae) { 091 throw new BioError( "Could not create the Phred alphabet", iae); 092 } 093 } 094 095 /** 096 * Retrieves the PHRED alphabet from the AlphabetManager. The Phred alphabet 097 * is a cross product of a subset of the IntegerAlphabet from 0...99 and the 098 * DNA alphabet. The Phred alphabet is finite. 099 * 100 * The Phred Alphabet contains 400 BasisSymbols named, for example, (guanine 47). 101 * The BasisSymbols can be fragmented into their component AtomicSymbols using 102 * the <code>getSymbols()</code> method of BasisSymbol. 103 */ 104 public static final FiniteAlphabet getPhredAlphabet(){ 105 return (FiniteAlphabet)AlphabetManager.alphabetForName("PHRED"); 106 } 107 108 /** 109 * Retrives the DNA symbol component of the Phred BasisSymbol from the 110 * PHRED alphabet. 111 * @throws IllegalSymbolException if the provided symbol is not from the 112 * PHRED alphabet. 113 */ 114 public static final Symbol dnaSymbolFromPhred(Symbol phredSym) 115 throws IllegalSymbolException{ 116 //validate the symbol 117 getPhredAlphabet().validate(phredSym); 118 //get the DNA component of the Phred Symbol 119 List l = ((BasisSymbol)phredSym).getSymbols(); 120 //the first symbol should be DNA 121 return (Symbol) l.get(0); 122 } 123 124 /** 125 * Retrives the IntegerSymbol component of the Phred BasisSymbol from the 126 * PHRED alphabet. 127 * @throws IllegalSymbolException if the provided symbol is not from the 128 * PHRED alphabet. 129 */ 130 public static final IntegerAlphabet.IntegerSymbol integerSymbolFromPhred(Symbol phredSym) 131 throws IllegalSymbolException{ 132 //validate the symbol 133 getPhredAlphabet().validate(phredSym); 134 //get the IntegerSymbol component of the Phred Symbol 135 List l = ((BasisSymbol)phredSym).getSymbols(); 136 //the second symbol should be the IntegerSymbol 137 return (IntegerAlphabet.IntegerSymbol)(l.get(1)); 138 } 139 140 /** 141 * Merges a Symbol List from the DNA alphabet with a SymbolList from the 142 * [0..99] subset of the IntegerAlphabet into a SymbolList from 143 * the PHRED alphabet. 144 * @throws IllegalAlphabetException if the alphabets are not of the required alphabets 145 * @throws IllegalArgumentException if the two SymbolLists are not of equal length. 146 * @throws IllegalSymbolException if a combination of Symbols cannot be represented by 147 * the PHRED alphabet. 148 */ 149 public static SymbolList createPhred(SymbolList dna, SymbolList quality) 150 throws IllegalArgumentException, IllegalAlphabetException, IllegalSymbolException{ 151 //perform initial checks 152 if(dna.length() != quality.length()){ 153 throw new IllegalArgumentException("SymbolLists must be of equal length "+ 154 dna.length()+" : "+quality.length()); 155 } 156 if(dna.getAlphabet() != DNATools.getDNA()){ 157 throw new IllegalAlphabetException( 158 "Expecting SymbolList 'dna' to use the DNA alphabet, uses " 159 +dna.getAlphabet().getName()); 160 } 161 Alphabet subint = IntegerAlphabet.getSubAlphabet(0,99); 162 if(quality.getAlphabet() != subint && quality.getAlphabet() != IntegerAlphabet.getInstance()){ 163 throw new IllegalAlphabetException( 164 "Expecting SymbolList quality to use the "+subint.getName()+" alphabet"+ 165 "or IntegerAlphabet instead uses "+ 166 quality.getAlphabet().getName()); 167 } 168 169 //build the symbollist 170 SimpleSymbolList sl = new SimpleSymbolList(getPhredAlphabet()); 171 172 for(int i = 1; i <= dna.length(); i++){ 173 Symbol d = dna.symbolAt(i); 174 Symbol q = quality.symbolAt(i); 175 try{ 176 sl.addSymbol(getPhredSymbol(d,q)); 177 }catch(ChangeVetoException e){ 178 throw new AssertionFailure(e); 179 } 180 } 181 182 return sl; 183 } 184 185 /** 186 * Creates a symbol from the PHRED alphabet by combining a Symbol from the 187 * DNA alphabet and a Symbol from the IntegerAlphabet (or one of its subsets). 188 * @throws IllegalSymbolException if there is no Symbol in the PHRED alphabet 189 * that represents the two arguments. 190 */ 191 public static final Symbol getPhredSymbol(Symbol dna, Symbol integer) 192 throws IllegalSymbolException{ 193 return getPhredAlphabet().getSymbol(new ListTools.Doublet(dna, integer)); 194 } 195 196 /** 197 * Writes Phred quality data in a Fasta type format. 198 * @param db a bunch of PhredSequence objects 199 * @param qual the OutputStream to write the quality data to. 200 * @param seq the OutputStream to write the sequence data to. 201 * @since 1.2 202 */ 203 public static void writePhredQuality(OutputStream qual, OutputStream seq, SequenceDB db) 204 throws IOException, BioException{ 205 StreamWriter qualw = new StreamWriter(qual,new PhredFormat()); 206 StreamWriter seqw = new StreamWriter(seq, new FastaFormat()); 207 SequenceDB qualDB = new HashSequenceDB(IDMaker.byName); 208 //Get the quality SymbolLists and add them to a SeqDB 209 for(SequenceIterator i = db.sequenceIterator(); i.hasNext();){ 210 Sequence p = i.nextSequence(); 211 if(p instanceof PhredSequence){ 212 PhredSequence ps = (PhredSequence)p; 213 SymbolList ql = ps.getQuality(); 214 try{ 215 qualDB.addSequence( new SimpleSequence(ql,p.getURN(),p.getName(),p.getAnnotation())); 216 }catch(ChangeVetoException cve){ 217 throw new AssertionFailure("Cannot Add Quality Sequences to Database", cve); 218 } 219 } 220 else{ 221 throw new BioException("Expecting PhredSequence, got " + p.getClass().getName()); 222 } 223 } 224 qualw.writeStream(qualDB.sequenceIterator()); 225 seqw.writeStream(db.sequenceIterator());//this works as sequence methods act on the underlying SimpleSequence 226 } 227 228 /** 229 * Constructs a StreamReader to read in Phred quality data in FASTA format. 230 * The data is converted into sequences consisting of Symbols from the IntegerAlphabet. 231 */ 232 public static StreamReader readPhredQuality(BufferedReader br){ 233 return new StreamReader(br, 234 new PhredFormat(), 235 getQualityParser(), 236 getFastaBuilderFactory()); 237 } 238 239 240 241 /** 242 * Calls SeqIOTools.readFastaDNA(br), added here for convinience. 243 */ 244 public static StreamReader readPhredSequence(BufferedReader br){ 245 return (StreamReader)SeqIOTools.readFastaDNA(br); 246 } 247 248 249 private static SequenceBuilderFactory _fastaBuilderFactory; 250 251 /** 252 * Get a default SequenceBuilderFactory for handling FASTA 253 * files. 254 */ 255 private static SequenceBuilderFactory getFastaBuilderFactory() { 256 if (_fastaBuilderFactory == null) { 257 _fastaBuilderFactory = new FastaDescriptionLineParser.Factory(SimpleSequenceBuilder.FACTORY); 258 } 259 return _fastaBuilderFactory; 260 } 261 262 /** 263 * returns the IntegerAlphabet parser 264 */ 265 private static SymbolTokenization getQualityParser() { 266 return IntegerAlphabet.getInstance().getTokenization("token"); 267 } 268 269 /** 270 * The quality value is related to the base call error probability 271 * by the formula QV = - 10 * log_10( P_e ) 272 * where P_e is the probability that the base call is an error. 273 * @return a <code>double</code> value, note that for most Phred scores this will be rounded 274 * to the nearest <code>int</code> 275 */ 276 public static double qualityFromP(double probOfError){ 277 return (-10 * (Math.log(probOfError)/Math.log(10.0))); 278 } 279 280 /** 281 * Calculates the probability of an error from the quality score via the formula 282 * P_e = 10**(QV/-10) 283 */ 284 public static double pFromQuality(double quality){ 285 return Math.pow(10.0,(quality/-10.0)); 286 } 287 288 /** 289 * Calculates the probability of an error from the quality score via the formula 290 * P_e = 10**(QV/-10) 291 */ 292 public static double pFromQuality(int quality){ 293 return pFromQuality((double)quality); 294 } 295 296 /** 297 * Calculates the probability of an error from the quality score via the formula 298 * P_e = 10**(QV/-10) 299 */ 300 public static double pFromQuality(IntegerAlphabet.IntegerSymbol quality){ 301 return pFromQuality(quality.intValue()); 302 } 303 304 /** 305 * Converts a Phred sequence to an array of distributions. Essentially a fuzzy sequence 306 * Assumes that all of the non called bases are equiprobable 307 */ 308 public static Distribution[] phredToDistArray(PhredSequence s){ 309 Distribution[] pos = new Distribution[s.length()]; 310 DistributionTrainerContext dtc = new SimpleDistributionTrainerContext(); 311 312 for (int i = 0; i < s.length(); i++) {// for each symbol in the phred sequence 313 Symbol qual = s.getQualityAt(i + 1); 314 Symbol base = s.getDNAAt(i + 1); 315 double pBase = pFromQuality((IntegerAlphabet.IntegerSymbol)qual); 316 double pOthers = (1.0 - pBase)/3; 317 318 try{ 319 pos[i] = DistributionFactory.DEFAULT.createDistribution(DNATools.getDNA()); 320 dtc.registerDistribution(pos[i]); 321 322 for(Iterator iter = (DNATools.getDNA().iterator()); iter.hasNext();){ 323 Symbol sym = (Symbol)iter.next(); 324 if(sym.equals(base)) pos[i].setWeight(sym,pBase); 325 else pos[i].setWeight(sym,pOthers); 326 } 327 328 dtc.train(); 329 }catch(IllegalAlphabetException iae){ 330 throw new AssertionFailure("Sequence "+s.getName()+" contains an illegal alphabet", iae); 331 }catch(ChangeVetoException cve){ 332 throw new AssertionFailure("The Distribution has become locked", cve); 333 }catch(IllegalSymbolException ise){ 334 throw new AssertionFailure("Sequence "+s.getName()+" contains an illegal symbol", ise); 335 } 336 } 337 return pos; 338 } 339 340 /** 341 * converts an Alignment of PhredSequences to a Distribution[] where each position is the average 342 * distribution of the underlying column of the alignment. 343 * @throws ClassCastException if the sequences in the alignment are not instances of PhredSequence 344 */ 345 public static Distribution[] phredAlignmentToDistArray(Alignment a){ 346 List<String> labels = a.getLabels(); 347 Distribution [] average = new Distribution[a.length()]; 348 349 Distribution[][] matrix = new Distribution[labels.size()][]; 350 for(int y = 0; y < a.length(); y++){// for eaxh position 351 for(Iterator<String> i = labels.iterator(); i.hasNext();){ 352 SymbolList sl = a.symbolListForLabel(i.next()); 353 matrix[y] = phredToDistArray((PhredSequence)sl); 354 } 355 average[y] = DistributionTools.average(matrix[y]); 356 } 357 358 return average; 359 } 360}