001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.program.phred;
023
024import java.io.BufferedReader;
025import java.io.IOException;
026import java.io.OutputStream;
027import java.util.ArrayList;
028import java.util.Iterator;
029import java.util.List;
030
031import org.biojava.bio.BioError;
032import org.biojava.bio.BioException;
033import org.biojava.bio.alignment.Alignment;
034import org.biojava.bio.dist.Distribution;
035import org.biojava.bio.dist.DistributionFactory;
036import org.biojava.bio.dist.DistributionTools;
037import org.biojava.bio.dist.DistributionTrainerContext;
038import org.biojava.bio.dist.SimpleDistributionTrainerContext;
039import org.biojava.bio.seq.DNATools;
040import org.biojava.bio.seq.Sequence;
041import org.biojava.bio.seq.SequenceIterator;
042import org.biojava.bio.seq.db.HashSequenceDB;
043import org.biojava.bio.seq.db.IDMaker;
044import org.biojava.bio.seq.db.SequenceDB;
045import org.biojava.bio.seq.impl.SimpleSequence;
046import org.biojava.bio.seq.io.FastaDescriptionLineParser;
047import org.biojava.bio.seq.io.FastaFormat;
048import org.biojava.bio.seq.io.SeqIOTools;
049import org.biojava.bio.seq.io.SequenceBuilderFactory;
050import org.biojava.bio.seq.io.SimpleSequenceBuilder;
051import org.biojava.bio.seq.io.StreamReader;
052import org.biojava.bio.seq.io.StreamWriter;
053import org.biojava.bio.seq.io.SymbolTokenization;
054import org.biojava.bio.symbol.Alphabet;
055import org.biojava.bio.symbol.AlphabetManager;
056import org.biojava.bio.symbol.BasisSymbol;
057import org.biojava.bio.symbol.FiniteAlphabet;
058import org.biojava.bio.symbol.IllegalAlphabetException;
059import org.biojava.bio.symbol.IllegalSymbolException;
060import org.biojava.bio.symbol.IntegerAlphabet;
061import org.biojava.bio.symbol.SimpleSymbolList;
062import org.biojava.bio.symbol.Symbol;
063import org.biojava.bio.symbol.SymbolList;
064import org.biojava.utils.AssertionFailure;
065import org.biojava.utils.ChangeVetoException;
066import org.biojava.utils.ListTools;
067
068/**
069 * <p>PhredTools contains static methods for working with phred
070 * quality data.</p>
071 *
072 * <p>Copyright (c) 2001</p>
073 * <p>Company:      AgResearch</p>
074 *
075 * @author Mark Schreiber
076 * @author Matthew Pocock
077 * @since 1.1
078 *
079 * Note that Phred is a copyright of CodonCode Corporation.
080 */
081
082public final class PhredTools {
083
084   static{
085     try {
086       List l = new ArrayList(2);
087       l.add(DNATools.getDNA());
088       l.add(IntegerAlphabet.getSubAlphabet(0,99));
089       AlphabetManager.getCrossProductAlphabet(l, "PHRED");
090     } catch (IllegalAlphabetException iae) {
091       throw new BioError( "Could not create the Phred alphabet", iae);
092     }
093   }
094
095  /**
096   * Retrieves the PHRED alphabet from the AlphabetManager. The Phred alphabet
097   * is a cross product of a subset of the IntegerAlphabet from 0...99 and the
098   * DNA alphabet. The Phred alphabet is finite.
099   *
100   * The Phred Alphabet contains 400 BasisSymbols named, for example, (guanine 47).
101   * The BasisSymbols can be fragmented into their component AtomicSymbols using
102   * the <code>getSymbols()</code> method of BasisSymbol.
103   */
104  public static final FiniteAlphabet getPhredAlphabet(){
105    return (FiniteAlphabet)AlphabetManager.alphabetForName("PHRED");
106  }
107
108  /**
109   * Retrives the DNA symbol component of the Phred BasisSymbol from the
110   * PHRED alphabet.
111   * @throws IllegalSymbolException if the provided symbol is not from the
112   * PHRED alphabet.
113   */
114  public static final Symbol dnaSymbolFromPhred(Symbol phredSym)
115    throws IllegalSymbolException{
116    //validate the symbol
117    getPhredAlphabet().validate(phredSym);
118    //get the DNA component of the Phred Symbol
119    List l = ((BasisSymbol)phredSym).getSymbols();
120    //the first symbol should be DNA
121    return (Symbol) l.get(0);
122  }
123
124  /**
125   * Retrives the IntegerSymbol component of the Phred BasisSymbol from the
126   * PHRED alphabet.
127   * @throws IllegalSymbolException if the provided symbol is not from the
128   * PHRED alphabet.
129   */
130  public static final IntegerAlphabet.IntegerSymbol integerSymbolFromPhred(Symbol phredSym)
131    throws IllegalSymbolException{
132    //validate the symbol
133    getPhredAlphabet().validate(phredSym);
134    //get the IntegerSymbol component of the Phred Symbol
135    List l = ((BasisSymbol)phredSym).getSymbols();
136    //the second symbol should be the IntegerSymbol
137    return (IntegerAlphabet.IntegerSymbol)(l.get(1));
138  }
139
140  /**
141   * Merges a Symbol List from the DNA alphabet with a SymbolList from the
142   * [0..99] subset of the IntegerAlphabet into a SymbolList from
143   * the PHRED alphabet.
144   * @throws IllegalAlphabetException if the alphabets are not of the required alphabets
145   * @throws IllegalArgumentException if the two SymbolLists are not of equal length.
146   * @throws IllegalSymbolException if a combination of Symbols cannot be represented by
147   * the PHRED alphabet.
148   */
149  public static SymbolList createPhred(SymbolList dna, SymbolList quality)
150    throws IllegalArgumentException, IllegalAlphabetException, IllegalSymbolException{
151    //perform initial checks
152    if(dna.length() != quality.length()){
153      throw new IllegalArgumentException("SymbolLists must be of equal length "+
154        dna.length()+" : "+quality.length());
155    }
156    if(dna.getAlphabet() != DNATools.getDNA()){
157      throw new IllegalAlphabetException(
158        "Expecting SymbolList 'dna' to use the DNA alphabet, uses "
159        +dna.getAlphabet().getName());
160    }
161    Alphabet subint = IntegerAlphabet.getSubAlphabet(0,99);
162    if(quality.getAlphabet() != subint && quality.getAlphabet() != IntegerAlphabet.getInstance()){
163      throw new IllegalAlphabetException(
164        "Expecting SymbolList quality to use the "+subint.getName()+" alphabet"+
165        "or IntegerAlphabet instead uses "+
166        quality.getAlphabet().getName());
167    }
168
169    //build the symbollist
170    SimpleSymbolList sl = new SimpleSymbolList(getPhredAlphabet());
171
172    for(int i = 1; i <= dna.length(); i++){
173      Symbol d = dna.symbolAt(i);
174      Symbol q = quality.symbolAt(i);
175      try{
176        sl.addSymbol(getPhredSymbol(d,q));
177      }catch(ChangeVetoException e){
178        throw new AssertionFailure(e);
179      }
180    }
181
182    return sl;
183  }
184
185  /**
186   * Creates a symbol from the PHRED alphabet by combining a Symbol from the
187   * DNA alphabet and a Symbol from the IntegerAlphabet (or one of its subsets).
188   * @throws IllegalSymbolException if there is no Symbol in the PHRED alphabet
189   * that represents the two arguments.
190   */
191  public static final Symbol getPhredSymbol(Symbol dna, Symbol integer)
192    throws IllegalSymbolException{
193    return getPhredAlphabet().getSymbol(new ListTools.Doublet(dna, integer));
194  }
195
196  /**
197   * Writes Phred quality data in a Fasta type format.
198   * @param db a bunch of PhredSequence objects
199   * @param qual the OutputStream to write the quality data to.
200   * @param seq the OutputStream to write the sequence data to.
201   * @since 1.2
202   */
203   public static void writePhredQuality(OutputStream qual, OutputStream seq, SequenceDB db)
204    throws IOException, BioException{
205      StreamWriter qualw = new StreamWriter(qual,new PhredFormat());
206      StreamWriter seqw = new StreamWriter(seq, new FastaFormat());
207      SequenceDB qualDB = new HashSequenceDB(IDMaker.byName);
208      //Get the quality SymbolLists and add them to a SeqDB
209      for(SequenceIterator i = db.sequenceIterator(); i.hasNext();){
210        Sequence p = i.nextSequence();
211        if(p instanceof PhredSequence){
212          PhredSequence ps = (PhredSequence)p;
213          SymbolList ql = ps.getQuality();
214          try{
215            qualDB.addSequence( new SimpleSequence(ql,p.getURN(),p.getName(),p.getAnnotation()));
216          }catch(ChangeVetoException cve){
217            throw new AssertionFailure("Cannot Add Quality Sequences to Database", cve);
218          }
219        }
220        else{
221          throw new BioException("Expecting PhredSequence, got " + p.getClass().getName());
222        }
223      }
224      qualw.writeStream(qualDB.sequenceIterator());
225      seqw.writeStream(db.sequenceIterator());//this works as sequence methods act on the underlying SimpleSequence
226   }
227
228  /**
229   * Constructs a StreamReader to read in Phred quality data in FASTA format.
230   * The data is converted into sequences consisting of Symbols from the IntegerAlphabet.
231   */
232  public static StreamReader readPhredQuality(BufferedReader br){
233    return new StreamReader(br,
234      new PhredFormat(),
235      getQualityParser(),
236      getFastaBuilderFactory());
237  }
238
239
240
241  /**
242   * Calls SeqIOTools.readFastaDNA(br), added here for convinience.
243   */
244  public static StreamReader readPhredSequence(BufferedReader br){
245    return (StreamReader)SeqIOTools.readFastaDNA(br);
246  }
247
248
249  private static SequenceBuilderFactory _fastaBuilderFactory;
250
251    /**
252     * Get a default SequenceBuilderFactory for handling FASTA
253     * files.
254     */
255  private static SequenceBuilderFactory getFastaBuilderFactory() {
256      if (_fastaBuilderFactory == null) {
257          _fastaBuilderFactory = new FastaDescriptionLineParser.Factory(SimpleSequenceBuilder.FACTORY);
258      }
259      return _fastaBuilderFactory;
260  }
261
262  /**
263   * returns the IntegerAlphabet parser
264   */
265  private static SymbolTokenization getQualityParser() {
266    return IntegerAlphabet.getInstance().getTokenization("token");
267  }
268
269  /**
270   * The quality value is related to the base call error probability
271   * by the formula  QV = - 10 * log_10( P_e )
272   * where P_e is the probability that the base call is an error.
273   * @return a <code>double</code> value, note that for most Phred scores this will be rounded
274   * to the nearest <code>int</code>
275   */
276   public static double qualityFromP(double probOfError){
277     return (-10 * (Math.log(probOfError)/Math.log(10.0)));
278   }
279
280   /**
281    * Calculates the probability of an error from the quality score via the formula
282    *  P_e = 10**(QV/-10)
283    */
284    public static double pFromQuality(double quality){
285      return Math.pow(10.0,(quality/-10.0));
286    }
287
288    /**
289     * Calculates the probability of an error from the quality score via the formula
290     *  P_e = 10**(QV/-10)
291     */
292    public static double pFromQuality(int quality){
293      return pFromQuality((double)quality);
294    }
295
296    /**
297     * Calculates the probability of an error from the quality score via the formula
298     *  P_e = 10**(QV/-10)
299     */
300    public static double pFromQuality(IntegerAlphabet.IntegerSymbol quality){
301      return pFromQuality(quality.intValue());
302    }
303
304    /**
305     * Converts a Phred sequence to an array of distributions. Essentially a fuzzy sequence
306     * Assumes that all of the non called bases are equiprobable
307     */
308    public static Distribution[] phredToDistArray(PhredSequence s){
309      Distribution[] pos = new Distribution[s.length()];
310      DistributionTrainerContext dtc = new SimpleDistributionTrainerContext();
311
312      for (int i = 0; i < s.length(); i++) {// for each symbol in the phred sequence
313        Symbol qual = s.getQualityAt(i + 1);
314        Symbol base = s.getDNAAt(i + 1);
315        double pBase = pFromQuality((IntegerAlphabet.IntegerSymbol)qual);
316        double pOthers = (1.0 - pBase)/3;
317
318        try{
319          pos[i] = DistributionFactory.DEFAULT.createDistribution(DNATools.getDNA());
320          dtc.registerDistribution(pos[i]);
321
322          for(Iterator iter = (DNATools.getDNA().iterator()); iter.hasNext();){
323            Symbol sym = (Symbol)iter.next();
324            if(sym.equals(base)) pos[i].setWeight(sym,pBase);
325            else pos[i].setWeight(sym,pOthers);
326          }
327
328          dtc.train();
329        }catch(IllegalAlphabetException iae){
330          throw new AssertionFailure("Sequence "+s.getName()+" contains an illegal alphabet", iae);
331        }catch(ChangeVetoException cve){
332          throw new AssertionFailure("The Distribution has become locked", cve);
333        }catch(IllegalSymbolException ise){
334          throw new AssertionFailure("Sequence "+s.getName()+" contains an illegal symbol", ise);
335        }
336      }
337      return pos;
338    }
339
340    /**
341     * converts an Alignment of PhredSequences to a Distribution[] where each position is the average
342     * distribution of the underlying column of the alignment.
343     * @throws ClassCastException if the sequences in the alignment are not instances of PhredSequence
344     */
345    public static Distribution[] phredAlignmentToDistArray(Alignment a){
346      List<String> labels = a.getLabels();
347      Distribution [] average = new Distribution[a.length()];
348
349      Distribution[][] matrix = new Distribution[labels.size()][];
350      for(int y = 0; y < a.length(); y++){// for eaxh position
351        for(Iterator<String> i = labels.iterator(); i.hasNext();){
352          SymbolList sl = a.symbolListForLabel(i.next());
353          matrix[y] = phredToDistArray((PhredSequence)sl);
354        }
355        average[y] = DistributionTools.average(matrix[y]);
356      }
357
358      return average;
359    }
360}