001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.seq;
023
024import java.util.HashMap;
025import java.util.HashSet;
026import java.util.Iterator;
027import java.util.Map;
028import java.util.Set;
029
030import org.biojava.bio.BioError;
031import org.biojava.bio.BioException;
032import org.biojava.bio.SimpleAnnotation;
033import org.biojava.bio.dist.Distribution;
034import org.biojava.bio.dist.PairDistribution;
035import org.biojava.bio.dist.SimpleDistribution;
036import org.biojava.bio.seq.impl.SimpleGappedSequence;
037import org.biojava.bio.seq.impl.SimpleSequenceFactory;
038import org.biojava.bio.seq.io.SymbolTokenization;
039import org.biojava.bio.symbol.Alphabet;
040import org.biojava.bio.symbol.AlphabetManager;
041import org.biojava.bio.symbol.AtomicSymbol;
042import org.biojava.bio.symbol.FiniteAlphabet;
043import org.biojava.bio.symbol.IllegalAlphabetException;
044import org.biojava.bio.symbol.IllegalSymbolException;
045import org.biojava.bio.symbol.ReversibleTranslationTable;
046import org.biojava.bio.symbol.SimpleReversibleTranslationTable;
047import org.biojava.bio.symbol.SimpleSymbolList;
048import org.biojava.bio.symbol.Symbol;
049import org.biojava.bio.symbol.SymbolList;
050import org.biojava.bio.symbol.SymbolListViews;
051import org.biojava.utils.ChangeVetoException;
052
053/**
054 * Useful functionality for processing DNA sequences.
055 *
056 * @author Matthew Pocock
057 * @author Keith James (docs)
058 * @author Mark Schreiber
059 * @author David Huen
060 * @author Richard Holland
061 */
062public final class DNATools {
063  private static final ReversibleTranslationTable complementTable;
064  static private final FiniteAlphabet dna;
065    private static final SymbolTokenization dnaTokens;
066
067  static private final AtomicSymbol a;
068  static private final AtomicSymbol g;
069  static private final AtomicSymbol c;
070  static private final AtomicSymbol t;
071  static private final Symbol n;
072  static private final Symbol m;
073  static private final Symbol r;
074  static private final Symbol w;
075  static private final Symbol s;
076  static private final Symbol y;
077  static private final Symbol k;
078  static private final Symbol v;
079  static private final Symbol h;
080  static private final Symbol d;
081  static private final Symbol b;
082  static private final SimpleReversibleTranslationTable transcriptionTable;
083
084
085  static private Map<Symbol, Symbol> symbolToComplement;
086
087  static {
088    try {
089      dna = (FiniteAlphabet) AlphabetManager.alphabetForName("DNA");
090      dnaTokens = dna.getTokenization("token");
091      SymbolList syms = new SimpleSymbolList(dnaTokens, "agctnmrwsykvhdb");
092      a = (AtomicSymbol) syms.symbolAt(1);
093      g = (AtomicSymbol) syms.symbolAt(2);
094      c = (AtomicSymbol) syms.symbolAt(3);
095      t = (AtomicSymbol) syms.symbolAt(4);
096      n = syms.symbolAt(5);
097      m = syms.symbolAt(6);
098      r = syms.symbolAt(7);
099      w = syms.symbolAt(8);
100      s = syms.symbolAt(9);
101      y = syms.symbolAt(10);
102      k = syms.symbolAt(11);
103      v = syms.symbolAt(12);
104      h = syms.symbolAt(13);
105      d = syms.symbolAt(14);
106      b = syms.symbolAt(15);
107
108      symbolToComplement = new HashMap<Symbol, Symbol>();
109
110      // add the gap symbol
111      Symbol gap = dna.getGapSymbol();
112      symbolToComplement.put(gap, gap);
113
114      // add all other ambiguity symbols
115      for(Iterator i = AlphabetManager.getAllSymbols(dna).iterator(); i.hasNext();) {
116          Symbol as = (Symbol) i.next();
117          FiniteAlphabet matches = (FiniteAlphabet) as.getMatches();
118          if (matches.size() > 1) {   // We've hit an ambiguous symbol.
119              Set<Symbol> l = new HashSet<Symbol>();
120              for(Iterator j = matches.iterator(); j.hasNext(); ) {
121                  l.add(complement((Symbol) j.next()));
122              }
123              symbolToComplement.put(as, dna.getAmbiguity(l));
124          }
125      }
126
127
128      complementTable = new DNAComplementTranslationTable();
129      
130      transcriptionTable = new SimpleReversibleTranslationTable(getDNA(), RNATools.getRNA());
131      transcriptionTable.setTranslation(a, RNATools.a());
132      transcriptionTable.setTranslation(c, RNATools.c());
133      transcriptionTable.setTranslation(g, RNATools.g());
134      transcriptionTable.setTranslation(t, RNATools.u());
135      
136    } catch (Throwable th) {
137      throw new BioError("Unable to initialize DNATools", th);
138    }
139  }
140
141  public static AtomicSymbol a() { return a; }
142  public static AtomicSymbol g() { return g; }
143  public static AtomicSymbol c() { return c; }
144  public static AtomicSymbol t() { return t; }
145  public static Symbol n() { return n; }
146  public static Symbol m() { return m; }
147  public static Symbol r() { return r; }
148  public static Symbol w() { return w; }
149  public static Symbol s() { return s; }
150  public static Symbol y() { return y; }
151  public static Symbol k() { return k; }
152  public static Symbol v() { return v; }
153  public static Symbol h() { return h; }
154  public static Symbol d() { return d; }
155  public static Symbol b() { return b; }
156
157  
158  private DNATools() {
159  }
160  
161  /**
162   * Return the DNA alphabet.
163   *
164   * @return a flyweight version of the DNA alphabet
165   */
166  public static FiniteAlphabet getDNA() {
167    return dna;
168  }
169
170  /**
171   * Gets the (DNA x DNA) Alphabet
172   * @return a flyweight version of the (DNA x DNA) alphabet
173   */
174  public static FiniteAlphabet getDNAxDNA(){
175    return (FiniteAlphabet)AlphabetManager.generateCrossProductAlphaFromName("(DNA x DNA)");
176  }
177
178  /**
179   * Gets the (DNA x DNA x DNA) Alphabet
180   * @return a flyweight version of the (DNA x DNA x DNA) alphabet
181   */
182  public static FiniteAlphabet getCodonAlphabet(){
183    return (FiniteAlphabet)AlphabetManager.generateCrossProductAlphaFromName("(DNA x DNA x DNA)");
184  }
185
186  /**
187   * Return a new DNA <span class="type">SymbolList</span> for
188   * <span class="arg">dna</span>.
189   *
190   * @param dna a <span class="type">String</span> to parse into DNA
191   * @return a <span class="type">SymbolList</span> created form
192   *         <span class="arg">dna</span>
193   * @throws IllegalSymbolException if <span class="arg">dna</span> contains
194   *         any non-DNA characters
195   */
196  public static SymbolList createDNA(String dna)
197  throws IllegalSymbolException {
198    SymbolTokenization p = null;
199    try {
200      p = getDNA().getTokenization("token");
201    } catch (BioException e) {
202      throw new BioError("Something has gone badly wrong with DNA", e);
203    }
204    return new SimpleSymbolList(p, dna);
205  }
206
207  /**
208   * Return a new DNA <span class="type">Sequence</span> for
209   * <span class="arg">dna</span>.
210   *
211   * @param dna a <span class="type">String</span> to parse into DNA
212   * @param name a <span class="type">String</span> to use as the name
213   * @return a <span class="type">Sequence</span> created form
214   *         <span class="arg">dna</span>
215   * @throws IllegalSymbolException if <span class="arg">dna</span> contains
216   *         any non-DNA characters
217   */
218  public static Sequence createDNASequence(String dna, String name)
219  throws IllegalSymbolException {
220    //should I be calling createGappedDNASequence?
221    if(dna.indexOf('-') != -1 || dna.indexOf('~') != -1){//there is a gap
222        return createGappedDNASequence(dna, name);
223    }
224    // No need to wrap in try-catch as throws IllegalSymbolException already.
225    //try {
226      return new SimpleSequenceFactory().createSequence(
227        createDNA(dna),
228        "", name, new SimpleAnnotation()
229      );
230    //} catch (BioException se) {
231    //  throw new BioError("Something has gone badly wrong with DNA", se);
232    //}
233  }
234
235
236    /** Get a new dna as a GappedSequence */
237    public static GappedSequence createGappedDNASequence(String dna, String name) throws IllegalSymbolException{
238        String dna1 = dna.replaceAll("-", "");
239        Sequence dnaSeq = createDNASequence(dna1, name);
240        GappedSequence dnaSeq1 = new SimpleGappedSequence(dnaSeq);
241        int pos = dna.indexOf('-', 0);
242        while(pos!=-1){
243            dnaSeq1.addGapInView(pos+1);
244            pos = dna.indexOf('-', pos+1);
245        }
246        return dnaSeq1;
247    }
248
249
250  /**
251   * Return an integer index for a symbol - compatible with
252   * <code>forIndex</code>.
253   *
254   * <p>
255   * The index for a symbol is stable accross virtual machines &
256   * invocations.
257   * </p>
258   *
259   * @param sym  the Symbol to index
260   * @return the index for that symbol
261   *
262   * @throws IllegalSymbolException if sym is not a member of the DNA
263   * alphabet
264   */
265  public static int index(Symbol sym) throws IllegalSymbolException {
266    if(sym == a) {
267      return 0;
268    } else if(sym == g) {
269      return 1;
270    } else if(sym == c) {
271      return 2;
272    } else if(sym == t) {
273      return 3;
274    }
275    getDNA().validate(sym);
276    throw new IllegalSymbolException("Really confused. Can't find index for " +
277                                      sym.getName());
278  }
279
280  /**
281   * Return the symbol for an index - compatible with <code>index</code>.
282   *
283   * <p>
284   * The index for a symbol is stable accross virtual machines &
285   * invocations.
286   * </p>
287   *
288   * @param index  the index to look up
289   * @return       the symbol at that index
290   *
291   * @throws IndexOutOfBoundsException if index is not between 0 and 3
292   */
293  static public Symbol forIndex(int index)
294  throws IndexOutOfBoundsException {
295    if(index == 0)
296      return a;
297    else if(index == 1)
298      return g;
299    else if(index == 2)
300      return c;
301    else if(index == 3)
302      return t;
303    else throw new IndexOutOfBoundsException("No symbol for index " + index);
304  }
305
306  /**
307   * Complement the symbol.
308   *
309   * @param sym  the symbol to complement
310   * @return a Symbol that is the complement of sym
311   * @throws IllegalSymbolException if sym is not a member of the DNA alphabet
312   */
313  static public Symbol complement(Symbol sym)
314  throws IllegalSymbolException {
315    if(sym == a) {
316      return t;
317    } else if(sym == g) {
318      return c;
319    } else if(sym == c) {
320      return g;
321    } else if(sym == t) {
322      return a;
323    }
324    Symbol s = symbolToComplement.get(sym);
325    if(s != null) {
326      return s;
327    } else {
328      getDNA().validate(sym);
329      throw new BioError(
330        "Really confused. Can't find symbol " +
331        sym.getName()
332      );
333    }
334  }
335
336  /**
337   * Retrieve the symbol for a symbol.
338   *
339   * @param token  the char to look up
340   * @return  the symbol for that char
341   * @throws IllegalSymbolException if the char is not a valid IUB dna code
342   */
343  static public Symbol forSymbol(char token)
344  throws IllegalSymbolException {
345    String t = String.valueOf(token);
346    SymbolTokenization toke;
347    try{
348      toke = getDNA().getTokenization("token");
349    }catch(BioException e){
350      throw new BioError("Cannot find the 'token' Tokenization for DNA!?", e);
351    }
352    return toke.parseToken(t);
353  }
354
355  /**
356   * Retrieve a complement view of list.
357   *
358   * @param list  the SymbolList to complement
359   * @return a SymbolList that is the complement
360   * @throws IllegalAlphabetException if list is not a complementable alphabet
361   */
362  public static SymbolList complement(SymbolList list)
363  throws IllegalAlphabetException {
364    return SymbolListViews.translate(list, complementTable());
365  }
366
367  /**
368   * Retrieve a reverse-complement view of list.
369   *
370   * @param list  the SymbolList to complement
371   * @return a SymbolList that is the complement
372   * @throws IllegalAlphabetException if list is not a complementable alphabet
373   */
374  public static SymbolList reverseComplement(SymbolList list)
375  throws IllegalAlphabetException {
376    return SymbolListViews.translate(SymbolListViews.reverse(list), complementTable());
377  }
378
379  /**
380   * Returns a SymbolList that is reverse complemented if the strand is
381   * negative, and the origninal one if it is not.
382   *
383   * @param list  the SymbolList to view
384   * @param strand the Strand to use
385   * @return the apropreate view of the SymbolList
386   * @throws IllegalAlphabetException if list is not a complementable alphabet
387   */
388  public static SymbolList flip(SymbolList list, StrandedFeature.Strand strand)
389  throws IllegalAlphabetException {
390    if(strand == StrandedFeature.NEGATIVE) {
391      return reverseComplement(list);
392    } else {
393      return list;
394    }
395  }
396
397  /**
398   * Get a translation table for complementing DNA symbols.
399   *
400   * @since 1.1
401   */
402
403  public static ReversibleTranslationTable complementTable() {
404    return complementTable;
405  }
406
407    /**
408     * Get a single-character token for a DNA symbol
409     *
410     * @throws IllegalSymbolException if <code>sym</code> is not a member of the DNA alphabet
411     */
412
413    public static char dnaToken(Symbol sym)
414        throws IllegalSymbolException
415    {
416        return dnaTokens.tokenizeSymbol(sym).charAt(0);
417    }
418
419  /**
420   * Sneaky class for complementing DNA bases.
421   */
422
423  private static class DNAComplementTranslationTable
424  implements ReversibleTranslationTable {
425    public Symbol translate(Symbol s)
426          throws IllegalSymbolException {
427            return DNATools.complement(s);
428          }
429
430    public Symbol untranslate(Symbol s)
431          throws IllegalSymbolException {
432            return DNATools.complement(s);
433          }
434
435          public Alphabet getSourceAlphabet() {
436            return DNATools.getDNA();
437          }
438
439          public Alphabet getTargetAlphabet() {
440            return DNATools.getDNA();
441          }
442  }
443
444  /**
445   * return a SimpleDistribution of specified GC content.
446   * @param fractionGC (G+C) content as a fraction.
447   */
448  public static Distribution getDNADistribution(double fractionGC)
449  {
450    try {
451        Distribution dist = new SimpleDistribution(DNATools.getDNA());
452        double gc = 0.5 * fractionGC;
453        double at = 0.5 * (1.0 - fractionGC);
454        dist.setWeight(DNATools.a(), at);
455        dist.setWeight(DNATools.t(), at);
456        dist.setWeight(DNATools.c(), gc);
457        dist.setWeight(DNATools.g(), gc);
458
459        return dist;
460    }
461        // these exceptions are just plain impossible!!!
462    catch (IllegalSymbolException ise) { return null; }
463    catch (ChangeVetoException cve) { return null; }
464  }
465
466  /**
467   * return a (DNA x DNA) cross-product Distribution with specified
468   * DNA contents in each component Alphabet.
469   * @param fractionGC0 (G+C) content of first sequence as a fraction.
470   * @param fractionGC1 (G+C) content of second sequence as a fraction.
471   */
472  public static Distribution getDNAxDNADistribution(
473    double fractionGC0,
474    double fractionGC1
475    )
476  {
477    return new PairDistribution(getDNADistribution(fractionGC0), getDNADistribution(fractionGC1));
478  }
479  
480  /**
481   * Converts a <code>SymbolList</code> from the DNA <code>Alphabet</code> to the
482   * RNA <code>Alphabet</code>.
483   * @param syms the <code>SymbolList</code> to convert to RNA
484   * @return a view on <code>syms</code> where <code>Symbols</code> have been converted to RNA.
485   * Most significantly t's are now u's. The 5' to 3' order of the Symbols is conserved.
486   * @since 1.4
487   * @throws IllegalAlphabetException if <code>syms</code> is not DNA.
488   */
489   public static SymbolList toRNA(SymbolList syms)throws IllegalAlphabetException{
490     return SymbolListViews.translate(syms, transcriptionTable);
491   }
492   
493   /**
494    * Transcribes DNA to RNA. The method more closely represents the biological reality
495    * than <code>toRNA(SymbolList syms)</code> does. The presented DNA <code>SymbolList</code> 
496    * is assumed to be the template strand in the 5' to 3' orientation. The resulting
497    * RNA is transcribed from this template effectively a reverse complement in the RNA alphabet.
498    * The method is equivalent to calling <code>reverseComplement()</code> and <code>toRNA()</code> in sequence.
499    * <p>If you are dealing with cDNA sequences that you want converted to RNA you would be 
500    * better off calling <code>toRNA(SymbolList syms)</code>
501    * @param syms the <code>SymbolList</code> to convert to RNA
502    * @return a view on <code>syms</code> where <code>Symbols</code> have been converted to RNA.
503    * @since 1.4
504    * @throws IllegalAlphabetException if <code>syms</code> is not DNA.
505    */
506   public static SymbolList transcribeToRNA(SymbolList syms) throws IllegalAlphabetException{
507     syms = reverseComplement(syms);
508     return toRNA(syms);
509   }
510   
511   /**
512    * Convenience method that directly converts a DNA sequence to RNA then to
513    * protein. The translated protein is from the +1 reading frame of the
514    * <code>SymbolList</code>. The whole <code>SymbolList</code> is translated
515    * although up to 2 DNA residues may be truncated if full codons cannot be 
516    * formed.
517    * @param syms the sequence to be translated.
518    * @return the translated protein sequence.
519    * @throws org.biojava.bio.symbol.IllegalAlphabetException if <code>syms</code>
520    * is not from the DNA alphabet.
521    * @since 1.5.1
522    */
523   public static SymbolList toProtein(final SymbolList syms) 
524           throws IllegalAlphabetException{
525       SymbolList symz = new SimpleSymbolList(syms);
526       symz = toRNA(symz);
527       //truncate to a length divisible by three.
528       symz = symz.subList(1, symz.length() - (symz.length() %3));
529       return RNATools.translate(symz);
530   }
531   
532   /**
533    * Convenience method to translate a region of a DNA sequence directly into
534    * protein. While the start and end can be specified if the length of the 
535    * specified region is not evenly divisible by three then the translated 
536    * region will be truncated until a full terminal codon can be formed.
537    * @param syms the DNA sequence to be translated.
538    * @param start the location to begin translation.
539    * @param end the end of the translated region.
540    * @return the translated protein sequence.
541    * @throws org.biojava.bio.symbol.IllegalAlphabetException if <code>syms
542    * </code> is not from the DNA alphabet.
543    * @since 1.5.1
544    */
545   public static SymbolList toProtein(final SymbolList syms, int start, int end)
546           throws IllegalAlphabetException{
547       SymbolList symz = new SimpleSymbolList(syms);
548       symz = symz.subList(start, end);
549       //return toProtein(syms);
550       //changed to fix bug 2521
551       return toProtein(symz);
552   }
553}
554
555
556
557
558
559