001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.seq;
023
024import java.io.InputStream;
025import java.util.HashMap;
026import java.util.HashSet;
027import java.util.Iterator;
028import java.util.Map;
029import java.util.Set;
030
031import javax.xml.parsers.DocumentBuilder;
032import javax.xml.parsers.DocumentBuilderFactory;
033
034import org.biojava.bio.BioError;
035import org.biojava.bio.BioException;
036import org.biojava.bio.SimpleAnnotation;
037import org.biojava.bio.seq.impl.SimpleSequenceFactory;
038import org.biojava.bio.seq.io.SymbolTokenization;
039import org.biojava.bio.symbol.AbstractReversibleTranslationTable;
040import org.biojava.bio.symbol.Alphabet;
041import org.biojava.bio.symbol.AlphabetManager;
042import org.biojava.bio.symbol.AtomicSymbol;
043import org.biojava.bio.symbol.FiniteAlphabet;
044import org.biojava.bio.symbol.IllegalAlphabetException;
045import org.biojava.bio.symbol.IllegalSymbolException;
046import org.biojava.bio.symbol.ManyToOneTranslationTable;
047import org.biojava.bio.symbol.ReversibleTranslationTable;
048import org.biojava.bio.symbol.SimpleGeneticCodeTable;
049import org.biojava.bio.symbol.SimpleReversibleTranslationTable;
050import org.biojava.bio.symbol.SimpleSymbolList;
051import org.biojava.bio.symbol.Symbol;
052import org.biojava.bio.symbol.SymbolList;
053import org.biojava.bio.symbol.SymbolListViews;
054import org.biojava.utils.ClassTools;
055import org.w3c.dom.Document;
056import org.w3c.dom.Element;
057import org.w3c.dom.Node;
058import org.w3c.dom.NodeList;
059import org.xml.sax.InputSource;
060
061/**
062 * Useful functionality for processing DNA and RNA sequences.
063 *
064 * @author Matthew Pocock
065 * @author Keith James (docs)
066 * @author Thomas Down
067 * @author Greg Cox
068 * @author Mark Schreiber
069 * @author David Huen (refactoring)
070 * @author gwaldon (update genetic code translation tables)
071 */
072public final class RNATools {
073  private static final ReversibleTranslationTable complementTable;
074  private static final SimpleReversibleTranslationTable transcriptionTable;
075  static private final FiniteAlphabet rna;
076  static private final Map geneticCodes;
077
078  static private final AtomicSymbol a;
079  static private final AtomicSymbol g;
080  static private final AtomicSymbol c;
081  static private final AtomicSymbol u;
082  static private final Symbol n;
083
084  static private Map symbolToComplement;
085
086  static {
087    try {
088      rna = (FiniteAlphabet) AlphabetManager.alphabetForName("RNA");
089
090      SymbolList syms = new SimpleSymbolList(rna.getTokenization("token"), "agcun");
091      a = (AtomicSymbol) syms.symbolAt(1);
092      g = (AtomicSymbol) syms.symbolAt(2);
093      c = (AtomicSymbol) syms.symbolAt(3);
094      u = (AtomicSymbol) syms.symbolAt(4);
095      n = syms.symbolAt(5);
096
097      symbolToComplement = new HashMap();
098
099      // add the gap symbol
100      Symbol gap = rna.getGapSymbol();
101      symbolToComplement.put(gap, gap);
102
103      // add all other ambiguity symbols
104      for(Iterator i = AlphabetManager.getAllSymbols(rna).iterator(); i.hasNext();) {
105          Symbol as = (Symbol) i.next();
106          FiniteAlphabet matches = (FiniteAlphabet) as.getMatches();
107          if (matches.size() > 1) {   // We've hit an ambiguous symbol.
108              Set l = new HashSet();
109              for(Iterator j = matches.iterator(); j.hasNext(); ) {
110                  l.add(complement((Symbol) j.next()));
111              }
112              symbolToComplement.put(as, rna.getAmbiguity(l));
113          }
114      }
115      complementTable = new RNAComplementTranslationTable();
116
117      transcriptionTable = new SimpleReversibleTranslationTable(DNATools.getDNA(), rna);
118      transcriptionTable.setTranslation(DNATools.a(), a);
119      transcriptionTable.setTranslation(DNATools.c(), c);
120      transcriptionTable.setTranslation(DNATools.g(), g);
121      transcriptionTable.setTranslation(DNATools.t(), u);
122
123      geneticCodes = new HashMap();
124      loadGeneticCodes();
125    } catch (Throwable t) {
126      throw new BioError("Unable to initialize RNATools", t);
127    }
128  }
129
130  public static AtomicSymbol a() { return a; }
131  public static AtomicSymbol g() { return g; }
132  public static AtomicSymbol c() { return c; }
133  public static AtomicSymbol u() { return u; }
134  public static Symbol n() { return n; }
135
/** Private so that this class, which only has static members, is never instantiated. */
private RNATools() {
  // nothing to initialise
}
138  
139  /**
140   * Return the RNA alphabet.
141   *
142   * @return a flyweight version of the RNA alphabet
143   */
144  public static FiniteAlphabet getRNA() {
145    return rna;
146  }
147
148  /**
149   * Gets the (RNA x RNA x RNA) Alphabet
150   * @return a flyweight version of the (RNA x RNA x RNA) alphabet
151   */
152  public static FiniteAlphabet getCodonAlphabet(){
153    return (FiniteAlphabet)AlphabetManager.generateCrossProductAlphaFromName("(RNA x RNA x RNA)");
154  }
155
156  /**
157   * Return a new RNA <span class="type">SymbolList</span> for
158   * <span class="arg">rna</span>.
159   *
160   * @param rna a <span class="type">String</span> to parse into RNA
161   * @return a <span class="type">SymbolList</span> created form
162   *         <span class="arg">rna</span>
163   * @throws IllegalSymbolException if  <span class="arg">rna</span> contains
164   *         any non-RNA characters
165   */
166  public static SymbolList createRNA(String rna)
167  throws IllegalSymbolException {
168    SymbolTokenization p = null;
169    try {
170      p = getRNA().getTokenization("token");
171    } catch (BioException e) {
172      throw new BioError("Something has gone badly wrong with RNA", e);
173    }
174    return new SimpleSymbolList(p, rna);
175
176  }
177
178  /**
179   * Return a new RNA <span class="type">Sequence</span> for
180   * <span class="arg">rna</span>.
181   *
182   * @param rna a <span class="type">String</span> to parse into RNA
183   * @param name a <span class="type">String</span> to use as the name
184   * @return a <span class="type">Sequence</span> created form
185   *         <span class="arg">dna</span>
186   * @throws IllegalSymbolException if <span class="arg">rna</span> contains
187   *         any non-DNA characters
188   */
189  public static Sequence createRNASequence(String rna, String name)
190  throws IllegalSymbolException {
191    try {
192      return new SimpleSequenceFactory().createSequence(
193        createRNA(rna),
194        "", name, new SimpleAnnotation()
195      );
196    } catch (BioException se) {
197      throw new BioError("Something has gone badly wrong with RNA", se);
198    }
199  }
200
201  /**
202   * Return an integer index for a symbol - compatible with forIndex.
203   * <p>
204   * The index for a symbol is stable across virtual machines & invocations.
205   *
206   * @param sym  the Symbol to index
207   * @return     the index for that symbol
208   * @throws IllegalSymbolException if sym is not a member of the DNA alphabet
209   */
210  public static int index(Symbol sym) throws IllegalSymbolException {
211    if(sym == a) {
212      return 0;
213    } else if(sym == g) {
214      return 1;
215    } else if(sym == c) {
216      return 2;
217    } else if(sym == u) {
218      return 3;
219    }
220    getRNA().validate(sym);
221    throw new IllegalSymbolException("Really confused. Can't find index for " +
222                                      sym.getName());
223  }
224
225  /**
226   * Return the symbol for an index - compatible with index.
227   * <p>
228   * The index for a symbol is stable accross virtual machines & invocations.
229   *
230   * @param index  the index to look up
231   * @return       the symbol at that index
232   * @throws IndexOutOfBoundsException if index is not between 0 and 3
233   */
234  static public Symbol forIndex(int index)
235  throws IndexOutOfBoundsException {
236    if(index == 0)
237      return a;
238    else if(index == 1)
239      return g;
240    else if(index == 2)
241      return c;
242    else if(index == 3)
243      return u;
244    else throw new IndexOutOfBoundsException("No symbol for index " + index);
245  }
246
247  /**
248   * Complement the symbol.
249   *
250   * @param sym  the symbol to complement
251   * @return a Symbol that is the complement of sym
252   * @throws IllegalSymbolException if sym is not a member of the RNA alphabet
253   */
254  static public Symbol complement(Symbol sym)
255  throws IllegalSymbolException {
256    if(sym == a) {
257      return u;
258    } else if(sym == g) {
259      return c;
260    } else if(sym == c) {
261      return g;
262    } else if(sym == u) {
263      return a;
264    }
265    Symbol s = (Symbol) symbolToComplement.get(sym);
266    if(s != null) {
267      return s;
268    } else {
269      getRNA().validate(sym);
270      throw new BioError(
271        "Really confused. Can't find symbol " +
272        sym.getName()
273      );
274    }
275  }
276
277  /**
278   * Retrieve the symbol for a symbol.
279   *
280   * @param token  the char to look up
281   * @return  the symbol for that char
282   * @throws IllegalSymbolException if the char is not a valid IUB code.
283   */
284  static public Symbol forSymbol(char token)
285  throws IllegalSymbolException {
286    String t = String.valueOf(token);
287    SymbolTokenization toke;
288
289    try{
290      toke = getRNA().getTokenization("token");
291    }catch(BioException e){
292      throw new BioError("Cannot find the 'token' Tokenization for RNA!?", e);
293    }
294    return toke.parseToken(t);
295  }
296
297  /**
298   * Retrieve a complement view of list.
299   *
300   * @param list  the SymbolList to complement
301   * @return a SymbolList that is the complement
302   * @throws IllegalAlphabetException if list is not a complementable alphabet
303   */
304  public static SymbolList complement(SymbolList list)
305  throws IllegalAlphabetException {
306    return SymbolListViews.translate(list, complementTable());
307  }
308
309  /**
310   * Retrieve a reverse-complement view of list.
311   *
312   * @param list  the SymbolList to complement
313   * @return a SymbolList that is the complement
314   * @throws IllegalAlphabetException if list is not a complementable alphabet
315   */
316  public static SymbolList reverseComplement(SymbolList list)
317  throws IllegalAlphabetException {
318    return SymbolListViews.translate(SymbolListViews.reverse(list), complementTable());
319  }
320
321  /**
322   * Transcribe DNA into RNA.
323   * @deprecated The naming of this method is confusing and inconsistent use either DNATools.toRNA(SymbolList list) or
324   * DNATools.transcribeToRNA(SymbolList list) depending on the desired behaivour.
325   * @param list the SymbolList to transcribe
326   * @return a SymbolList that is the transcribed view
327   * @throws IllegalAlphabetException if the list is not DNA
328   */
329   public static SymbolList transcribe(SymbolList list)
330   throws IllegalAlphabetException {
331     return SymbolListViews.translate(list, transcriptionTable());
332   }
333
334  /**
335   * Get a translation table for complementing DNA symbols.
336   *
337   * @since 1.1
338   */
339
340  public static ReversibleTranslationTable complementTable() {
341    return complementTable;
342  }
343
344  /**
345   * Get a translation table for converting DNA to RNA.
346   *
347   * @since 1.1
348   */
349  public static ReversibleTranslationTable transcriptionTable() {
350    return transcriptionTable;
351  }
352
353  /**
354   * Retrieve a TranslationTable by name. The valid names are:
355   *
356   * <ul>
357   * <li>"UNIVERSAL"</li>
358   * <li>"VERTEBRATE_MITOCHONDRIAL"</li>
359   * <li>"YEAST_MITOCHONDRIAL"</li>
360   * <li>"MOLD_MITOCHONDRIAL"</li>
361   * <li>"INVERTEBRATE_MITOCHONDRIAL"</li>
362   * <li>"CILIATE_NUCLEAR"</li>
363   * <li>"ECHINODERM_MITOCHONDRIAL"</li>
364   * <li>"EUPLOTID_NUCLEAR"</li>
365   * <li>"BACTERIAL"</li>
366   * <li>"ALTERNATIVE_YEAST_NUCLEAR"</li>
367   * <li>"ASCIDIAN_MITOCHONDRIAL"</li>
368   * <li>"FLATWORM_MITOCHONDRIAL"</li>
369   * <li>"BLEPHARISMA_MACRONUCLEAR"</li>
370   * <li>"CHLOROPHYCEAN_MITOCHONDRIAL"</li>
371   * <li>"TREMATODE_MITOCHONDRIAL"</li>
372   * <li>"SCENEDESMUS_MITOCHONDRIAL"</li>
373   * </ul>
374   *
375   * There are public static final fields in the TranslationTable
376   * interface which contain these values. One of these should be used
377   * as the argument for this method.
378   * <p>
379   * You can now get the reverse translation of the residue back to its
380   * (usually several) codons too.
381   *
382   * @since 1.1
383   */
384  public static ManyToOneTranslationTable getGeneticCode(String name) {
385    return (ManyToOneTranslationTable) geneticCodes.get(name);
386  }
387  
388  /**
389   * Retrieve a TranslationTable by number.
390   * These numbers correspond to the transl_table qualifier in the
391   * DDBJ/EMBL/GenBank Feature Table (Version 6.5  Apr 2006): transl_table
392   * defines the genetic code table used if other than the universal 
393   * genetic code table. Tables are described in appendix V,
394   * section 7.5.5:
395   *
396   * <ul>
397   * <li>" 1 - UNIVERSAL"</li>
398   * <li>" 2 - VERTEBRATE_MITOCHONDRIAL"</li>
399   * <li>" 3 - YEAST_MITOCHONDRIAL"</li>
400   * <li>" 4 - MOLD_MITOCHONDRIAL"</li>
401   * <li>" 5 - INVERTEBRATE_MITOCHONDRIAL"</li>
402   * <li>" 6 - CILIATE_NUCLEAR"</li>
403   * <li>" 9 - ECHINODERM_MITOCHONDRIAL"</li>
404   * <li>"10 - EUPLOTID_NUCLEAR"</li>
405   * <li>"11 - BACTERIAL"</li>
406   * <li>"12 - ALTERNATIVE_YEAST_NUCLEAR"</li>
407   * <li>"13 - ASCIDIAN_MITOCHONDRIAL"</li>
408   * <li>"14 - FLATWORM_MITOCHONDRIAL"</li>
409   * <li>"15 - BLEPHARISMA_MACRONUCLEAR"</li>
410   * <li>"16 - 2CHLOROPHYCEAN_MITOCHONDRIAL"</li>
411   * <li>"21 - TREMATODE_MITOCHONDRIAL"</li>
412   * <li>"23 - SCENEDESMUS_MITOCHONDRIAL"</li>
413   * </ul>
414   *
415   * @throws IllegalArgumentException if there is no table with that number.
416   * @since 1.5
417   */
418  public static ManyToOneTranslationTable getGeneticCode(int table_num) {
419      Set tables = getGeneticCodeNames();
420      Iterator it = tables.iterator();
421      while(it.hasNext()) {
422          String tableName = (String) it.next();
423          SimpleGeneticCodeTable table = (SimpleGeneticCodeTable) geneticCodes.get(tableName);
424          if(table.getTableNumber()==table_num)
425              return table;
426      }
427      throw new IllegalArgumentException("There is no genetic code table at that number");
428  }
429  
430  /**
431   * Retrieve a Set containing the name of each genetic code.
432   *
433   * @since 1.1
434   */
435  public static Set getGeneticCodeNames() {
436    return geneticCodes.keySet();
437  }
438
439  /**
440   * Translate RNA into protein (with termination symbols).  For
441   * compatibility with BioJava 1.1, this will also handle sequences
442   * which are already expressed in the (RNA x RNA x RNA) (codon)
443   * alphabet.
444   *
445   * @since 1.1
446   */
447  public static SymbolList translate(SymbolList syms)
448    throws IllegalAlphabetException
449  {
450      if (syms.getAlphabet() == getRNA()) {
451          syms = SymbolListViews.windowedSymbolList(syms, 3);
452      }
453      return SymbolListViews.translate(syms, getGeneticCode("UNIVERSAL"));
454  }
455
456  private static void loadGeneticCodes() {
457    try {
458      InputStream tablesStream = ClassTools.getClassLoader(RNATools.class).getResourceAsStream(
459        "org/biojava/bio/seq/TranslationTables.xml"
460      );
461      if(tablesStream == null ) {
462        throw new BioError("Couldn't locate TranslationTables.xml.");
463      }
464
465      InputSource is = new InputSource(tablesStream);
466      DocumentBuilder parser = DocumentBuilderFactory.newInstance().newDocumentBuilder();
467      Document doc = parser.parse(is);
468
469      NodeList children = doc.getDocumentElement().getChildNodes();
470      for(int i = 0; i < children.getLength(); i++) {
471        Node cnode = children.item(i);
472        if(! (cnode instanceof Element)) {
473          continue;
474        }
475
476        Element child = (Element) cnode;
477        String name = child.getNodeName();
478        if(name.equals("table")) {
479          String tableName = child.getAttribute("name");
480          String source = child.getAttribute("source");
481          String target = child.getAttribute("target");
482          FiniteAlphabet sourceA =
483            (FiniteAlphabet) AlphabetManager.alphabetForName(source);
484          FiniteAlphabet targetA =
485            (FiniteAlphabet) AlphabetManager.alphabetForName(target);
486          SymbolTokenization targetP = targetA.getTokenization("name");
487          SimpleGeneticCodeTable table = new SimpleGeneticCodeTable (
488            sourceA,
489            targetA
490          );
491
492          NodeList translates = child.getChildNodes();
493          for(int j = 0; j < translates.getLength(); j++) {
494            Node tn = translates.item(j);
495            if(tn instanceof Element) {
496              Element te = (Element) tn;
497              if(te.getTagName().equals("transl_table")) {
498                  int num = Integer.valueOf(te.getAttribute("value")).intValue();
499                  String description = te.getAttribute("description");
500                  table.setTableNumber(num);
501                  table.setDescription(description);
502                  continue;
503              }
504              String from = te.getAttribute("from");
505              String to = te.getAttribute("to");
506
507              //
508              // Not the most elegant solution, but I wanted this working
509              // quickly for 1.1.  It's been broken for ages.
510              //     -td 26/i/20001
511              //
512
513              SymbolList fromSymbols = RNATools.createRNA(from);
514              if (fromSymbols.length() != 3) {
515                  throw new BioError("`" + from + "' is not a valid codon");
516              }
517
518              // AtomicSymbol fromS = (AtomicSymbol) sourceP.parseToken(from);
519              AtomicSymbol fromS = (AtomicSymbol) sourceA.getSymbol(fromSymbols.toList());
520              AtomicSymbol toS   = (AtomicSymbol) targetP.parseToken(to);
521              table.setTranslation(fromS, toS);
522            }
523          }
524
525          geneticCodes.put(tableName, table);
526        }
527      }
528    } catch (Exception e) {
529      throw new BioError("Couldn't parse TranslationTables.xml", e);
530    }
531  }
532
533  /**
534   * Sneaky class for complementing RNA bases.
535   */
536
537  private static class RNAComplementTranslationTable
538  extends AbstractReversibleTranslationTable {
539    public Symbol doTranslate(Symbol s)
540          throws IllegalSymbolException {
541            return (Symbol) RNATools.complement(s);
542          }
543
544          public Symbol doUntranslate(Symbol s)
545          throws IllegalSymbolException {
546            return (Symbol) RNATools.complement(s);
547    }
548
549          public Alphabet getSourceAlphabet() {
550            return RNATools.getRNA();
551          }
552
553          public Alphabet getTargetAlphabet() {
554            return RNATools.getRNA();
555          }
556  }
557}
558