/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */

package org.biojava.bio.seq;

import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.biojava.bio.BioError;
import org.biojava.bio.BioException;
import org.biojava.bio.SimpleAnnotation;
import org.biojava.bio.seq.impl.SimpleSequenceFactory;
import org.biojava.bio.seq.io.SymbolTokenization;
import org.biojava.bio.symbol.AbstractReversibleTranslationTable;
import org.biojava.bio.symbol.Alphabet;
import org.biojava.bio.symbol.AlphabetManager;
import org.biojava.bio.symbol.AtomicSymbol;
import org.biojava.bio.symbol.FiniteAlphabet;
import org.biojava.bio.symbol.IllegalAlphabetException;
import org.biojava.bio.symbol.IllegalSymbolException;
import org.biojava.bio.symbol.ManyToOneTranslationTable;
import org.biojava.bio.symbol.ReversibleTranslationTable;
import org.biojava.bio.symbol.SimpleGeneticCodeTable;
import org.biojava.bio.symbol.SimpleReversibleTranslationTable;
import org.biojava.bio.symbol.SimpleSymbolList;
import org.biojava.bio.symbol.Symbol;
import org.biojava.bio.symbol.SymbolList;
import org.biojava.bio.symbol.SymbolListViews;
import org.biojava.utils.ClassTools;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

/**
 * Useful functionality for processing DNA and RNA sequences.
 * <p>
 * All members are static. The class initializer looks up the "RNA"
 * alphabet, builds the complement and transcription tables and loads
 * the genetic code tables from <code>TranslationTables.xml</code>; any
 * failure during that set-up is rethrown as a <code>BioError</code>.
 *
 * @author Matthew Pocock
 * @author Keith James (docs)
 * @author Thomas Down
 * @author Greg Cox
 * @author Mark Schreiber
 * @author David Huen (refactoring)
 * @author gwaldon (update genetic code translation tables)
 */
public final class RNATools {
  private static final ReversibleTranslationTable complementTable;
  private static final SimpleReversibleTranslationTable transcriptionTable;
  private static final FiniteAlphabet rna;

  /** Genetic code tables keyed by table name; filled by loadGeneticCodes(). */
  private static final Map geneticCodes;

  private static final AtomicSymbol a;
  private static final AtomicSymbol g;
  private static final AtomicSymbol c;
  private static final AtomicSymbol u;
  private static final Symbol n;

  /**
   * Complements of the gap symbol and of every ambiguity symbol. The four
   * atomic symbols are not stored here; complement(Symbol) handles them
   * directly.
   */
  private static final Map symbolToComplement;

  static {
    try {
      rna = (FiniteAlphabet) AlphabetManager.alphabetForName("RNA");

      SymbolList syms = new SimpleSymbolList(rna.getTokenization("token"), "agcun");
      a = (AtomicSymbol) syms.symbolAt(1);
      g = (AtomicSymbol) syms.symbolAt(2);
      c = (AtomicSymbol) syms.symbolAt(3);
      u = (AtomicSymbol) syms.symbolAt(4);
      n = syms.symbolAt(5);

      symbolToComplement = new HashMap();

      // the gap symbol is its own complement
      Symbol gap = rna.getGapSymbol();
      symbolToComplement.put(gap, gap);

      // An ambiguity symbol's complement is the ambiguity symbol over the
      // complements of everything it matches.
      for (Iterator i = AlphabetManager.getAllSymbols(rna).iterator(); i.hasNext();) {
        Symbol as = (Symbol) i.next();
        FiniteAlphabet matches = (FiniteAlphabet) as.getMatches();
        if (matches.size() > 1) {   // We've hit an ambiguous symbol.
          Set l = new HashSet();
          for (Iterator j = matches.iterator(); j.hasNext();) {
            l.add(complement((Symbol) j.next()));
          }
          symbolToComplement.put(as, rna.getAmbiguity(l));
        }
      }
      complementTable = new RNAComplementTranslationTable();

      transcriptionTable = new SimpleReversibleTranslationTable(DNATools.getDNA(), rna);
      transcriptionTable.setTranslation(DNATools.a(), a);
      transcriptionTable.setTranslation(DNATools.c(), c);
      transcriptionTable.setTranslation(DNATools.g(), g);
      transcriptionTable.setTranslation(DNATools.t(), u);

      geneticCodes = new HashMap();
      loadGeneticCodes();
    } catch (Throwable t) {
      throw new BioError("Unable to initialize RNATools", t);
    }
  }

  /** @return the symbol parsed from the token 'a' of the RNA alphabet */
  public static AtomicSymbol a() { return a; }

  /** @return the symbol parsed from the token 'g' of the RNA alphabet */
  public static AtomicSymbol g() { return g; }

  /** @return the symbol parsed from the token 'c' of the RNA alphabet */
  public static AtomicSymbol c() { return c; }

  /** @return the symbol parsed from the token 'u' of the RNA alphabet */
  public static AtomicSymbol u() { return u; }

  /** @return the symbol parsed from the token 'n' of the RNA alphabet */
  public static Symbol n() { return n; }

  /** Not instantiable: this class only has static members. */
  private RNATools() {
  }

  /**
   * Return the RNA alphabet.
   *
   * @return a flyweight version of the RNA alphabet
   */
  public static FiniteAlphabet getRNA() {
    return rna;
  }

  /**
   * Gets the (RNA x RNA x RNA) Alphabet
   *
   * @return a flyweight version of the (RNA x RNA x RNA) alphabet
   */
  public static FiniteAlphabet getCodonAlphabet() {
    return (FiniteAlphabet) AlphabetManager.generateCrossProductAlphaFromName("(RNA x RNA x RNA)");
  }

  /**
   * Return a new RNA <span class="type">SymbolList</span> for
   * <span class="arg">rna</span>.
   *
   * @param rna a <span class="type">String</span> to parse into RNA
   * @return a <span class="type">SymbolList</span> created from
   * <span class="arg">rna</span>
   * @throws IllegalSymbolException if <span class="arg">rna</span> contains
   * any non-RNA characters
   */
  public static SymbolList createRNA(String rna)
  throws IllegalSymbolException {
    SymbolTokenization p;
    try {
      p = getRNA().getTokenization("token");
    } catch (BioException e) {
      // The "token" tokenization was already used by the class initializer,
      // so failing to obtain it here is treated as an internal error.
      throw new BioError("Something has gone badly wrong with RNA", e);
    }
    return new SimpleSymbolList(p, rna);
  }

  /**
   * Return a new RNA <span class="type">Sequence</span> for
   * <span class="arg">rna</span>.
   *
   * @param rna a <span class="type">String</span> to parse into RNA
   * @param name a <span class="type">String</span> to use as the name
   * @return a <span class="type">Sequence</span> created from
   * <span class="arg">rna</span>
   * @throws IllegalSymbolException if <span class="arg">rna</span> contains
   * any non-RNA characters
   */
  public static Sequence createRNASequence(String rna, String name)
  throws IllegalSymbolException {
    try {
      // NOTE(review): the empty string is presumably the sequence URN -
      // confirm against SimpleSequenceFactory.createSequence.
      return new SimpleSequenceFactory().createSequence(
        createRNA(rna),
        "", name, new SimpleAnnotation()
      );
    } catch (BioException se) {
      throw new BioError("Something has gone badly wrong with RNA", se);
    }
  }

  /**
   * Return an integer index for a symbol - compatible with forIndex.
   * <p>
   * The index for a symbol is stable across virtual machines and
   * invocations: a is 0, g is 1, c is 2 and u is 3.
   *
   * @param sym  the Symbol to index
   * @return     the index for that symbol
   * @throws IllegalSymbolException if sym is not one of the four atomic
   *         symbols of the RNA alphabet
   */
  public static int index(Symbol sym) throws IllegalSymbolException {
    if (sym == a) {
      return 0;
    } else if (sym == g) {
      return 1;
    } else if (sym == c) {
      return 2;
    } else if (sym == u) {
      return 3;
    }
    // Let the alphabet report symbols that do not belong to RNA at all;
    // anything that passes validation (e.g. an ambiguity symbol) still has
    // no index.
    getRNA().validate(sym);
    throw new IllegalSymbolException("Really confused. Can't find index for " +
                                      sym.getName());
  }

  /**
   * Return the symbol for an index - compatible with index.
   * <p>
   * The index for a symbol is stable across virtual machines and
   * invocations.
   *
   * @param index  the index to look up
   * @return       the symbol at that index
   * @throws IndexOutOfBoundsException if index is not between 0 and 3
   */
  public static Symbol forIndex(int index)
  throws IndexOutOfBoundsException {
    if (index == 0) {
      return a;
    } else if (index == 1) {
      return g;
    } else if (index == 2) {
      return c;
    } else if (index == 3) {
      return u;
    }
    throw new IndexOutOfBoundsException("No symbol for index " + index);
  }

  /**
   * Complement the symbol.
   * <p>
   * a and u are swapped, as are g and c. The gap symbol and ambiguity
   * symbols are looked up in a table built when this class is initialized.
   *
   * @param sym  the symbol to complement
   * @return a Symbol that is the complement of sym
   * @throws IllegalSymbolException if sym is not a member of the RNA alphabet
   */
  public static Symbol complement(Symbol sym)
  throws IllegalSymbolException {
    if (sym == a) {
      return u;
    } else if (sym == g) {
      return c;
    } else if (sym == c) {
      return g;
    } else if (sym == u) {
      return a;
    }
    Symbol s = (Symbol) symbolToComplement.get(sym);
    if (s != null) {
      return s;
    }
    // Unknown symbol: validation is expected to reject anything outside the
    // RNA alphabet. A symbol that validates but has no complement entry is
    // an internal inconsistency.
    getRNA().validate(sym);
    throw new BioError(
      "Really confused. Can't find symbol " +
      sym.getName()
    );
  }

  /**
   * Retrieve the symbol for a token character.
   *
   * @param token  the char to look up
   * @return  the symbol for that char
   * @throws IllegalSymbolException if the char is not a valid IUB code.
   */
  public static Symbol forSymbol(char token)
  throws IllegalSymbolException {
    String t = String.valueOf(token);
    SymbolTokenization toke;

    try {
      toke = getRNA().getTokenization("token");
    } catch (BioException e) {
      throw new BioError("Cannot find the 'token' Tokenization for RNA!?", e);
    }
    return toke.parseToken(t);
  }

  /**
   * Retrieve a complement view of list.
   *
   * @param list  the SymbolList to complement
   * @return a SymbolList that is the complement
   * @throws IllegalAlphabetException if list is not a complementable alphabet
   */
  public static SymbolList complement(SymbolList list)
  throws IllegalAlphabetException {
    return SymbolListViews.translate(list, complementTable());
  }

  /**
   * Retrieve a reverse-complement view of list.
   *
   * @param list  the SymbolList to reverse-complement
   * @return a SymbolList that is the reverse complement
   * @throws IllegalAlphabetException if list is not a complementable alphabet
   */
  public static SymbolList reverseComplement(SymbolList list)
  throws IllegalAlphabetException {
    return SymbolListViews.translate(SymbolListViews.reverse(list), complementTable());
  }

  /**
   * Transcribe DNA into RNA.
   *
   * @deprecated The naming of this method is confusing and inconsistent;
   * use either DNATools.toRNA(SymbolList list) or
   * DNATools.transcribeToRNA(SymbolList list) depending on the desired
   * behaviour.
   * @param list the SymbolList to transcribe
   * @return a SymbolList that is the transcribed view
   * @throws IllegalAlphabetException if the list is not DNA
   */
  public static SymbolList transcribe(SymbolList list)
  throws IllegalAlphabetException {
    return SymbolListViews.translate(list, transcriptionTable());
  }

  /**
   * Get a translation table for complementing RNA symbols.
   *
   * @return the shared complement table; both its source and target
   *         alphabets are the RNA alphabet
   * @since 1.1
   */
  public static ReversibleTranslationTable complementTable() {
    return complementTable;
  }

  /**
   * Get a translation table for converting DNA to RNA.
   *
   * @return the shared table mapping DNA a, c, g, t to RNA a, c, g, u
   * @since 1.1
   */
  public static ReversibleTranslationTable transcriptionTable() {
    return transcriptionTable;
  }

  /**
   * Retrieve a TranslationTable by name. The valid names are:
   *
   * <ul>
   * <li>"UNIVERSAL"</li>
   * <li>"VERTEBRATE_MITOCHONDRIAL"</li>
   * <li>"YEAST_MITOCHONDRIAL"</li>
   * <li>"MOLD_MITOCHONDRIAL"</li>
   * <li>"INVERTEBRATE_MITOCHONDRIAL"</li>
   * <li>"CILIATE_NUCLEAR"</li>
   * <li>"ECHINODERM_MITOCHONDRIAL"</li>
   * <li>"EUPLOTID_NUCLEAR"</li>
   * <li>"BACTERIAL"</li>
   * <li>"ALTERNATIVE_YEAST_NUCLEAR"</li>
   * <li>"ASCIDIAN_MITOCHONDRIAL"</li>
   * <li>"FLATWORM_MITOCHONDRIAL"</li>
   * <li>"BLEPHARISMA_MACRONUCLEAR"</li>
   * <li>"CHLOROPHYCEAN_MITOCHONDRIAL"</li>
   * <li>"TREMATODE_MITOCHONDRIAL"</li>
   * <li>"SCENEDESMUS_MITOCHONDRIAL"</li>
   * </ul>
   *
   * There are public static final fields in the TranslationTable
   * interface which contain these values. One of these should be used
   * as the argument for this method.
   * <p>
   * You can now get the reverse translation of the residue back to its
   * (usually several) codons too.
   *
   * @param name the name of the genetic code table
   * @return the table registered under that name, or <code>null</code>
   *         if no table with that name was loaded
   * @since 1.1
   */
  public static ManyToOneTranslationTable getGeneticCode(String name) {
    return (ManyToOneTranslationTable) geneticCodes.get(name);
  }

  /**
   * Retrieve a TranslationTable by number.
   * These numbers correspond to the transl_table qualifier in the
   * DDBJ/EMBL/GenBank Feature Table (Version 6.5  Apr 2006): transl_table
   * defines the genetic code table used if other than the universal
   * genetic code table. Tables are described in appendix V,
   * section 7.5.5:
   *
   * <ul>
   * <li>" 1 - UNIVERSAL"</li>
   * <li>" 2 - VERTEBRATE_MITOCHONDRIAL"</li>
   * <li>" 3 - YEAST_MITOCHONDRIAL"</li>
   * <li>" 4 - MOLD_MITOCHONDRIAL"</li>
   * <li>" 5 - INVERTEBRATE_MITOCHONDRIAL"</li>
   * <li>" 6 - CILIATE_NUCLEAR"</li>
   * <li>" 9 - ECHINODERM_MITOCHONDRIAL"</li>
   * <li>"10 - EUPLOTID_NUCLEAR"</li>
   * <li>"11 - BACTERIAL"</li>
   * <li>"12 - ALTERNATIVE_YEAST_NUCLEAR"</li>
   * <li>"13 - ASCIDIAN_MITOCHONDRIAL"</li>
   * <li>"14 - FLATWORM_MITOCHONDRIAL"</li>
   * <li>"15 - BLEPHARISMA_MACRONUCLEAR"</li>
   * <li>"16 - CHLOROPHYCEAN_MITOCHONDRIAL"</li>
   * <li>"21 - TREMATODE_MITOCHONDRIAL"</li>
   * <li>"23 - SCENEDESMUS_MITOCHONDRIAL"</li>
   * </ul>
   *
   * @param table_num the transl_table number of the wanted table
   * @return the first loaded table whose table number equals table_num
   * @throws IllegalArgumentException if there is no table with that number.
   * @since 1.5
   */
  public static ManyToOneTranslationTable getGeneticCode(int table_num) {
    for (Iterator it = geneticCodes.values().iterator(); it.hasNext();) {
      SimpleGeneticCodeTable table = (SimpleGeneticCodeTable) it.next();
      if (table.getTableNumber() == table_num) {
        return table;
      }
    }
    throw new IllegalArgumentException("There is no genetic code table at that number");
  }

  /**
   * Retrieve a Set containing the name of each genetic code.
   * <p>
   * The returned set is the key set of the internal table map, not a
   * copy, so callers should treat it as read-only.
   *
   * @return the names under which genetic code tables are registered
   * @since 1.1
   */
  public static Set getGeneticCodeNames() {
    return geneticCodes.keySet();
  }

  /**
   * Translate RNA into protein (with termination symbols).  For
   * compatibility with BioJava 1.1, this will also handle sequences
   * which are already expressed in the (RNA x RNA x RNA) (codon)
   * alphabet.
   *
   * @param syms the RNA (or codon) SymbolList to translate
   * @return a translated view of syms using the "UNIVERSAL" genetic code
   * @throws IllegalAlphabetException if syms cannot be windowed into codons
   *         or translated with the "UNIVERSAL" table
   * @since 1.1
   */
  public static SymbolList translate(SymbolList syms)
    throws IllegalAlphabetException
  {
    if (syms.getAlphabet() == getRNA()) {
      // plain RNA: view it as non-overlapping windows of three symbols
      syms = SymbolListViews.windowedSymbolList(syms, 3);
    }
    return SymbolListViews.translate(syms, getGeneticCode("UNIVERSAL"));
  }

  /**
   * Populate geneticCodes from the classpath resource
   * org/biojava/bio/seq/TranslationTables.xml.
   * <p>
   * Every &lt;table&gt; child of the document element becomes one
   * SimpleGeneticCodeTable registered under its "name" attribute. Within a
   * table, a &lt;transl_table&gt; element supplies the table number and
   * description; every other child element supplies one codon ("from") to
   * residue ("to") translation.
   *
   * @throws BioError if the resource cannot be found or cannot be parsed
   */
  private static void loadGeneticCodes() {
    try {
      InputStream tablesStream = ClassTools.getClassLoader(RNATools.class).getResourceAsStream(
        "org/biojava/bio/seq/TranslationTables.xml"
      );
      if (tablesStream == null) {
        throw new BioError("Couldn't locate TranslationTables.xml.");
      }

      Document doc;
      try {
        InputSource is = new InputSource(tablesStream);
        DocumentBuilder parser = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        doc = parser.parse(is);
      } finally {
        // The DOM is fully built (or parsing has failed) at this point, so
        // release the resource stream instead of leaking it.
        try {
          tablesStream.close();
        } catch (java.io.IOException ignored) {
          // Nothing useful can be done if closing fails, and it must not
          // mask a parse failure.
        }
      }

      NodeList children = doc.getDocumentElement().getChildNodes();
      for (int i = 0; i < children.getLength(); i++) {
        Node cnode = children.item(i);
        if (!(cnode instanceof Element)) {
          continue;
        }

        Element child = (Element) cnode;
        if (!child.getNodeName().equals("table")) {
          continue;
        }

        String tableName = child.getAttribute("name");
        String source = child.getAttribute("source");
        String target = child.getAttribute("target");
        FiniteAlphabet sourceA =
          (FiniteAlphabet) AlphabetManager.alphabetForName(source);
        FiniteAlphabet targetA =
          (FiniteAlphabet) AlphabetManager.alphabetForName(target);
        SymbolTokenization targetP = targetA.getTokenization("name");
        SimpleGeneticCodeTable table = new SimpleGeneticCodeTable(
          sourceA,
          targetA
        );

        NodeList translates = child.getChildNodes();
        for (int j = 0; j < translates.getLength(); j++) {
          Node tn = translates.item(j);
          if (!(tn instanceof Element)) {
            continue;
          }

          Element te = (Element) tn;
          if (te.getTagName().equals("transl_table")) {
            int num = Integer.parseInt(te.getAttribute("value"));
            String description = te.getAttribute("description");
            table.setTableNumber(num);
            table.setDescription(description);
            continue;
          }
          String from = te.getAttribute("from");
          String to = te.getAttribute("to");

          //
          // Not the most elegant solution, but I wanted this working
          // quickly for 1.1.  It's been broken for ages.
          //     -td 26/i/2001
          //
          // The codon is parsed as three RNA symbols and then looked up in
          // the source alphabet, rather than parsed by a source
          // tokenization.
          //

          SymbolList fromSymbols = RNATools.createRNA(from);
          if (fromSymbols.length() != 3) {
            throw new BioError("`" + from + "' is not a valid codon");
          }

          AtomicSymbol fromS = (AtomicSymbol) sourceA.getSymbol(fromSymbols.toList());
          AtomicSymbol toS   = (AtomicSymbol) targetP.parseToken(to);
          table.setTranslation(fromS, toS);
        }

        geneticCodes.put(tableName, table);
      }
    } catch (Exception e) {
      throw new BioError("Couldn't parse TranslationTables.xml", e);
    }
  }

  /**
   * Sneaky class for complementing RNA bases.
   * <p>
   * Translating and untranslating both delegate to
   * RNATools.complement(Symbol); source and target alphabets are both the
   * RNA alphabet.
   */
  private static class RNAComplementTranslationTable
  extends AbstractReversibleTranslationTable {
    public Symbol doTranslate(Symbol s)
          throws IllegalSymbolException {
      return RNATools.complement(s);
    }

    public Symbol doUntranslate(Symbol s)
          throws IllegalSymbolException {
      return RNATools.complement(s);
    }

    public Alphabet getSourceAlphabet() {
      return RNATools.getRNA();
    }

    public Alphabet getTargetAlphabet() {
      return RNATools.getRNA();
    }
  }
}