001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.seq; 023 024import java.util.HashMap; 025import java.util.HashSet; 026import java.util.Iterator; 027import java.util.Map; 028import java.util.Set; 029 030import org.biojava.bio.BioError; 031import org.biojava.bio.BioException; 032import org.biojava.bio.SimpleAnnotation; 033import org.biojava.bio.dist.Distribution; 034import org.biojava.bio.dist.PairDistribution; 035import org.biojava.bio.dist.SimpleDistribution; 036import org.biojava.bio.seq.impl.SimpleGappedSequence; 037import org.biojava.bio.seq.impl.SimpleSequenceFactory; 038import org.biojava.bio.seq.io.SymbolTokenization; 039import org.biojava.bio.symbol.Alphabet; 040import org.biojava.bio.symbol.AlphabetManager; 041import org.biojava.bio.symbol.AtomicSymbol; 042import org.biojava.bio.symbol.FiniteAlphabet; 043import org.biojava.bio.symbol.IllegalAlphabetException; 044import org.biojava.bio.symbol.IllegalSymbolException; 045import org.biojava.bio.symbol.ReversibleTranslationTable; 046import org.biojava.bio.symbol.SimpleReversibleTranslationTable; 047import org.biojava.bio.symbol.SimpleSymbolList; 048import org.biojava.bio.symbol.Symbol; 049import org.biojava.bio.symbol.SymbolList; 050import org.biojava.bio.symbol.SymbolListViews; 051import org.biojava.utils.ChangeVetoException; 052 053/** 054 * Useful functionality for processing DNA sequences. 055 * 056 * @author Matthew Pocock 057 * @author Keith James (docs) 058 * @author Mark Schreiber 059 * @author David Huen 060 * @author Richard Holland 061 */ 062public final class DNATools { 063 private static final ReversibleTranslationTable complementTable; 064 static private final FiniteAlphabet dna; 065 private static final SymbolTokenization dnaTokens; 066 067 static private final AtomicSymbol a; 068 static private final AtomicSymbol g; 069 static private final AtomicSymbol c; 070 static private final AtomicSymbol t; 071 static private final Symbol n; 072 static private final Symbol m; 073 static private final Symbol r; 074 static private final Symbol w; 075 static private final Symbol s; 076 static private final Symbol y; 077 static private final Symbol k; 078 static private final Symbol v; 079 static private final Symbol h; 080 static private final Symbol d; 081 static private final Symbol b; 082 static private final SimpleReversibleTranslationTable transcriptionTable; 083 084 085 static private Map<Symbol, Symbol> symbolToComplement; 086 087 static { 088 try { 089 dna = (FiniteAlphabet) AlphabetManager.alphabetForName("DNA"); 090 dnaTokens = dna.getTokenization("token"); 091 SymbolList syms = new SimpleSymbolList(dnaTokens, "agctnmrwsykvhdb"); 092 a = (AtomicSymbol) syms.symbolAt(1); 093 g = (AtomicSymbol) syms.symbolAt(2); 094 c = (AtomicSymbol) syms.symbolAt(3); 095 t = (AtomicSymbol) syms.symbolAt(4); 096 n = syms.symbolAt(5); 097 m = syms.symbolAt(6); 098 r = syms.symbolAt(7); 099 w = syms.symbolAt(8); 100 s = syms.symbolAt(9); 101 y = syms.symbolAt(10); 102 k = syms.symbolAt(11); 103 v = syms.symbolAt(12); 104 h = syms.symbolAt(13); 105 d = syms.symbolAt(14); 106 b = syms.symbolAt(15); 107 108 symbolToComplement = new HashMap<Symbol, Symbol>(); 109 110 // add the gap symbol 111 Symbol gap = dna.getGapSymbol(); 112 symbolToComplement.put(gap, gap); 113 114 // add all other ambiguity symbols 115 for(Iterator i = AlphabetManager.getAllSymbols(dna).iterator(); i.hasNext();) { 116 Symbol as = (Symbol) i.next(); 117 FiniteAlphabet matches = (FiniteAlphabet) as.getMatches(); 118 if (matches.size() > 1) { // We've hit an ambiguous symbol. 119 Set<Symbol> l = new HashSet<Symbol>(); 120 for(Iterator j = matches.iterator(); j.hasNext(); ) { 121 l.add(complement((Symbol) j.next())); 122 } 123 symbolToComplement.put(as, dna.getAmbiguity(l)); 124 } 125 } 126 127 128 complementTable = new DNAComplementTranslationTable(); 129 130 transcriptionTable = new SimpleReversibleTranslationTable(getDNA(), RNATools.getRNA()); 131 transcriptionTable.setTranslation(a, RNATools.a()); 132 transcriptionTable.setTranslation(c, RNATools.c()); 133 transcriptionTable.setTranslation(g, RNATools.g()); 134 transcriptionTable.setTranslation(t, RNATools.u()); 135 136 } catch (Throwable th) { 137 throw new BioError("Unable to initialize DNATools", th); 138 } 139 } 140 141 public static AtomicSymbol a() { return a; } 142 public static AtomicSymbol g() { return g; } 143 public static AtomicSymbol c() { return c; } 144 public static AtomicSymbol t() { return t; } 145 public static Symbol n() { return n; } 146 public static Symbol m() { return m; } 147 public static Symbol r() { return r; } 148 public static Symbol w() { return w; } 149 public static Symbol s() { return s; } 150 public static Symbol y() { return y; } 151 public static Symbol k() { return k; } 152 public static Symbol v() { return v; } 153 public static Symbol h() { return h; } 154 public static Symbol d() { return d; } 155 public static Symbol b() { return b; } 156 157 158 private DNATools() { 159 } 160 161 /** 162 * Return the DNA alphabet. 163 * 164 * @return a flyweight version of the DNA alphabet 165 */ 166 public static FiniteAlphabet getDNA() { 167 return dna; 168 } 169 170 /** 171 * Gets the (DNA x DNA) Alphabet 172 * @return a flyweight version of the (DNA x DNA) alphabet 173 */ 174 public static FiniteAlphabet getDNAxDNA(){ 175 return (FiniteAlphabet)AlphabetManager.generateCrossProductAlphaFromName("(DNA x DNA)"); 176 } 177 178 /** 179 * Gets the (DNA x DNA x DNA) Alphabet 180 * @return a flyweight version of the (DNA x DNA x DNA) alphabet 181 */ 182 public static FiniteAlphabet getCodonAlphabet(){ 183 return (FiniteAlphabet)AlphabetManager.generateCrossProductAlphaFromName("(DNA x DNA x DNA)"); 184 } 185 186 /** 187 * Return a new DNA <span class="type">SymbolList</span> for 188 * <span class="arg">dna</span>. 189 * 190 * @param dna a <span class="type">String</span> to parse into DNA 191 * @return a <span class="type">SymbolList</span> created form 192 * <span class="arg">dna</span> 193 * @throws IllegalSymbolException if <span class="arg">dna</span> contains 194 * any non-DNA characters 195 */ 196 public static SymbolList createDNA(String dna) 197 throws IllegalSymbolException { 198 SymbolTokenization p = null; 199 try { 200 p = getDNA().getTokenization("token"); 201 } catch (BioException e) { 202 throw new BioError("Something has gone badly wrong with DNA", e); 203 } 204 return new SimpleSymbolList(p, dna); 205 } 206 207 /** 208 * Return a new DNA <span class="type">Sequence</span> for 209 * <span class="arg">dna</span>. 210 * 211 * @param dna a <span class="type">String</span> to parse into DNA 212 * @param name a <span class="type">String</span> to use as the name 213 * @return a <span class="type">Sequence</span> created form 214 * <span class="arg">dna</span> 215 * @throws IllegalSymbolException if <span class="arg">dna</span> contains 216 * any non-DNA characters 217 */ 218 public static Sequence createDNASequence(String dna, String name) 219 throws IllegalSymbolException { 220 //should I be calling createGappedDNASequence? 221 if(dna.indexOf('-') != -1 || dna.indexOf('~') != -1){//there is a gap 222 return createGappedDNASequence(dna, name); 223 } 224 // No need to wrap in try-catch as throws IllegalSymbolException already. 225 //try { 226 return new SimpleSequenceFactory().createSequence( 227 createDNA(dna), 228 "", name, new SimpleAnnotation() 229 ); 230 //} catch (BioException se) { 231 // throw new BioError("Something has gone badly wrong with DNA", se); 232 //} 233 } 234 235 236 /** Get a new dna as a GappedSequence */ 237 public static GappedSequence createGappedDNASequence(String dna, String name) throws IllegalSymbolException{ 238 String dna1 = dna.replaceAll("-", ""); 239 Sequence dnaSeq = createDNASequence(dna1, name); 240 GappedSequence dnaSeq1 = new SimpleGappedSequence(dnaSeq); 241 int pos = dna.indexOf('-', 0); 242 while(pos!=-1){ 243 dnaSeq1.addGapInView(pos+1); 244 pos = dna.indexOf('-', pos+1); 245 } 246 return dnaSeq1; 247 } 248 249 250 /** 251 * Return an integer index for a symbol - compatible with 252 * <code>forIndex</code>. 253 * 254 * <p> 255 * The index for a symbol is stable accross virtual machines & 256 * invocations. 257 * </p> 258 * 259 * @param sym the Symbol to index 260 * @return the index for that symbol 261 * 262 * @throws IllegalSymbolException if sym is not a member of the DNA 263 * alphabet 264 */ 265 public static int index(Symbol sym) throws IllegalSymbolException { 266 if(sym == a) { 267 return 0; 268 } else if(sym == g) { 269 return 1; 270 } else if(sym == c) { 271 return 2; 272 } else if(sym == t) { 273 return 3; 274 } 275 getDNA().validate(sym); 276 throw new IllegalSymbolException("Really confused. Can't find index for " + 277 sym.getName()); 278 } 279 280 /** 281 * Return the symbol for an index - compatible with <code>index</code>. 282 * 283 * <p> 284 * The index for a symbol is stable accross virtual machines & 285 * invocations. 286 * </p> 287 * 288 * @param index the index to look up 289 * @return the symbol at that index 290 * 291 * @throws IndexOutOfBoundsException if index is not between 0 and 3 292 */ 293 static public Symbol forIndex(int index) 294 throws IndexOutOfBoundsException { 295 if(index == 0) 296 return a; 297 else if(index == 1) 298 return g; 299 else if(index == 2) 300 return c; 301 else if(index == 3) 302 return t; 303 else throw new IndexOutOfBoundsException("No symbol for index " + index); 304 } 305 306 /** 307 * Complement the symbol. 308 * 309 * @param sym the symbol to complement 310 * @return a Symbol that is the complement of sym 311 * @throws IllegalSymbolException if sym is not a member of the DNA alphabet 312 */ 313 static public Symbol complement(Symbol sym) 314 throws IllegalSymbolException { 315 if(sym == a) { 316 return t; 317 } else if(sym == g) { 318 return c; 319 } else if(sym == c) { 320 return g; 321 } else if(sym == t) { 322 return a; 323 } 324 Symbol s = symbolToComplement.get(sym); 325 if(s != null) { 326 return s; 327 } else { 328 getDNA().validate(sym); 329 throw new BioError( 330 "Really confused. Can't find symbol " + 331 sym.getName() 332 ); 333 } 334 } 335 336 /** 337 * Retrieve the symbol for a symbol. 338 * 339 * @param token the char to look up 340 * @return the symbol for that char 341 * @throws IllegalSymbolException if the char is not a valid IUB dna code 342 */ 343 static public Symbol forSymbol(char token) 344 throws IllegalSymbolException { 345 String t = String.valueOf(token); 346 SymbolTokenization toke; 347 try{ 348 toke = getDNA().getTokenization("token"); 349 }catch(BioException e){ 350 throw new BioError("Cannot find the 'token' Tokenization for DNA!?", e); 351 } 352 return toke.parseToken(t); 353 } 354 355 /** 356 * Retrieve a complement view of list. 357 * 358 * @param list the SymbolList to complement 359 * @return a SymbolList that is the complement 360 * @throws IllegalAlphabetException if list is not a complementable alphabet 361 */ 362 public static SymbolList complement(SymbolList list) 363 throws IllegalAlphabetException { 364 return SymbolListViews.translate(list, complementTable()); 365 } 366 367 /** 368 * Retrieve a reverse-complement view of list. 369 * 370 * @param list the SymbolList to complement 371 * @return a SymbolList that is the complement 372 * @throws IllegalAlphabetException if list is not a complementable alphabet 373 */ 374 public static SymbolList reverseComplement(SymbolList list) 375 throws IllegalAlphabetException { 376 return SymbolListViews.translate(SymbolListViews.reverse(list), complementTable()); 377 } 378 379 /** 380 * Returns a SymbolList that is reverse complemented if the strand is 381 * negative, and the origninal one if it is not. 382 * 383 * @param list the SymbolList to view 384 * @param strand the Strand to use 385 * @return the apropreate view of the SymbolList 386 * @throws IllegalAlphabetException if list is not a complementable alphabet 387 */ 388 public static SymbolList flip(SymbolList list, StrandedFeature.Strand strand) 389 throws IllegalAlphabetException { 390 if(strand == StrandedFeature.NEGATIVE) { 391 return reverseComplement(list); 392 } else { 393 return list; 394 } 395 } 396 397 /** 398 * Get a translation table for complementing DNA symbols. 399 * 400 * @since 1.1 401 */ 402 403 public static ReversibleTranslationTable complementTable() { 404 return complementTable; 405 } 406 407 /** 408 * Get a single-character token for a DNA symbol 409 * 410 * @throws IllegalSymbolException if <code>sym</code> is not a member of the DNA alphabet 411 */ 412 413 public static char dnaToken(Symbol sym) 414 throws IllegalSymbolException 415 { 416 return dnaTokens.tokenizeSymbol(sym).charAt(0); 417 } 418 419 /** 420 * Sneaky class for complementing DNA bases. 421 */ 422 423 private static class DNAComplementTranslationTable 424 implements ReversibleTranslationTable { 425 public Symbol translate(Symbol s) 426 throws IllegalSymbolException { 427 return DNATools.complement(s); 428 } 429 430 public Symbol untranslate(Symbol s) 431 throws IllegalSymbolException { 432 return DNATools.complement(s); 433 } 434 435 public Alphabet getSourceAlphabet() { 436 return DNATools.getDNA(); 437 } 438 439 public Alphabet getTargetAlphabet() { 440 return DNATools.getDNA(); 441 } 442 } 443 444 /** 445 * return a SimpleDistribution of specified GC content. 446 * @param fractionGC (G+C) content as a fraction. 447 */ 448 public static Distribution getDNADistribution(double fractionGC) 449 { 450 try { 451 Distribution dist = new SimpleDistribution(DNATools.getDNA()); 452 double gc = 0.5 * fractionGC; 453 double at = 0.5 * (1.0 - fractionGC); 454 dist.setWeight(DNATools.a(), at); 455 dist.setWeight(DNATools.t(), at); 456 dist.setWeight(DNATools.c(), gc); 457 dist.setWeight(DNATools.g(), gc); 458 459 return dist; 460 } 461 // these exceptions are just plain impossible!!! 462 catch (IllegalSymbolException ise) { return null; } 463 catch (ChangeVetoException cve) { return null; } 464 } 465 466 /** 467 * return a (DNA x DNA) cross-product Distribution with specified 468 * DNA contents in each component Alphabet. 469 * @param fractionGC0 (G+C) content of first sequence as a fraction. 470 * @param fractionGC1 (G+C) content of second sequence as a fraction. 471 */ 472 public static Distribution getDNAxDNADistribution( 473 double fractionGC0, 474 double fractionGC1 475 ) 476 { 477 return new PairDistribution(getDNADistribution(fractionGC0), getDNADistribution(fractionGC1)); 478 } 479 480 /** 481 * Converts a <code>SymbolList</code> from the DNA <code>Alphabet</code> to the 482 * RNA <code>Alphabet</code>. 483 * @param syms the <code>SymbolList</code> to convert to RNA 484 * @return a view on <code>syms</code> where <code>Symbols</code> have been converted to RNA. 485 * Most significantly t's are now u's. The 5' to 3' order of the Symbols is conserved. 486 * @since 1.4 487 * @throws IllegalAlphabetException if <code>syms</code> is not DNA. 488 */ 489 public static SymbolList toRNA(SymbolList syms)throws IllegalAlphabetException{ 490 return SymbolListViews.translate(syms, transcriptionTable); 491 } 492 493 /** 494 * Transcribes DNA to RNA. The method more closely represents the biological reality 495 * than <code>toRNA(SymbolList syms)</code> does. The presented DNA <code>SymbolList</code> 496 * is assumed to be the template strand in the 5' to 3' orientation. The resulting 497 * RNA is transcribed from this template effectively a reverse complement in the RNA alphabet. 498 * The method is equivalent to calling <code>reverseComplement()</code> and <code>toRNA()</code> in sequence. 499 * <p>If you are dealing with cDNA sequences that you want converted to RNA you would be 500 * better off calling <code>toRNA(SymbolList syms)</code> 501 * @param syms the <code>SymbolList</code> to convert to RNA 502 * @return a view on <code>syms</code> where <code>Symbols</code> have been converted to RNA. 503 * @since 1.4 504 * @throws IllegalAlphabetException if <code>syms</code> is not DNA. 505 */ 506 public static SymbolList transcribeToRNA(SymbolList syms) throws IllegalAlphabetException{ 507 syms = reverseComplement(syms); 508 return toRNA(syms); 509 } 510 511 /** 512 * Convenience method that directly converts a DNA sequence to RNA then to 513 * protein. The translated protein is from the +1 reading frame of the 514 * <code>SymbolList</code>. The whole <code>SymbolList</code> is translated 515 * although up to 2 DNA residues may be truncated if full codons cannot be 516 * formed. 517 * @param syms the sequence to be translated. 518 * @return the translated protein sequence. 519 * @throws org.biojava.bio.symbol.IllegalAlphabetException if <code>syms</code> 520 * is not from the DNA alphabet. 521 * @since 1.5.1 522 */ 523 public static SymbolList toProtein(final SymbolList syms) 524 throws IllegalAlphabetException{ 525 SymbolList symz = new SimpleSymbolList(syms); 526 symz = toRNA(symz); 527 //truncate to a length divisible by three. 528 symz = symz.subList(1, symz.length() - (symz.length() %3)); 529 return RNATools.translate(symz); 530 } 531 532 /** 533 * Convenience method to translate a region of a DNA sequence directly into 534 * protein. While the start and end can be specified if the length of the 535 * specified region is not evenly divisible by three then the translated 536 * region will be truncated until a full terminal codon can be formed. 537 * @param syms the DNA sequence to be translated. 538 * @param start the location to begin translation. 539 * @param end the end of the translated region. 540 * @return the translated protein sequence. 541 * @throws org.biojava.bio.symbol.IllegalAlphabetException if <code>syms 542 * </code> is not from the DNA alphabet. 543 * @since 1.5.1 544 */ 545 public static SymbolList toProtein(final SymbolList syms, int start, int end) 546 throws IllegalAlphabetException{ 547 SymbolList symz = new SimpleSymbolList(syms); 548 symz = symz.subList(start, end); 549 //return toProtein(syms); 550 //changed to fix bug 2521 551 return toProtein(symz); 552 } 553} 554 555 556 557 558 559