/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */

package org.biojava.bio.seq;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

import org.biojava.bio.BioError;
import org.biojava.bio.BioException;
import org.biojava.bio.SimpleAnnotation;
import org.biojava.bio.seq.impl.SimpleSequenceFactory;
import org.biojava.bio.seq.io.SymbolTokenization;
import org.biojava.bio.symbol.Alphabet;
import org.biojava.bio.symbol.AlphabetManager;
import org.biojava.bio.symbol.AtomicSymbol;
import org.biojava.bio.symbol.FiniteAlphabet;
import org.biojava.bio.symbol.IllegalAlphabetException;
import org.biojava.bio.symbol.IllegalSymbolException;
import org.biojava.bio.symbol.ReversibleTranslationTable;
import org.biojava.bio.symbol.SimpleSymbolList;
import org.biojava.bio.symbol.Symbol;
import org.biojava.bio.symbol.SymbolList;
import org.biojava.bio.symbol.SymbolListViews;

/**
 * Useful functionality for processing nucleotide sequences.
 *
 * <p>
 * All state is resolved once, in the static initializer, from the alphabet
 * registered under the name <code>NUCLEOTIDE</code>. The class is a pure
 * static utility and cannot be instantiated.
 * </p>
 *
 * @author Matthew Pocock
 * @author Keith James (docs)
 */
public final class NucleotideTools {
  private static final ReversibleTranslationTable complementTable;
  private static final FiniteAlphabet nucleotide;
  private static final SymbolTokenization nucleotideTokens;

  // The five atomic symbols, parsed from the tokens "agctu".
  private static final AtomicSymbol a;
  private static final AtomicSymbol g;
  private static final AtomicSymbol c;
  private static final AtomicSymbol t;
  private static final AtomicSymbol u;

  // The symbols parsed from the tokens "rymkswbdhvn".
  private static final Symbol r;
  private static final Symbol y;
  private static final Symbol m;
  private static final Symbol k;
  private static final Symbol s;
  private static final Symbol w;
  private static final Symbol b;
  private static final Symbol d;
  private static final Symbol h;
  private static final Symbol v;
  private static final Symbol n;

  /**
   * Complement lookup for the gap symbol and for every symbol that matches
   * more than one atomic symbol. The five atomic symbols are not stored here;
   * {@link #complement(Symbol)} handles them directly.
   */
  private static final Map symbolToComplement;

  static {
    try {
      nucleotide = (FiniteAlphabet) AlphabetManager.alphabetForName("NUCLEOTIDE");
      nucleotideTokens = nucleotide.getTokenization("token");

      // Parse every token once; positions in a SymbolList are 1-based.
      SymbolList syms = new SimpleSymbolList(nucleotideTokens, "agcturymkswbdhvn");
      a = (AtomicSymbol) syms.symbolAt(1);
      g = (AtomicSymbol) syms.symbolAt(2);
      c = (AtomicSymbol) syms.symbolAt(3);
      t = (AtomicSymbol) syms.symbolAt(4);
      u = (AtomicSymbol) syms.symbolAt(5);
      r = syms.symbolAt(6);
      y = syms.symbolAt(7);
      m = syms.symbolAt(8);
      k = syms.symbolAt(9);
      s = syms.symbolAt(10);
      w = syms.symbolAt(11);
      b = syms.symbolAt(12);
      d = syms.symbolAt(13);
      h = syms.symbolAt(14);
      v = syms.symbolAt(15);
      n = syms.symbolAt(16);

      symbolToComplement = new HashMap();

      // The gap symbol is its own complement.
      Symbol gap = nucleotide.getGapSymbol();
      symbolToComplement.put(gap, gap);

      // For every symbol that matches more than one atomic symbol, the
      // complement is the ambiguity symbol over the complements of its matches.
      for (Iterator i = AlphabetManager.getAllSymbols(nucleotide).iterator(); i.hasNext();) {
        Symbol ambiguous = (Symbol) i.next();
        FiniteAlphabet matches = (FiniteAlphabet) ambiguous.getMatches();
        if (matches.size() > 1) {
          Set complements = new HashSet();
          for (Iterator j = matches.iterator(); j.hasNext();) {
            complements.add(complement((Symbol) j.next()));
          }
          symbolToComplement.put(ambiguous, nucleotide.getAmbiguity(complements));
        }
      }

      complementTable = new NucleotideComplementTranslationTable();
    } catch (Throwable cause) {
      // Named 'cause' rather than 't' so that it does not shadow the field 't'.
      throw new BioError("Unable to initialize NucleotideTools", cause);
    }
  }

  public static AtomicSymbol a() { return a; }
  public static AtomicSymbol g() { return g; }
  public static AtomicSymbol c() { return c; }
  public static AtomicSymbol t() { return t; }
  public static AtomicSymbol u() { return u; }
  public static Symbol r() { return r; }
  public static Symbol y() { return y; }
  public static Symbol m() { return m; }
  public static Symbol k() { return k; }
  public static Symbol s() { return s; }
  public static Symbol w() { return w; }
  public static Symbol b() { return b; }
  public static Symbol d() { return d; }
  public static Symbol h() { return h; }
  public static Symbol v() { return v; }
  public static Symbol n() { return n; }

  /** Not instantiable: this class only exposes static helpers. */
  private NucleotideTools() {
  }

  /**
   * Return the Nucleotide alphabet.
   *
   * @return the Nucleotide alphabet resolved at class initialization
   */
  public static FiniteAlphabet getNucleotide() {
    return nucleotide;
  }

  /**
   * Return a new Nucleotide <span class="type">SymbolList</span> for
   * <span class="arg">nucleotide</span>.
   *
   * @param nucleotide a <span class="type">String</span> to parse into Nucleotide
   * @return a <span class="type">SymbolList</span> created from
   *         <span class="arg">nucleotide</span>
   * @throws IllegalSymbolException if <span class="arg">nucleotide</span> contains
   *         any non-Nucleotide characters
   */
  public static SymbolList createNucleotide(String nucleotide)
  throws IllegalSymbolException {
    SymbolTokenization p;
    try {
      p = getNucleotide().getTokenization("token");
    } catch (BioException se) {
      throw new BioError("Something has gone badly wrong with Nucleotide", se);
    }
    // Parsing happens outside the try block on purpose: IllegalSymbolException
    // is a BioException, so parsing inside it would turn the documented
    // exception for bad input into a BioError.
    return new SimpleSymbolList(p, nucleotide);
  }

  /**
   * Return a new Nucleotide <span class="type">Sequence</span> for
   * <span class="arg">nucleotide</span>.
   *
   * @param nucleotide a <span class="type">String</span> to parse into Nucleotide
   * @param name a <span class="type">String</span> to use as the name
   * @return a <span class="type">Sequence</span> created from
   *         <span class="arg">nucleotide</span>
   * @throws IllegalSymbolException if <span class="arg">nucleotide</span> contains
   *         any non-Nucleotide characters
   */
  public static Sequence createNucleotideSequence(String nucleotide, String name)
  throws IllegalSymbolException {
    try {
      return new SimpleSequenceFactory().createSequence(
        createNucleotide(nucleotide),
        "", name, new SimpleAnnotation()
      );
    } catch (IllegalSymbolException ise) {
      // Bad input is the caller's problem: propagate it as declared instead of
      // letting the broader handler below convert it into a BioError.
      throw ise;
    } catch (BioException se) {
      throw new BioError("Something has gone badly wrong with Nucleotide", se);
    }
  }

  /**
   * Return an integer index for a symbol - compatible with
   * <code>forIndex</code>.
   *
   * <p>
   * The index for a symbol is stable across virtual machines and
   * invocations. Only the five atomic symbols have an index:
   * a=0, g=1, c=2, t=3, u=4.
   * </p>
   *
   * @param sym the Symbol to index
   * @return the index for that symbol
   *
   * @throws IllegalSymbolException if sym is not a member of the Nucleotide
   *         alphabet, or if it passes validation but is not one of the five
   *         atomic symbols
   */
  public static int index(Symbol sym) throws IllegalSymbolException {
    if (sym == a) {
      return 0;
    } else if (sym == g) {
      return 1;
    } else if (sym == c) {
      return 2;
    } else if (sym == t) {
      return 3;
    } else if (sym == u) {
      return 4;
    }
    // Let the alphabet reject symbols it does not accept; anything that
    // passes validation but has no index is reported below.
    getNucleotide().validate(sym);
    throw new IllegalSymbolException("Really confused. Can't find index for " +
                                      sym.getName());
  }

  /**
   * Return the symbol for an index - compatible with <code>index</code>.
   *
   * <p>
   * The index for a symbol is stable across virtual machines and
   * invocations.
   * </p>
   *
   * @param index the index to look up
   * @return the symbol at that index
   *
   * @throws IndexOutOfBoundsException if index is not between 0 and 4
   */
  public static Symbol forIndex(int index)
  throws IndexOutOfBoundsException {
    if (index == 0) {
      return a;
    } else if (index == 1) {
      return g;
    } else if (index == 2) {
      return c;
    } else if (index == 3) {
      return t;
    } else if (index == 4) {
      return u;
    } else {
      throw new IndexOutOfBoundsException("No symbol for index " + index);
    }
  }

  /**
   * Complement the symbol.
   *
   * <p>
   * The atomic symbols map a to t, g to c, c to g, t to a and u to a. Every
   * other symbol is resolved through the table built at class initialization.
   * </p>
   *
   * @param sym the symbol to complement
   * @return a Symbol that is the complement of sym
   * @throws IllegalSymbolException if sym is not a member of the Nucleotide alphabet
   */
  public static Symbol complement(Symbol sym)
  throws IllegalSymbolException {
    if (sym == a) {
      return t;
    } else if (sym == g) {
      return c;
    } else if (sym == c) {
      return g;
    } else if (sym == t) {
      return a;
    } else if (sym == u) {
      return a;
    }
    // Local is not called 's', which would shadow the field of that name.
    Symbol comp = (Symbol) symbolToComplement.get(sym);
    if (comp != null) {
      return comp;
    } else {
      getNucleotide().validate(sym);
      // The symbol passed validation yet has no complement entry.
      throw new BioError(
        "Really confused. Can't find symbol " +
        sym.getName()
      );
    }
  }

  /**
   * Retrieve the symbol for a char.
   *
   * @param token the char to look up
   * @return the symbol for that char
   * @throws IllegalSymbolException if the char does not belong to {a, g, c, t, u}
   */
  public static Symbol forSymbol(char token)
  throws IllegalSymbolException {
    if (token == 'a') {
      return a;
    } else if (token == 'g') {
      return g;
    } else if (token == 'c') {
      return c;
    } else if (token == 't') {
      return t;
    } else if (token == 'u') {
      return u;
    }
    throw new IllegalSymbolException("Unable to find symbol for token " + token);
  }

  /**
   * Retrieve a complement view of list.
   *
   * @param list the SymbolList to complement
   * @return a SymbolList that is the complement
   * @throws IllegalAlphabetException if list is not a complementable alphabet
   */
  public static SymbolList complement(SymbolList list)
  throws IllegalAlphabetException {
    return SymbolListViews.translate(list, complementTable());
  }

  /**
   * Retrieve a reverse-complement view of list.
   *
   * @param list the SymbolList to reverse-complement
   * @return a SymbolList that is the reverse complement
   * @throws IllegalAlphabetException if list is not a complementable alphabet
   */
  public static SymbolList reverseComplement(SymbolList list)
  throws IllegalAlphabetException {
    return SymbolListViews.translate(SymbolListViews.reverse(list), complementTable());
  }

  /**
   * Get a translation table for complementing Nucleotide symbols.
   *
   * @return the shared complement translation table
   * @since 1.1
   */
  public static ReversibleTranslationTable complementTable() {
    return complementTable;
  }

  /**
   * Get a single-character token for a Nucleotide symbol.
   *
   * @param sym the symbol to tokenize
   * @return the first character of the token for <code>sym</code>
   * @throws IllegalSymbolException if <code>sym</code> is not a member of the Nucleotide alphabet
   */
  public static char nucleotideToken(Symbol sym)
  throws IllegalSymbolException {
    return nucleotideTokens.tokenizeSymbol(sym).charAt(0);
  }

  /**
   * Sneaky class for complementing Nucleotide bases.
   *
   * <p>
   * Both directions delegate to {@link NucleotideTools#complement(Symbol)},
   * and both alphabets are the Nucleotide alphabet.
   * </p>
   */
  private static class NucleotideComplementTranslationTable
  implements ReversibleTranslationTable {
    public Symbol translate(Symbol s)
    throws IllegalSymbolException {
      return NucleotideTools.complement(s);
    }

    public Symbol untranslate(Symbol s)
    throws IllegalSymbolException {
      return NucleotideTools.complement(s);
    }

    public Alphabet getSourceAlphabet() {
      return NucleotideTools.getNucleotide();
    }

    public Alphabet getTargetAlphabet() {
      return NucleotideTools.getNucleotide();
    }
  }
}