001/*
002
003 *                    BioJava development code
004
005 *
006
007 * This code may be freely distributed and modified under the
008
009 * terms of the GNU Lesser General Public Licence.  This should
010
011 * be distributed with the code.  If you do not have a copy,
012
013 * see:
014
015 *
016
017 *      http://www.gnu.org/copyleft/lesser.html
018
019 *
020
021 * Copyright for this code is held jointly by the individual
022
023 * authors.  These should be listed in @author doc comments.
024
025 *
026
027 * For more information on the BioJava project and its aims,
028
029 * or to join the biojava-l mailing list, visit the home page
030
031 * at:
032
033 *
034
035 *      http://www.biojava.org/
036
037 *
038
039 */
040
041
042
043package org.biojava.bio.seq;
044
045
046
047import java.util.HashMap;
048import java.util.HashSet;
049import java.util.Iterator;
050import java.util.Map;
051import java.util.Set;
052
053import org.biojava.bio.BioError;
054import org.biojava.bio.BioException;
055import org.biojava.bio.SimpleAnnotation;
056import org.biojava.bio.seq.impl.SimpleSequenceFactory;
057import org.biojava.bio.seq.io.SymbolTokenization;
058import org.biojava.bio.symbol.Alphabet;
059import org.biojava.bio.symbol.AlphabetManager;
060import org.biojava.bio.symbol.AtomicSymbol;
061import org.biojava.bio.symbol.FiniteAlphabet;
062import org.biojava.bio.symbol.IllegalAlphabetException;
063import org.biojava.bio.symbol.IllegalSymbolException;
064import org.biojava.bio.symbol.ReversibleTranslationTable;
065import org.biojava.bio.symbol.SimpleSymbolList;
066import org.biojava.bio.symbol.Symbol;
067import org.biojava.bio.symbol.SymbolList;
068import org.biojava.bio.symbol.SymbolListViews;
069
070
071
072/**
073
074 * Useful functionality for processing nucleotide sequences.
075
076 *
077
078 * @author Matthew Pocock
079
080 * @author Keith James (docs)
081
082 */
083
084public final class NucleotideTools {
085
086  private static final ReversibleTranslationTable complementTable;
087
088  static private final FiniteAlphabet nucleotide;
089
090    private static final SymbolTokenization nucleotideTokens;
091
092
093
094  static private final AtomicSymbol a;
095
096  static private final AtomicSymbol g;
097
098  static private final AtomicSymbol c;
099
100  static private final AtomicSymbol t;
101
102  static private final AtomicSymbol u;
103
104  static private final Symbol r;
105
106  static private final Symbol y;
107
108  static private final Symbol m;
109
110  static private final Symbol k;
111
112  static private final Symbol s;
113
114  static private final Symbol w;
115
116  static private final Symbol b;
117
118  static private final Symbol d;
119
120  static private final Symbol h;
121
122  static private final Symbol v;
123
124  static private final Symbol n;
125
126
127
128
129
130  static private Map symbolToComplement;
131
132
133
134  static {
135
136    try {
137
138      nucleotide = (FiniteAlphabet) AlphabetManager.alphabetForName("NUCLEOTIDE");
139
140      nucleotideTokens = nucleotide.getTokenization("token");
141
142      SymbolList syms = new SimpleSymbolList(nucleotideTokens, "agcturymkswbdhvn");
143
144      a = (AtomicSymbol) syms.symbolAt(1);
145
146      g = (AtomicSymbol) syms.symbolAt(2);
147
148      c = (AtomicSymbol) syms.symbolAt(3);
149
150      t = (AtomicSymbol) syms.symbolAt(4);
151
152      u = (AtomicSymbol) syms.symbolAt(5);
153
154      r = syms.symbolAt(6);
155
156      y = syms.symbolAt(7);
157
158      m = syms.symbolAt(8);
159
160      k = syms.symbolAt(9);
161
162      s = syms.symbolAt(10);
163
164      w = syms.symbolAt(11);
165
166      b = syms.symbolAt(12);
167
168      d = syms.symbolAt(13);
169
170      h = syms.symbolAt(14);
171
172      v = syms.symbolAt(15);
173
174      n = syms.symbolAt(16);
175
176
177
178      symbolToComplement = new HashMap();
179
180
181
182      // add the gap symbol
183
184      Symbol gap = nucleotide.getGapSymbol();
185
186      symbolToComplement.put(gap, gap);
187
188
189
190      // add all other ambiguity symbols
191
192      for(Iterator i = AlphabetManager.getAllSymbols(nucleotide).iterator(); i.hasNext();) {
193
194          Symbol as = (Symbol) i.next();
195
196          FiniteAlphabet matches = (FiniteAlphabet) as.getMatches();
197
198          if (matches.size() > 1) {   // We've hit an ambiguous symbol.
199
200              Set l = new HashSet();
201
202              for(Iterator j = matches.iterator(); j.hasNext(); ) {
203
204                  l.add(complement((Symbol) j.next()));
205
206              }
207
208              symbolToComplement.put(as, nucleotide.getAmbiguity(l));
209
210          }
211
212      }
213
214
215
216
217
218      complementTable = new NucleotideComplementTranslationTable();
219
220    } catch (Throwable t) {
221
222      throw new BioError("Unable to initialize NucleotideTools",t);
223
224    }
225
226  }
227
228
229
230  public static AtomicSymbol a() { return a; }
231
232  public static AtomicSymbol g() { return g; }
233
234  public static AtomicSymbol c() { return c; }
235
236  public static AtomicSymbol t() { return t; }
237
238  public static AtomicSymbol u() { return u; }
239
240  public static Symbol r() { return r; }
241
242  public static Symbol y() { return y; }
243
244  public static Symbol m() { return m; }
245
246  public static Symbol k() { return k; }
247
248  public static Symbol s() { return s; }
249
250  public static Symbol w() { return w; }
251
252  public static Symbol b() { return b; }
253
254  public static Symbol d() { return d; }
255
256  public static Symbol h() { return h; }
257
258  public static Symbol v() { return v; }
259
260  public static Symbol n() { return n; }
261
262
263  private NucleotideTools() {
264  }
265
266  /**
267
268   * Return the Nucleotide alphabet.
269
270   *
271
272   * @return a flyweight version of the Nucleotide alphabet
273
274   */
275
276  public static FiniteAlphabet getNucleotide() {
277
278    return nucleotide;
279
280  }
281
282
283
284  /**
285
286   * Return a new Nucleotide <span class="type">SymbolList</span> for
287
288   * <span class="arg">nucleotide</span>.
289
290   *
291
292   * @param nucleotide a <span class="type">String</span> to parse into Nucleotide
293
294   * @return a <span class="type">SymbolList</span> created form
295
296   *         <span class="arg">nucleotide</span>
297
298   * @throws IllegalSymbolException if <span class="arg">nucleotide</span> contains
299
300   *         any non-Nucleotide characters
301
302   */
303
304  public static SymbolList createNucleotide(String nucleotide)
305
306  throws IllegalSymbolException {
307
308    try {
309
310      SymbolTokenization p = getNucleotide().getTokenization("token");
311
312      return new SimpleSymbolList(p, nucleotide);
313
314    } catch (BioException se) {
315
316      throw new BioError("Something has gone badly wrong with Nucleotide",se);
317
318    }
319
320  }
321
322
323
324  /**
325
326   * Return a new Nucleotide <span class="type">Sequence</span> for
327
328   * <span class="arg">nucleotide</span>.
329
330   *
331
332   * @param nucleotide a <span class="type">String</span> to parse into Nucleotide
333
334   * @param name a <span class="type">String</span> to use as the name
335
336   * @return a <span class="type">Sequence</span> created form
337
338   *         <span class="arg">nucleotide</span>
339
340   * @throws IllegalSymbolException if <span class="arg">nucleotide</span> contains
341
342   *         any non-Nucleotide characters
343
344   */
345
346  public static Sequence createNucleotideSequence(String nucleotide, String name)
347
348  throws IllegalSymbolException {
349
350    try {
351
352      return new SimpleSequenceFactory().createSequence(
353
354        createNucleotide(nucleotide),
355
356        "", name, new SimpleAnnotation()
357
358      );
359
360    } catch (BioException se) {
361
362      throw new BioError("Something has gone badly wrong with Nucleotide",se);
363
364    }
365
366  }
367
368
369
370  /**
371
372   * Return an integer index for a symbol - compatible with
373
374   * <code>forIndex</code>.
375
376   *
377
378   * <p>
379
380   * The index for a symbol is stable accross virtual machines &
381
382   * invocations.
383
384   * </p>
385
386   *
387
388   * @param sym  the Symbol to index
389
390   * @return the index for that symbol
391
392   *
393
394   * @throws IllegalSymbolException if sym is not a member of the Nucleotide
395
396   * alphabet
397
398   */
399
400  public static int index(Symbol sym) throws IllegalSymbolException {
401
402    if(sym == a) {
403
404      return 0;
405
406    } else if(sym == g) {
407
408      return 1;
409
410    } else if(sym == c) {
411
412      return 2;
413
414    } else if(sym == t) {
415
416      return 3;
417
418    } else if(sym == u) {
419
420      return 4;
421
422    }
423
424    getNucleotide().validate(sym);
425
426    throw new IllegalSymbolException("Really confused. Can't find index for " +
427
428                                      sym.getName());
429
430  }
431
432
433
434  /**
435
436   * Return the symbol for an index - compatible with <code>index</code>.
437
438   *
439
440   * <p>
441
442   * The index for a symbol is stable accross virtual machines &
443
444   * invocations.
445
446   * </p>
447
448   *
449
450   * @param index  the index to look up
451
452   * @return       the symbol at that index
453
454   *
455
456   * @throws IndexOutOfBoundsException if index is not between 0 and 3
457
458   */
459
460  static public Symbol forIndex(int index)
461
462  throws IndexOutOfBoundsException {
463
464    if(index == 0)
465
466      return a;
467
468    else if(index == 1)
469
470      return g;
471
472    else if(index == 2)
473
474      return c;
475
476    else if(index == 3)
477
478      return t;
479
480    else if(index == 4)
481
482      return u;
483
484    else throw new IndexOutOfBoundsException("No symbol for index " + index);
485
486  }
487
488
489
490  /**
491
492   * Complement the symbol.
493
494   *
495
496   * @param sym  the symbol to complement
497
498   * @return a Symbol that is the complement of sym
499
500   * @throws IllegalSymbolException if sym is not a member of the Nucleotide alphabet
501
502   */
503
504  static public Symbol complement(Symbol sym)
505
506  throws IllegalSymbolException {
507
508    if(sym == a) {
509
510      return t;
511
512    } else if(sym == g) {
513
514      return c;
515
516    } else if(sym == c) {
517
518      return g;
519
520    } else if(sym == t) {
521
522      return a;
523
524    } else if(sym == u) {
525
526      return a;
527
528    }
529
530    Symbol s = (Symbol) symbolToComplement.get(sym);
531
532    if(s != null) {
533
534      return s;
535
536    } else {
537
538      getNucleotide().validate(sym);
539
540      throw new BioError(
541
542        "Really confused. Can't find symbol " +
543
544        sym.getName()
545
546      );
547
548    }
549
550  }
551
552
553
554  /**
555
556   * Retrieve the symbol for a symbol.
557
558   *
559
560   * @param token  the char to look up
561
562   * @return  the symbol for that char
563
564   * @throws IllegalSymbolException if the char does not belong to {a, g, c, t, u}
565
566   */
567
568  static public Symbol forSymbol(char token)
569
570  throws IllegalSymbolException {
571
572    if(token == 'a') {
573
574      return a;
575
576    } else if(token == 'g') {
577
578      return g;
579
580    } else if(token == 'c') {
581
582      return c;
583
584    } else if(token == 't') {
585
586      return t;
587
588    } else if(token == 'u') {
589
590      return u;
591
592    }
593
594    throw new IllegalSymbolException("Unable to find symbol for token " + token);
595
596  }
597
598
599
600  /**
601
602   * Retrieve a complement view of list.
603
604   *
605
606   * @param list  the SymbolList to complement
607
608   * @return a SymbolList that is the complement
609
610   * @throws IllegalAlphabetException if list is not a complementable alphabet
611
612   */
613
614  public static SymbolList complement(SymbolList list)
615
616  throws IllegalAlphabetException {
617
618    return SymbolListViews.translate(list, complementTable());
619
620  }
621
622
623
624  /**
625
626   * Retrieve a reverse-complement view of list.
627
628   *
629
630   * @param list  the SymbolList to complement
631
632   * @return a SymbolList that is the complement
633
634   * @throws IllegalAlphabetException if list is not a complementable alphabet
635
636   */
637
638  public static SymbolList reverseComplement(SymbolList list)
639
640  throws IllegalAlphabetException {
641
642    return SymbolListViews.translate(SymbolListViews.reverse(list), complementTable());
643
644  }
645
646
647
648  /**
649
650   * Get a translation table for complementing Nucleotide symbols.
651
652   *
653
654   * @since 1.1
655
656   */
657
658
659
660  public static ReversibleTranslationTable complementTable() {
661
662    return complementTable;
663
664  }
665
666
667
668    /**
669
670     * Get a single-character token for a Nucleotide symbol
671
672     *
673
674     * @throws IllegalSymbolException if <code>sym</code> is not a member of the Nucleotide alphabet
675
676     */
677
678
679
680    public static char nucleotideToken(Symbol sym)
681
682        throws IllegalSymbolException
683
684    {
685
686        return nucleotideTokens.tokenizeSymbol(sym).charAt(0);
687
688    }
689
690
691
692  /**
693
694   * Sneaky class for complementing Nucleotide bases.
695
696   */
697
698
699
700  private static class NucleotideComplementTranslationTable
701
702  implements ReversibleTranslationTable {
703
704    public Symbol translate(Symbol s)
705
706          throws IllegalSymbolException {
707
708            return NucleotideTools.complement(s);
709
710          }
711
712
713
714    public Symbol untranslate(Symbol s)
715
716          throws IllegalSymbolException {
717
718            return NucleotideTools.complement(s);
719
720          }
721
722
723
724          public Alphabet getSourceAlphabet() {
725
726            return NucleotideTools.getNucleotide();
727
728          }
729
730
731
732          public Alphabet getTargetAlphabet() {
733
734            return NucleotideTools.getNucleotide();
735
736          }
737
738  }
739
740}
741
742
743