001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.seq;
023
024import java.io.InputStream;
025import java.util.HashMap;
026import java.util.Iterator;
027import java.util.Map;
028import java.util.MissingResourceException;
029
030import javax.xml.parsers.DocumentBuilder;
031import javax.xml.parsers.DocumentBuilderFactory;
032
033import org.biojava.bio.BioError;
034import org.biojava.bio.BioException;
035import org.biojava.bio.SimpleAnnotation;
036import org.biojava.bio.seq.impl.SimpleGappedSequence;
037import org.biojava.bio.seq.impl.SimpleSequenceFactory;
038import org.biojava.bio.seq.io.SymbolTokenization;
039import org.biojava.bio.symbol.AlphabetManager;
040import org.biojava.bio.symbol.AtomicSymbol;
041import org.biojava.bio.symbol.FiniteAlphabet;
042import org.biojava.bio.symbol.IllegalSymbolException;
043import org.biojava.bio.symbol.SimpleSymbolList;
044import org.biojava.bio.symbol.SimpleSymbolPropertyTable;
045import org.biojava.bio.symbol.Symbol;
046import org.biojava.bio.symbol.SymbolList;
047import org.biojava.bio.symbol.SymbolPropertyTable;
048import org.biojava.utils.ClassTools;
049import org.w3c.dom.Document;
050import org.w3c.dom.Element;
051import org.w3c.dom.Node;
052import org.w3c.dom.NodeList;
053import org.xml.sax.InputSource;
054
055/**
056 * The central port-of-call for all information and functionality specific to
057 * SymbolLists over the protein alphabet.
058 *
059 * @author Matthew Pocock
060 * @author Greg Cox
061 * @author Thomas Down
062 * @author MarkSchreiber
063 * @author Jonathan Warren
064 * @author gwaldon (pyrrolysine, pKs)
065 */
066public class ProteinTools {
067    private static final FiniteAlphabet proteinAlpha;
068    private static final FiniteAlphabet proteinTAlpha;
069
070    private static final Map tokenToSymbol = new HashMap();
071
072    private static final Map propertyTableMap = new HashMap();
073
074    static {
075        try {
076            proteinAlpha = (FiniteAlphabet) AlphabetManager.alphabetForName("PROTEIN");
077            proteinTAlpha = (FiniteAlphabet) AlphabetManager.alphabetForName("PROTEIN-TERM");
078            SymbolTokenization st = proteinTAlpha.getTokenization("token");
079            for (Iterator i = proteinTAlpha.iterator(); i.hasNext(); ) {
080              AtomicSymbol s = (AtomicSymbol)i.next();
081              tokenToSymbol.put(st.tokenizeSymbol(s), s);
082            }
083
084        } catch (Exception e) {
085            throw new BioError(" Could not initialize ProteinTools", e);
086        }
087    }
088
089
090    static {
091
092        Document doc = null;
093     /*   try {
094            URL proteaseManagerURL = ProteinTools.class.getClassLoader().getResource(
095            "org/biojava/bio/symbol/ResidueProperties.xml"
096            );
097            //If I try and do this here on compile it says "An exception can't be thrown by an initializer"
098            InputSource is = Resolver.createInputSource(proteaseManagerURL, true);
099            doc = XmlDocument.createXmlDocument(is, true);*/
100
101      try {
102          InputStream tablesStream = ClassTools.getClassLoader(ProteinTools.class).getResourceAsStream(
103            "org/biojava/bio/symbol/ResidueProperties.xml"
104          );
105          if(tablesStream == null ) {
106            throw new BioError("Couldn't locate ResidueProperties.xml.");
107          }
108
109          InputSource is = new InputSource(tablesStream);
110          DocumentBuilder parser = DocumentBuilderFactory.newInstance().newDocumentBuilder();
111          doc = parser.parse(is);
112        }catch (MissingResourceException mre) {
113            System.err.println(mre.getMessage());
114        }catch(Exception e){//err
115            e.printStackTrace();
116        }
117
118        try {
119            SimpleSymbolPropertyTable monoMassPropertyTable = new SimpleSymbolPropertyTable(
120            getAlphabet(),
121            SymbolPropertyTable.MONO_MASS
122            );
123
124            SimpleSymbolPropertyTable avgMassPropertyTable = new SimpleSymbolPropertyTable(
125            getAlphabet(),
126            SymbolPropertyTable.AVG_MASS
127            );
128
129            SimpleSymbolPropertyTable pK_NtermPropertyTable = new SimpleSymbolPropertyTable(
130            getAlphabet(),
131            SymbolPropertyTable.PK_Nterm
132            );
133            
134            SimpleSymbolPropertyTable pKPropertyTable = new SimpleSymbolPropertyTable(
135            getAlphabet(),
136            SymbolPropertyTable.PK
137            );
138
139            SimpleSymbolPropertyTable pK_CtermPropertyTable = new SimpleSymbolPropertyTable(
140            getAlphabet(),
141            SymbolPropertyTable.PK_Cterm
142            );
143            
144            SimpleSymbolPropertyTable HydropathicityTable = new SimpleSymbolPropertyTable(
145            getAlphabet(),
146            SymbolPropertyTable.HYDROPATHICITY
147            );
148            
149            SymbolTokenization tokens = getAlphabet().getTokenization("token");
150
151            NodeList children = doc.getDocumentElement().getChildNodes();
152            for(int i = 0; i < children.getLength(); i++) {
153                Node cnode = (Node) children.item(i);
154                if(! (cnode instanceof Element)) {
155                    continue;
156                }
157                Element child = (Element) cnode;
158                if(child.getNodeName().equals("residue")) {
159                    String token = child.getAttribute("token");
160                    Symbol s = tokens.parseToken(token);
161
162                    NodeList properyNodes = child.getChildNodes();
163                    for(int j = 0; j < properyNodes.getLength(); j++) {
164                        cnode = (Node) properyNodes.item(j);
165                        if(! (cnode instanceof Element)) {
166                            continue;
167                        }
168                        Element el = (Element) cnode;
169                        String name = el.getAttribute("name");
170                        if(name.equals(SymbolPropertyTable.MONO_MASS)) {
171                            String value = el.getAttribute("value");
172                            monoMassPropertyTable.setDoubleProperty(s, value);
173                        } else if (name.equals(SymbolPropertyTable.AVG_MASS)) {
174                            String value = el.getAttribute("value");
175                            avgMassPropertyTable.setDoubleProperty(s, value);
176                        } else if (name.equals(SymbolPropertyTable.PK_Nterm)) {
177                            String value = el.getAttribute("value");
178                            pK_NtermPropertyTable.setDoubleProperty(s, value);
179                        } else if (name.equals(SymbolPropertyTable.PK)) {
180                            String value = el.getAttribute("value");
181                            pKPropertyTable.setDoubleProperty(s, value);
182                        } else if (name.equals(SymbolPropertyTable.PK_Cterm)) {
183                            String value = el.getAttribute("value");
184                            pK_CtermPropertyTable.setDoubleProperty(s, value);
185                        }else if (name.equals(SymbolPropertyTable.HYDROPATHICITY)) {
186                            String value = el.getAttribute("value");
187                            HydropathicityTable.setDoubleProperty(s, value);
188                        }
189                    }
190                }
191            }
192
193            propertyTableMap.put(SymbolPropertyTable.MONO_MASS, (SymbolPropertyTable) monoMassPropertyTable);
194            propertyTableMap.put(SymbolPropertyTable.AVG_MASS, (SymbolPropertyTable) avgMassPropertyTable);
195            propertyTableMap.put(SymbolPropertyTable.PK_Nterm, (SymbolPropertyTable) pK_NtermPropertyTable);
196            propertyTableMap.put(SymbolPropertyTable.PK, (SymbolPropertyTable) pKPropertyTable);
197            propertyTableMap.put(SymbolPropertyTable.PK_Cterm, (SymbolPropertyTable) pK_CtermPropertyTable);
198            propertyTableMap.put(SymbolPropertyTable.HYDROPATHICITY, (SymbolPropertyTable) HydropathicityTable);
199        } catch (Exception e) {
200            throw new BioError(" Could not initialize ProteinTools", e);
201        }
202    }
203    
204    private ProteinTools() {
205    }
206    
207    /**
208     *Gets the protein alphabet
209     */
210    public static final FiniteAlphabet getAlphabet() {
211        return proteinAlpha;
212    }
213
214    /**
215     *Gets the protein alphabet including the translation termination symbols
216     */
217    public static final FiniteAlphabet getTAlphabet() {
218        return proteinTAlpha;
219    }
220
221    public static final SymbolPropertyTable getSymbolPropertyTable(String name)
222    {
223        return (SymbolPropertyTable)propertyTableMap.get(name);
224    }
225
226  /**
227   * Return a new Protein <span class="type">SymbolList</span> for <span
228   * class="arg">protein</span>.
229   *
230   * @param theProtein a <span class="type">String</span> to parse into Protein
231   * @return a <span class="type">SymbolList</span> created form <span
232   *         class="arg">Protein</span>
233   * @throws IllegalSymbolException if  <span class="arg">dna</span> contains
234   *                                any non-Amino Acid characters.
235   */
236  public static SymbolList createProtein(String theProtein)
237          throws IllegalSymbolException
238  {
239    SymbolTokenization p = null;
240    try {
241      p = getTAlphabet().getTokenization("token");
242    } catch (BioException e) {
243      throw new BioError("Something has gone badly wrong with Protein", e);
244    }
245    return new SimpleSymbolList(p, theProtein);
246  }
247
248    /** Get a new protein as a GappedSequence */
249    public static GappedSequence createGappedProteinSequence(String theProtein, String name) throws IllegalSymbolException{
250        String theProtein1 = theProtein.replaceAll("-", "");
251        Sequence protein = createProteinSequence(theProtein1, name);
252        GappedSequence protein1 = new SimpleGappedSequence(protein);
253        int pos = theProtein.indexOf('-', 0);
254        while(pos!=-1){
255            protein1.addGapInView(pos+1);
256            pos = theProtein.indexOf('-', pos+1);
257        }
258        return protein1;
259    }
260
261  /**
262   * Return a new PROTEIN <span class="type">Sequence</span> for
263   * <span class="arg">protein</span>.
264   *
265   * @param protein a <span class="type">String</span> to parse into PROTEIN
266   * @param name a <span class="type">String</span> to use as the name
267   * @return a <span class="type">Sequence</span> created form
268   *         <span class="arg">protein</span>
269   * @throws IllegalSymbolException if <span class="arg">protein</span> contains
270   *         any non-PROTEIN characters
271   */
272  public static Sequence createProteinSequence(String protein, String name)
273  throws IllegalSymbolException {
274    try {
275      return new SimpleSequenceFactory().createSequence(
276        createProtein(protein),
277        "", name, new SimpleAnnotation()
278      );
279    } catch (BioException se) {
280      throw new BioError("Something has gone badly wrong with ProteinTAlpha", se);
281    }
282  }
283
284  /**
285   * Returns the <code>AtomicSymbol</code> for the amino acid Alanine
286   * (A)
287   */
288  public static AtomicSymbol ala() {
289    return (AtomicSymbol) tokenToSymbol.get("A");
290  }
291
292  /**
293   * Returns the <code>AtomicSymbol</code> for the amino acid
294   * Alanine
295   */
296  public static AtomicSymbol a() {
297    return ala();
298  }
299
300  /**
301   * Returns the <code>AtomicSymbol</code> for the amino acid
302   * Arginine (R)
303   */
304  public static AtomicSymbol arg() {
305    return (AtomicSymbol) tokenToSymbol.get("R");
306  }
307
308  /**
309   * Returns the <code>AtomicSymbol</code> for the amino acid
310   * Arginine
311   */
312  public static AtomicSymbol r() {
313    return arg();
314  }
315
316  /**
317   * Returns the <code>AtomicSymbol</code> for the amino acid
318   * Asparagine (N)
319   */
320  public static AtomicSymbol asn() {
321    return (AtomicSymbol) tokenToSymbol.get("N");
322  }
323
324  /**
325   * Returns the <code>AtomicSymbol</code> for the amino acid
326   * Asparagine
327   */
328  public static AtomicSymbol n() {
329    return asn();
330  }
331
332  /**
333   * Returns the <code>AtomicSymbol</code> for the amino acid
334   * Aspartic Acid (D)
335   */
336  public static AtomicSymbol asp() {
337    return (AtomicSymbol) tokenToSymbol.get("D");
338  }
339
340  /**
341   * Returns the <code>AtomicSymbol</code> for the amino acid
342   * Aspartic Acid
343   */
344  public static AtomicSymbol d() {
345    return asp();
346  }
347
348  /**
349   * Returns the <code>AtomicSymbol</code> for the amino acid
350   * Cysteine (C)
351   */
352  public static AtomicSymbol cys() {
353    return (AtomicSymbol) tokenToSymbol.get("C");
354  }
355
356  /**
357   * Returns the <code>AtomicSymbol</code> for the amino acid
358   * Cysteine
359   */
360  public static AtomicSymbol c() {
361    return cys();
362  }
363
364  /**
365   * Returns the <code>AtomicSymbol</code> for the amino acid
366   * Glutamine (Q)
367   */
368  public static AtomicSymbol gln() {
369    return (AtomicSymbol) tokenToSymbol.get("Q");
370  }
371
372  /**
373   * Returns the <code>AtomicSymbol</code> for the amino acid
374   * Glutamine
375   */
376  public static AtomicSymbol q() {
377    return gln();
378  }
379
380  /**
381   * Returns the <code>AtomicSymbol</code> for the amino acid
382   * Glutamic Acid (E)
383   */
384  public static AtomicSymbol glu() {
385    return (AtomicSymbol) tokenToSymbol.get("E");
386  }
387
388  /**
389   * Returns the <code>AtomicSymbol</code> for the amino acid
390   * Glutamic Acid
391   */
392  public static AtomicSymbol e() {
393    return glu();
394  }
395
396  /**
397   * Returns the <code>AtomicSymbol</code> for the amino acid
398   * Glycine (G)
399   */
400  public static AtomicSymbol gly() {
401    return (AtomicSymbol) tokenToSymbol.get("G");
402  }
403
404  /**
405   * Returns the <code>AtomicSymbol</code> for the amino acid
406   * Glycine
407   */
408  public static AtomicSymbol g() {
409    return gly();
410  }
411
412  /**
413   * Returns the <code>AtomicSymbol</code> for the amino acid
414   * Histidine (H)
415   */
416  public static AtomicSymbol his() {
417    return (AtomicSymbol) tokenToSymbol.get("H");
418  }
419
420  /**
421   * Returns the <code>AtomicSymbol</code> for the amino acid
422   * Histidine
423   */
424  public static AtomicSymbol h() {
425    return his();
426  }
427
428  /**
429   * Returns the <code>AtomicSymbol</code> for the amino acid
430   * Isoleucine (I)
431   */
432  public static AtomicSymbol ile() {
433    return (AtomicSymbol) tokenToSymbol.get("I");
434  }
435
436  /**
437   * Returns the <code>AtomicSymbol</code> for the amino acid
438   * Isoleucine
439   */
440  public static AtomicSymbol i() {
441    return ile();
442  }
443
444  /**
445   * Returns the <code>AtomicSymbol</code> for the amino acid
446   * Leucine (L)
447   */
448  public static AtomicSymbol leu() {
449    return (AtomicSymbol) tokenToSymbol.get("L");
450  }
451
452  /**
453   * Returns the <code>AtomicSymbol</code> for the amino acid
454   * Leucine
455   */
456  public static AtomicSymbol l() {
457    return leu();
458  }
459
460  /**
461   * Returns the <code>AtomicSymbol</code> for the amino acid
462   * Lysine (K)
463   */
464  public static AtomicSymbol lys() {
465    return (AtomicSymbol) tokenToSymbol.get("K");
466  }
467
468  /**
469   * Returns the <code>AtomicSymbol</code> for the amino acid
470   * Lysine
471   */
472  public static AtomicSymbol k() {
473    return lys();
474  }
475
476  /**
477   * Returns the <code>AtomicSymbol</code> for the amino acid
478   * Methionine (M)
479   */
480  public static AtomicSymbol met() {
481    return (AtomicSymbol) tokenToSymbol.get("M");
482  }
483
484  /**
485   * Returns the <code>AtomicSymbol</code> for the amino acid
486   * Methionine
487   */
488  public static AtomicSymbol m() {
489    return met();
490  }
491
492  /**
493   * Returns the <code>AtomicSymbol</code> for the amino acid
494   * Phenylalanine (F)
495   */
496  public static AtomicSymbol phe() {
497    return (AtomicSymbol) tokenToSymbol.get("F");
498  }
499
500  /**
501   * Returns the <code>AtomicSymbol</code> for the amino acid
502   * Phenylalanine
503   */
504  public static AtomicSymbol f() {
505    return phe();
506  }
507
508  /**
509   * Returns the <code>AtomicSymbol</code> for the amino acid
510   * Proline (P)
511   */
512  public static AtomicSymbol pro() {
513    return (AtomicSymbol) tokenToSymbol.get("P");
514  }
515
516  /**
517   * Returns the <code>AtomicSymbol</code> for the amino acid
518   * Proline
519   */
520  public static AtomicSymbol p() {
521    return pro();
522  }
523
524  /**
525   * Returns the <code>AtomicSymbol</code> for the amino acid
526   * Pyrrolysine (O)
527   */
528  public static AtomicSymbol pyl() {
529    return (AtomicSymbol) tokenToSymbol.get("O");
530  }
531
532  /**
533   * Returns the <code>AtomicSymbol</code> for the amino acid
534   * Pyrrolysine
535   */
536  public static AtomicSymbol o() {
537    return pyl();
538  }
539
540  /**
541   * Returns the <code>AtomicSymbol</code> for the amino acid
542   * Selenocysteine (U)
543   */
544  public static AtomicSymbol sec() {
545    return (AtomicSymbol) tokenToSymbol.get("U");
546  }
547
548  /**
549   * Returns the <code>AtomicSymbol</code> for the amino acid
550   * Selenocysteine
551   */
552   public static AtomicSymbol u(){
553     return sec();
554   }
555   
556  /**
557   * Returns the <code>AtomicSymbol</code> for the amino acid
558   * Serine (S)
559   */
560  public static AtomicSymbol ser() {
561    return (AtomicSymbol) tokenToSymbol.get("S");
562  }
563
564  /**
565   * Returns the <code>AtomicSymbol</code> for the amino acid
566   * Serine
567   */
568  public static AtomicSymbol s() {
569    return ser();
570  }
571
572  /**
573   * Returns the <code>AtomicSymbol</code> for the amino acid
574   * Threonine (T)
575   */
576  public static AtomicSymbol thr() {
577    return (AtomicSymbol) tokenToSymbol.get("T");
578  }
579
580  /**
581   * Returns the <code>AtomicSymbol</code> for the amino acid
582   * Threonine
583   */
584  public static AtomicSymbol t() {
585    return thr();
586  }
587
588  /**
589   * Returns the <code>AtomicSymbol</code> for the amino acid
590   * Tryptophan (W)
591   */
592  public static AtomicSymbol trp() {
593    return (AtomicSymbol) tokenToSymbol.get("W");
594  }
595
596  /**
597   * Returns the <code>AtomicSymbol</code> for the amino acid
598   * Tryptophan
599   */
600  public static AtomicSymbol w() {
601    return trp();
602  }
603
604  /**
605   * Returns the <code>AtomicSymbol</code> for the amino acid
606   * Tyrosine (Y)
607   */
608  public static AtomicSymbol tyr() {
609    return (AtomicSymbol) tokenToSymbol.get("Y");
610  }
611
612  /**
613   * Returns the <code>AtomicSymbol</code> for the amino acid
614   * Tyrosine
615   */
616  public static AtomicSymbol y() {
617    return tyr();
618  }
619
620  /**
621   * Returns the <code>AtomicSymbol</code> for the amino acid Valine (V)
622   */
623  public static AtomicSymbol val() {
624    return (AtomicSymbol) tokenToSymbol.get("V");
625  }
626
627  /**
628   * Returns the <code>AtomicSymbol</code> for the amino acid
629   * Valine
630   */
631  public static AtomicSymbol v() {
632    return val();
633  }
634
635
636   /**
637    * Returns the <code>AtomicSymbol</code> for the termination (*)
638    * placeholder
639    */
640   public static AtomicSymbol ter() {
641     return (AtomicSymbol) tokenToSymbol.get("*");
642   }
643
644}