001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022package org.biojava.nbio.aaproperties.xml;
023
024import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
025import org.biojava.nbio.core.sequence.template.CompoundSet;
026import org.biojava.nbio.core.sequence.template.Sequence;
027
028import java.util.*;
029
030/**
031 * Set of proteinogenic amino acids.  Molecular weights are recorded in daltons (Da) as residues of a chain; monomers
032 * outside of a chain would likely have an additional mass of 18.01524 Da contributed by an associated water molecule.
033 *
034 * Currently we have different symbols to handle inserts so not as clean as it should be
035 *
036 * @author Richard Holland
037 * @author Scooter Willis
038 * @author Mark Chapman
039 */
040public class CaseFreeAminoAcidCompoundSet implements CompoundSet<AminoAcidCompound> {
041
042        private final Map<String, AminoAcidCompound> aminoAcidCompoundCache = new HashMap<String, AminoAcidCompound>();
043        private final Map<AminoAcidCompound, Set<AminoAcidCompound>> equivalentsCache =
044                        new HashMap<AminoAcidCompound, Set<AminoAcidCompound>>();
045
046        public CaseFreeAminoAcidCompoundSet() {
047                aminoAcidCompoundCache.put("A", new AminoAcidCompound(null, "A", "Ala", "Alanine", 71.0788f));
048                aminoAcidCompoundCache.put("R", new AminoAcidCompound(null, "R", "Arg", "Arginine", 156.1875f));
049                aminoAcidCompoundCache.put("N", new AminoAcidCompound(null, "N", "Asn", "Asparagine", 114.1039f));
050                aminoAcidCompoundCache.put("D", new AminoAcidCompound(null, "D", "Asp", "Aspartic acid", 115.0886f));
051                aminoAcidCompoundCache.put("C", new AminoAcidCompound(null, "C", "Cys", "Cysteine", 103.1388f));
052                aminoAcidCompoundCache.put("E", new AminoAcidCompound(null, "E", "Glu", "Glutamic acid", 129.1155f));
053                aminoAcidCompoundCache.put("Q", new AminoAcidCompound(null, "Q", "Gln", "Glutamine", 128.1307f));
054                aminoAcidCompoundCache.put("G", new AminoAcidCompound(null, "G", "Gly", "Glycine", 57.0519f));
055                aminoAcidCompoundCache.put("H", new AminoAcidCompound(null, "H", "His", "Histidine", 137.1411f));
056                aminoAcidCompoundCache.put("I", new AminoAcidCompound(null, "I", "Ile", "Isoleucine", 113.1594f));
057                aminoAcidCompoundCache.put("L", new AminoAcidCompound(null, "L", "Leu", "Leucine", 113.1594f));
058                aminoAcidCompoundCache.put("K", new AminoAcidCompound(null, "K", "Lys", "Lysine", 128.1741f));
059                aminoAcidCompoundCache.put("M", new AminoAcidCompound(null, "M", "Met", "Methionine", 131.1986f));
060                aminoAcidCompoundCache.put("F", new AminoAcidCompound(null, "F", "Phe", "Phenylalanine", 147.1766f));
061                aminoAcidCompoundCache.put("P", new AminoAcidCompound(null, "P", "Pro", "Proline", 97.1167f));
062                aminoAcidCompoundCache.put("S", new AminoAcidCompound(null, "S", "Ser", "Serine", 87.0782f));
063                aminoAcidCompoundCache.put("T", new AminoAcidCompound(null, "T", "Thr", "Threonine", 101.1051f));
064                aminoAcidCompoundCache.put("W", new AminoAcidCompound(null, "W", "Trp", "Tryptophan", 186.2132f));
065                aminoAcidCompoundCache.put("Y", new AminoAcidCompound(null, "Y", "Tyr", "Tyrosine", 163.1760f));
066                aminoAcidCompoundCache.put("V", new AminoAcidCompound(null, "V", "Val", "Valine", 99.1326f));
067                aminoAcidCompoundCache.put("B", new AminoAcidCompound(null, "B", "Asx", "Asparagine or Aspartic acid", null));
068                aminoAcidCompoundCache.put("Z", new AminoAcidCompound(null, "Z", "Glx", "Glutamine or Glutamic acid", null));
069                aminoAcidCompoundCache.put("J", new AminoAcidCompound(null, "J", "Xle", "Leucine or Isoleucine", null));
070                aminoAcidCompoundCache.put("X", new AminoAcidCompound(null, "X", "Xaa", "Unspecified", null));
071                aminoAcidCompoundCache.put("-", new AminoAcidCompound(null, "-", "---", "Unspecified", null));
072                aminoAcidCompoundCache.put(".", new AminoAcidCompound(null, ".", "...", "Unspecified", null));
073                aminoAcidCompoundCache.put("_", new AminoAcidCompound(null, "_", "___", "Unspecified", null));
074                aminoAcidCompoundCache.put("*", new AminoAcidCompound(null, "*", "***", "Stop", null));
075
076                //Selenocysteine - this is encoded by UGA with the presence
077                //of a SECIS element (SElenoCysteine Insertion Sequence) in the mRNA
078                //and is a post-translation modification
079                aminoAcidCompoundCache.put("U", new AminoAcidCompound(null, "U", "Sec", "Selenocysteine", 150.0388f));
080
081                //Pyrrolysine is encoded by UAG in mRNA (normally Amber stop codon) which is translated to
082                //this amino acid under the presence of pylT which creates an anti-codon CUA & pylS
083                //which then does the actual conversion to Pyl.
084                aminoAcidCompoundCache.put("O", new AminoAcidCompound(null, "O", "Pyl", "Pyrrolysine", 255.3172f));
085
086                Map<String, AminoAcidCompound> lowerCaseSet = new HashMap<String, AminoAcidCompound>();
087                for(String s:this.aminoAcidCompoundCache.keySet()){
088                        lowerCaseSet.put(s.toLowerCase(), this.aminoAcidCompoundCache.get(s));
089                }
090                this.aminoAcidCompoundCache.putAll(lowerCaseSet);
091        }
092
093        @Override
094        public String getStringForCompound(AminoAcidCompound compound) {
095                return compound.toString();
096        }
097
098        @Override
099        public AminoAcidCompound getCompoundForString(String string) {
100                if (string.length() == 0) {
101                        return null;
102                }
103                if (string.length() > this.getMaxSingleCompoundStringLength()) {
104                        throw new IllegalArgumentException("String supplied ("+string+") is too long. Max is "+getMaxSingleCompoundStringLength());
105                }
106                return this.aminoAcidCompoundCache.get(string);
107        }
108
109        @Override
110        public int getMaxSingleCompoundStringLength() {
111                return 1;
112        }
113
114
115        @Override
116        public boolean isCompoundStringLengthEqual() {
117                return true;
118        }
119
120        private final static CaseFreeAminoAcidCompoundSet aminoAcidCompoundSet = new CaseFreeAminoAcidCompoundSet();
121
122        public static CaseFreeAminoAcidCompoundSet getAminoAcidCompoundSet() {
123                return aminoAcidCompoundSet;
124        }
125
126        @Override
127        public boolean compoundsEquivalent(AminoAcidCompound compoundOne, AminoAcidCompound compoundTwo) {
128                Set<AminoAcidCompound> equivalents = getEquivalentCompounds(compoundOne);
129                return (equivalents != null) && equivalents.contains(compoundTwo);
130        }
131
132        @Override
133        public Set<AminoAcidCompound> getEquivalentCompounds(AminoAcidCompound compound) {
134                if (equivalentsCache.isEmpty()) {
135                        // most compounds are equivalent to themselves alone
136                        for (AminoAcidCompound c : aminoAcidCompoundCache.values()) {
137                                equivalentsCache.put(c, Collections.singleton(c));
138                        }
139                        // ambiguous Asparagine or Aspartic acid
140                        addAmbiguousEquivalents("N", "D", "B");
141                        // ambiguous Glutamine or Glutamic acid
142                        addAmbiguousEquivalents("E", "Q", "Z");
143                        // ambiguous Leucine or Isoleucine
144                        addAmbiguousEquivalents("I", "L", "J");
145                        // ambiguous gaps
146                        AminoAcidCompound gap1, gap2, gap3;
147                        Set<AminoAcidCompound> gaps = new HashSet<AminoAcidCompound>();
148                        gaps.add(gap1 = aminoAcidCompoundCache.get("-"));
149                        gaps.add(gap2 = aminoAcidCompoundCache.get("."));
150                        gaps.add(gap3 = aminoAcidCompoundCache.get("_"));
151                        equivalentsCache.put(gap1, gaps);
152                        equivalentsCache.put(gap2, gaps);
153                        equivalentsCache.put(gap3, gaps);
154                        // X is never equivalent, even to itself
155                        equivalentsCache.put(aminoAcidCompoundCache.get("X"), new HashSet<AminoAcidCompound>());
156                }
157                return equivalentsCache.get(compound);
158        }
159
160        // helper method to initialize the equivalent sets for 2 amino acid compounds and their ambiguity compound
161        private void addAmbiguousEquivalents(String one, String two, String either) {
162                Set<AminoAcidCompound> equivalents;
163                AminoAcidCompound cOne, cTwo, cEither;
164
165                equivalents = new HashSet<AminoAcidCompound>();
166                equivalents.add(cOne = aminoAcidCompoundCache.get(one));
167                equivalents.add(cTwo = aminoAcidCompoundCache.get(two));
168                equivalents.add(cEither = aminoAcidCompoundCache.get(either));
169                equivalentsCache.put(cEither, equivalents);
170
171                equivalents = new HashSet<AminoAcidCompound>();
172                equivalents.add(cOne);
173                equivalents.add(cEither);
174                equivalentsCache.put(cOne, equivalents);
175
176                equivalents = new HashSet<AminoAcidCompound>();
177                equivalents.add(cTwo);
178                equivalents.add(cEither);
179                equivalentsCache.put(cTwo, equivalents);
180        }
181
182        @Override
183        public boolean hasCompound(AminoAcidCompound compound) {
184                return aminoAcidCompoundCache.containsValue(compound);
185        }
186
187        @Override
188        public List<AminoAcidCompound> getAllCompounds() {
189                return new ArrayList<AminoAcidCompound>(aminoAcidCompoundCache.values());
190        }
191
192
193        @Override
194        public boolean isComplementable() {
195                return false;
196        }
197
198        @Override
199        public boolean isValidSequence(Sequence<AminoAcidCompound> sequence) {
200                for (AminoAcidCompound c: sequence) {
201                        if (!hasCompound(c)) {
202                                return false;
203                        }
204                }
205                return true;
206        }
207}