001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022
023package org.biojava.nbio.core.sequence.compound;
024
025import org.biojava.nbio.core.sequence.template.CompoundSet;
026import org.biojava.nbio.core.sequence.template.Sequence;
027
028import java.io.Serializable;
029import java.util.*;
030
031/**
032 * Set of proteinogenic amino acids.  Molecular weights are recorded in daltons (Da) as residues of a chain; monomers
033 * outside of a chain would likely have an additional mass of 18.01524 Da contributed by an associated water molecule.
034 *
 * Currently several different symbols are used to handle inserts (gaps), so this set is not as clean as it should be.
036 *
037 * @author Richard Holland
038 * @author Scooter Willis
039 * @author Mark Chapman
040 */
041public class AminoAcidCompoundSet implements CompoundSet<AminoAcidCompound>, Serializable {
042
043        /**
044         *
045         */
046        private static final long serialVersionUID = 4000344194364133456L;
047        private final Map<String, AminoAcidCompound> aminoAcidCompoundCache = new HashMap<>();
048        private final Map<String, AminoAcidCompound> aminoAcidCompoundCache3Letter = new HashMap<>();
049
050        private final Map<AminoAcidCompound, Set<AminoAcidCompound>> equivalentsCache =
051                        new HashMap<>();
052
053        public AminoAcidCompoundSet() {
054                aminoAcidCompoundCache.put("A", new AminoAcidCompound(this, "A", "Ala", "Alanine", 71.0788f));
055                aminoAcidCompoundCache.put("R", new AminoAcidCompound(this, "R", "Arg", "Arginine", 156.1875f));
056                aminoAcidCompoundCache.put("N", new AminoAcidCompound(this, "N", "Asn", "Asparagine", 114.1039f));
057                aminoAcidCompoundCache.put("D", new AminoAcidCompound(this, "D", "Asp", "Aspartic acid", 115.0886f));
058                aminoAcidCompoundCache.put("C", new AminoAcidCompound(this, "C", "Cys", "Cysteine", 103.1388f));
059                aminoAcidCompoundCache.put("E", new AminoAcidCompound(this, "E", "Glu", "Glutamic acid", 129.1155f));
060                aminoAcidCompoundCache.put("Q", new AminoAcidCompound(this, "Q", "Gln", "Glutamine", 128.1307f));
061                aminoAcidCompoundCache.put("G", new AminoAcidCompound(this, "G", "Gly", "Glycine", 57.0519f));
062                aminoAcidCompoundCache.put("H", new AminoAcidCompound(this, "H", "His", "Histidine", 137.1411f));
063                aminoAcidCompoundCache.put("I", new AminoAcidCompound(this, "I", "Ile", "Isoleucine", 113.1594f));
064                aminoAcidCompoundCache.put("L", new AminoAcidCompound(this, "L", "Leu", "Leucine", 113.1594f));
065                aminoAcidCompoundCache.put("K", new AminoAcidCompound(this, "K", "Lys", "Lysine", 128.1741f));
066                aminoAcidCompoundCache.put("M", new AminoAcidCompound(this, "M", "Met", "Methionine", 131.1986f));
067                aminoAcidCompoundCache.put("F", new AminoAcidCompound(this, "F", "Phe", "Phenylalanine", 147.1766f));
068                aminoAcidCompoundCache.put("P", new AminoAcidCompound(this, "P", "Pro", "Proline", 97.1167f));
069                aminoAcidCompoundCache.put("S", new AminoAcidCompound(this, "S", "Ser", "Serine", 87.0782f));
070                aminoAcidCompoundCache.put("T", new AminoAcidCompound(this, "T", "Thr", "Threonine", 101.1051f));
071                aminoAcidCompoundCache.put("W", new AminoAcidCompound(this, "W", "Trp", "Tryptophan", 186.2132f));
072                aminoAcidCompoundCache.put("Y", new AminoAcidCompound(this, "Y", "Tyr", "Tyrosine", 163.1760f));
073                aminoAcidCompoundCache.put("V", new AminoAcidCompound(this, "V", "Val", "Valine", 99.1326f));
074                aminoAcidCompoundCache.put("B", new AminoAcidCompound(this, "B", "Asx", "Asparagine or Aspartic acid", null));
075                aminoAcidCompoundCache.put("Z", new AminoAcidCompound(this, "Z", "Glx", "Glutamine or Glutamic acid", null));
076                aminoAcidCompoundCache.put("J", new AminoAcidCompound(this, "J", "Xle", "Leucine or Isoleucine", null));
077                aminoAcidCompoundCache.put("X", new AminoAcidCompound(this, "X", "Xaa", "Unspecified", null));
078                aminoAcidCompoundCache.put("-", new AminoAcidCompound(this, "-", "---", "Unspecified", null));
079                aminoAcidCompoundCache.put(".", new AminoAcidCompound(this, ".", "...", "Unspecified", null));
080                aminoAcidCompoundCache.put("_", new AminoAcidCompound(this, "_", "___", "Unspecified", null));
081                aminoAcidCompoundCache.put("*", new AminoAcidCompound(this, "*", "***", "Stop", null));
082
083                //Selenocysteine - this is encoded by UGA with the presence
084                //of a SECIS element (SElenoCysteine Insertion Sequence) in the mRNA
085                //and is a post-translation modification
086                aminoAcidCompoundCache.put("U", new AminoAcidCompound(this, "U", "Sec", "Selenocysteine", 150.0388f));
087
088                //Pyrrolysine is encoded by UAG in mRNA (normally Amber stop codon) which is translated to
089                //this amino acid under the presence of pylT which creates an anti-codon CUA & pylS
090                //which then does the actual conversion to Pyl.
091                aminoAcidCompoundCache.put("O", new AminoAcidCompound(this, "O", "Pyl", "Pyrrolysine", 255.3172f));
092
093                for(String oneLtr : aminoAcidCompoundCache.keySet()) {
094                        AminoAcidCompound aa = aminoAcidCompoundCache.get(oneLtr);
095                        String threeLtr = aa.getLongName().toUpperCase();
096                        aminoAcidCompoundCache3Letter.put(threeLtr, aa);
097                }
098        }
099
100        @Override
101        public String getStringForCompound(AminoAcidCompound compound) {
102                return compound.toString();
103        }
104
105        @Override
106        public AminoAcidCompound getCompoundForString(String string) {
107                if (string.length() == 0) {
108                        return null;
109                }
110                if (string.length() == 3) {
111                        return this.aminoAcidCompoundCache3Letter.get(string.toUpperCase());
112                }
113                if (string.length() > this.getMaxSingleCompoundStringLength()) {
114                        throw new IllegalArgumentException("String supplied ("+string+") is too long. Max is "+getMaxSingleCompoundStringLength());
115                }
116                return this.aminoAcidCompoundCache.get(string.toUpperCase());
117        }
118
119        @Override
120        public int getMaxSingleCompoundStringLength() {
121                return 1;
122        }
123
124
125        @Override
126        public boolean isCompoundStringLengthEqual() {
127                return true;
128        }
129
130        private final static AminoAcidCompoundSet aminoAcidCompoundSet = new AminoAcidCompoundSet();
131
132        public static AminoAcidCompoundSet getAminoAcidCompoundSet() {
133                return aminoAcidCompoundSet;
134        }
135
136        @Override
137        public boolean compoundsEquivalent(AminoAcidCompound compoundOne, AminoAcidCompound compoundTwo) {
138                Set<AminoAcidCompound> equivalents = getEquivalentCompounds(compoundOne);
139                return (equivalents != null) && equivalents.contains(compoundTwo);
140        }
141
142        @Override
143        public Set<AminoAcidCompound> getEquivalentCompounds(AminoAcidCompound compound) {
144                if (equivalentsCache.isEmpty()) {
145                        // most compounds are equivalent to themselves alone
146                        for (AminoAcidCompound c : aminoAcidCompoundCache.values()) {
147                                equivalentsCache.put(c, Collections.singleton(c));
148                        }
149                        // ambiguous Asparagine or Aspartic acid
150                        addAmbiguousEquivalents("N", "D", "B");
151                        // ambiguous Glutamine or Glutamic acid
152                        addAmbiguousEquivalents("E", "Q", "Z");
153                        // ambiguous Leucine or Isoleucine
154                        addAmbiguousEquivalents("I", "L", "J");
155                        // ambiguous gaps
156                        AminoAcidCompound gap1, gap2, gap3;
157                        Set<AminoAcidCompound> gaps = new HashSet<>();
158                        gaps.add(gap1 = aminoAcidCompoundCache.get("-"));
159                        gaps.add(gap2 = aminoAcidCompoundCache.get("."));
160                        gaps.add(gap3 = aminoAcidCompoundCache.get("_"));
161                        equivalentsCache.put(gap1, gaps);
162                        equivalentsCache.put(gap2, gaps);
163                        equivalentsCache.put(gap3, gaps);
164                        // X is never equivalent, even to itself
165                        equivalentsCache.put(aminoAcidCompoundCache.get("X"), new HashSet<AminoAcidCompound>());
166                }
167                return equivalentsCache.get(compound);
168        }
169
170        // helper method to initialize the equivalent sets for 2 amino acid compounds and their ambiguity compound
171        private void addAmbiguousEquivalents(String one, String two, String either) {
172                Set<AminoAcidCompound> equivalents;
173                AminoAcidCompound cOne, cTwo, cEither;
174
175                equivalents = new HashSet<>();
176                equivalents.add(cOne = aminoAcidCompoundCache.get(one));
177                equivalents.add(cTwo = aminoAcidCompoundCache.get(two));
178                equivalents.add(cEither = aminoAcidCompoundCache.get(either));
179                equivalentsCache.put(cEither, equivalents);
180
181                equivalents = new HashSet<>();
182                equivalents.add(cOne);
183                equivalents.add(cEither);
184                equivalentsCache.put(cOne, equivalents);
185
186                equivalents = new HashSet<>();
187                equivalents.add(cTwo);
188                equivalents.add(cEither);
189                equivalentsCache.put(cTwo, equivalents);
190        }
191
192        @Override
193        public boolean hasCompound(AminoAcidCompound compound) {
194                return aminoAcidCompoundCache.containsValue(compound);
195        }
196
197        @Override
198        public boolean isValidSequence(Sequence<AminoAcidCompound> sequence) {
199                for (AminoAcidCompound compound: sequence) {
200                        if (!hasCompound(compound)) {
201                                return false;
202                        }
203                }
204                return true;
205        }
206
207        @Override
208        public List<AminoAcidCompound> getAllCompounds() {
209                return new ArrayList<>(aminoAcidCompoundCache.values());
210        }
211
212
213        @Override
214        public boolean isComplementable() {
215                return false;
216        }
217}