001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022package org.biojava.nbio.core.sequence.io;
023
024import org.biojava.nbio.core.exceptions.ParserException;
025import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
026import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
027import org.biojava.nbio.core.sequence.io.util.ClasspathResource;
028import org.biojava.nbio.core.sequence.io.util.IOUtils;
029import org.biojava.nbio.core.sequence.template.AbstractCompoundSet;
030import org.biojava.nbio.core.sequence.template.CompoundSet;
031import org.biojava.nbio.core.sequence.transcription.Table;
032
033import java.io.InputStream;
034import java.util.*;
035
036
037/**
038 * Available translations
039 *
040 * <ul>
041 * <li>1 - UNIVERSAL</li>
042 * <li>2 - VERTEBRATE_MITOCHONDRIAL</li>
043 * <li>3 - YEAST_MITOCHONDRIAL</li>
044 * <li>4 - MOLD_MITOCHONDRIAL</li>
045 * <li>5 - INVERTEBRATE_MITOCHONDRIAL</li>
046 * <li>6 - CILIATE_NUCLEAR</li>
047 * <li>9 - ECHINODERM_MITOCHONDRIAL</li>
048 * <li>10 - EUPLOTID_NUCLEAR</li>
049 * <li>11 - BACTERIAL</li>
050 * <li>12 - ALTERNATIVE_YEAST_NUCLEAR</li>
051 * <li>13 - ASCIDIAN_MITOCHONDRIAL</li>
052 * <li>14 - FLATWORM_MITOCHONDRIAL</li>
053 * <li>15 - BLEPHARISMA_MACRONUCLEAR</li>
054 * <li>16 - 2CHLOROPHYCEAN_MITOCHONDRIAL</li>
055 * <li>21 - TREMATODE_MITOCHONDRIAL</li>
056 * <li>23 - SCENEDESMUS_MITOCHONDRIAL</li>
057 * </ul>
058 *
059 * Taken from <a
060 * href="https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c"
061 * >NCBI</a> with slight modification and put into the classpath resource.
062 *
063 * Takes in an ID, name, amino acid string and the locations of amino acids
064 * which acts as start codons in the translation table. You can give the 3 codon
065 * position strings that correspond to the amino acid string or if you are using
066 * the default IUPAC codes you can use the hardcoded ones which are consistent
067 * amongst all <a
068 * href="https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c"> codon
069 * tables</a>.
070 *
071 * The generated {@link IUPACTable} objects do not parse the data further until
072 * requested so if you do not use a translation table your only penalty is the
073 * loading of the IUPAC data from the classpath.
074 *
075 * @author Andy Yates
076 */
077public class IUPACParser {
078
079        private static class IOD {
080                public static final IUPACParser INSTANCE = new IUPACParser();
081        }
082
083        public static IUPACParser getInstance() {
084                return IOD.INSTANCE;
085        }
086
087        public static final String      IUPAC_LOCATION = "org/biojava/nbio/core/sequence/iupac.txt";
088
089        private InputStream              is;
090        private List<IUPACTable>         tables;
091        private Map<String, IUPACTable>  nameLookup;
092        private Map<Integer, IUPACTable> idLookup;
093
094        /**
095         * Default version and uses the classpath based IUPAC table
096         */
097        public IUPACParser() {
098                //use the preCache version to make sure we don't keep a IO handle open
099                is = new ClasspathResource(IUPAC_LOCATION, true).getInputStream();
100        }
101
102        /**
103         * Allows you to specify a different IUPAC table.
104         */
105        public IUPACParser(InputStream is) {
106                this.is = is;
107        }
108
109        /**
110         * Returns a list of all available IUPAC tables
111         */
112        public List<IUPACTable> getTables() {
113                if (tables == null) {
114                        tables = parseTables();
115                }
116                return tables;
117        }
118
119        /**
120         * Returns a table by its name
121         */
122        public IUPACTable getTable(String name) {
123                populateLookups();
124                return nameLookup.get(name);
125        }
126
127        /**
128         * Returns a table by its identifier i.e. 1 means universal codon tables
129         */
130        public IUPACTable getTable(Integer id) {
131                populateLookups();
132                return idLookup.get(id);
133        }
134
135        private void populateLookups() {
136                if(nameLookup == null) {
137                        nameLookup = new HashMap<>();
138                        idLookup = new HashMap<>();
139                        for(IUPACTable t: getTables()) {
140                                nameLookup.put(t.getName(), t);
141                                idLookup.put(t.getId(), t);
142                        }
143                }
144        }
145
146        private List<IUPACTable> parseTables() {
147                List<IUPACTable> localTables = new ArrayList<>();
148                List<String> lines = IOUtils.getList(is);
149                Integer id = null;
150                String name, aa, starts, baseone, basetwo, basethree;
151                name = aa = starts = baseone = basetwo = basethree = null;
152                for (String line : lines) {
153                        if ("//".equalsIgnoreCase(line)) {
154                                localTables.add(new IUPACTable(name, id, aa, starts, baseone, basetwo,
155                                                basethree));
156                                name = aa = starts = baseone = basetwo = basethree = null;
157                                id = null;
158                        }
159                        else {
160                                String[] keyValue = line.split("\\s*=\\s*");
161                                if ("AAs".equals(keyValue[0])) {
162                                        aa = keyValue[1];
163                                }
164                                else if ("Starts".equals(keyValue[0])) {
165                                        starts = keyValue[1];
166                                }
167                                else if ("Base1".equals(keyValue[0])) {
168                                        baseone = keyValue[1];
169                                }
170                                else if ("Base2".equals(keyValue[0])) {
171                                        basetwo = keyValue[1];
172                                }
173                                else if ("Base3".equals(keyValue[0])) {
174                                        basethree = keyValue[1];
175                                }
176                                else {
177                                        name = keyValue[0];
178                                        id = Integer.parseInt(keyValue[1]);
179                                }
180                        }
181                }
182
183                return localTables;
184        }
185
186        /**
187         * Holds the concept of a codon table from the IUPAC format
188         *
189         * @author Andy Yates
190         */
191        public static class IUPACTable implements Table {
192
193                private final Integer      id;
194                private final String       name;
195                private final String       aminoAcidString;
196                private final String       startCodons;
197                private final String       baseOne;
198                private final String       baseTwo;
199                private final String       baseThree;
200
201                private final List<Codon>  codons    = new ArrayList<>();
202                private CompoundSet<Codon> compounds = null;
203
204                public IUPACTable(String name, int id, String aminoAcidString,
205                                String startCodons, String baseOne, String baseTwo, String baseThree) {
206                        this.aminoAcidString = aminoAcidString;
207                        this.startCodons = startCodons;
208                        this.name = name;
209                        this.id = id;
210                        this.baseOne = baseOne;
211                        this.baseTwo = baseTwo;
212                        this.baseThree = baseThree;
213                }
214
215                /**
216                 * Constructor which uses the basic IUPAC codon table format. Useful
217                 * if you need to specify your own IUPAC table with minimal
218                 * definitions from your side.
219                 */
220                public IUPACTable(String name, Integer id, String aminoAcidString,
221                                String startCodons) {
222                        this(name, id, aminoAcidString, startCodons,
223                                        "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
224                                        "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
225                                        "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG");
226                }
227
228                public Integer getId() {
229                        return id;
230                }
231
232                public String getName() {
233                        return name;
234                }
235
236                /**
237                 * Returns true if the given compound was a start codon in this
238                 * codon table. This will report true if the compound could ever have
239                 * been a start codon.
240                 *
241                 * @throws IllegalStateException Thrown if
242                 * {@link #getCodons(CompoundSet, CompoundSet)} was not called first.
243                 */
244                                @Override
245                public boolean isStart(AminoAcidCompound compound) {
246                        if(this.codons.isEmpty()) {
247                                throw new IllegalStateException("Codons are empty; please request getCodons() fist before asking this");
248                        }
249                        for(Codon codon: codons) {
250                                //Only check if the codon was a start codon and then ask if the compound was encoded by it
251                                if(codon.isStart()) {
252                                        if(codon.getAminoAcid().equalsIgnoreCase(compound)) {
253                                                return true;
254                                        }
255                                }
256                        }
257                        return false;
258                }
259
260                /**
261                 * Returns a list of codons where the source and target compounds
262                 * are the same as those given by the parameters.
263                 *
264                 * @param nucleotides The nucleotide set to use when building BioJava
265                 * representations of codons
266                 * @param aminoAcids The target amino acid compounds objects
267                 */
268                                @Override
269                public List<Codon> getCodons(CompoundSet<NucleotideCompound> nucleotides,
270                                CompoundSet<AminoAcidCompound> aminoAcids) {
271
272                        if (this.codons.isEmpty()) {
273                                List<String> aminoAcidStrings = aminoAcids();
274                                List<String> startCodonStrings = startCodons();
275                                List<List<String>> codonStrings = codonStrings();
276
277                                for (int i = 0; i < aminoAcidStrings.size(); i++) {
278
279                                        List<String> codonString    = codonStrings.get(i);
280                                        NucleotideCompound one      = getCompound(codonString, 0, nucleotides);
281                                        NucleotideCompound two      = getCompound(codonString, 1, nucleotides);
282                                        NucleotideCompound three    = getCompound(codonString, 2, nucleotides);
283                                        boolean start               = ("M".equals(startCodonStrings.get(i)));
284                                        boolean stop                = ("*".equals(aminoAcidStrings.get(i)));
285                                        AminoAcidCompound aminoAcid = aminoAcids
286                                                        .getCompoundForString(aminoAcidStrings.get(i));
287                                        codons.add(new Codon(new CaseInsensitiveTriplet(one, two, three), aminoAcid, start, stop));
288                                }
289                        }
290
291                        return codons;
292                }
293
294                private NucleotideCompound getCompound(List<String> compounds,
295                                int position, CompoundSet<NucleotideCompound> nucelotides) {
296                        String compound = compounds.get(position);
297                        NucleotideCompound returnCompound = nucelotides
298                                        .getCompoundForString(compound);
299                        if (returnCompound == null) {
300                                if ("T".equalsIgnoreCase(compound)) {
301                                                returnCompound = nucelotides.getCompoundForString("U");
302                                }
303                                else {
304                                        throw new ParserException("Cannot find a compound for string "
305                                                        + compound);
306                                }
307                        }
308                        return returnCompound;
309                }
310
311                /**
312                 * Returns the compound set of codons
313                 */
314                @Override
315        public CompoundSet<Codon> getCodonCompoundSet(
316                                final CompoundSet<NucleotideCompound> rnaCompounds,
317                                final CompoundSet<AminoAcidCompound> aminoAcidCompounds) {
318                        if (compounds == null) {
319                                compounds = new AbstractCompoundSet<Codon>() {
320                                        {
321                                                for (Codon c : getCodons(rnaCompounds, aminoAcidCompounds)) {
322                                                        addCompound(c);
323                                                }
324                                        }
325                                };
326                        }
327                        return compounds;
328                }
329
330                private List<List<String>> codonStrings() {
331                        List<List<String>> codons = new ArrayList<>();
332                        for (int i = 0; i < baseOne.length(); i++) {
333                                List<String> codon = Arrays.asList(Character
334                                                .toString(baseOne.charAt(i)),
335                                                Character.toString(baseTwo.charAt(i)), Character.toString(baseThree
336                                                                .charAt(i)));
337                                codons.add(codon);
338                        }
339                        return codons;
340                }
341
342                private List<String> aminoAcids() {
343                        return split(aminoAcidString);
344                }
345
346                private List<String> startCodons() {
347                        return split(startCodons);
348                }
349
350                private List<String> split(String string) {
351                        List<String> split = new ArrayList<>();
352                        for (int i = 0; i < string.length(); i++) {
353                                split.add(Character.toString(string.charAt(i)));
354                        }
355                        return split;
356                }
357        }
358}