001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022package org.biojava.nbio.core.sequence.transcription;
023
024import org.biojava.nbio.core.sequence.RNASequence;
025import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
026import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
027import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
028import org.biojava.nbio.core.sequence.template.AbstractCompoundTranslator;
029import org.biojava.nbio.core.sequence.template.CompoundSet;
030import org.biojava.nbio.core.sequence.template.Sequence;
031import org.biojava.nbio.core.sequence.template.SequenceView;
032import org.biojava.nbio.core.sequence.transcription.Table.Codon;
033import org.biojava.nbio.core.sequence.views.WindowedSequence;
034
035import java.util.*;
036
037/**
038 * Takes a {@link Sequence} of {@link NucleotideCompound} which should represent
039 * an RNA sequence ({@link RNASequence} is good for this) and returns a list of
040 * {@link Sequence} which hold {@link AminoAcidCompound}. The translator can
041 * also trim stop codons as well as changing any valid start codon to an
042 * initiating met.
043 *
044 * @author ayates
045 */
046public class RNAToAminoAcidTranslator extends
047                AbstractCompoundTranslator<NucleotideCompound, AminoAcidCompound> {
048
049        private final boolean trimStops;
050        private final boolean initMetOnly;
051        private final Map<Table.CaseInsensitiveTriplet, Codon> quickLookup;
052        private final Map<AminoAcidCompound, List<Codon>> aminoAcidToCodon;
053        // Cheeky lookup which uses a hashing value; key is to switch to using this
054        // all the time
055        private final Codon[] codonArray = new Codon[64000];
056        private final AminoAcidCompound unknownAminoAcidCompound;
057        private final AminoAcidCompound methionineAminoAcidCompound;
058        private final boolean translateNCodons;
059
060        // If true, then translation will stop at the first stop codon encountered
061        // in the reading frame (the stop codon will be included as the last residue
062        // in the resulting ProteinSequence, unless removed by #trimStops)
063        private final boolean stopAtStopCodons;
064
065        // If true, then translation will not start until the first start codon
066        // encountered in the reading frame. The start codon will be included as the
067        // first residue in the resulting ProteinSequence
068        private final boolean waitForStartCodon;
069
070        /**
071         * @deprecated Retained for backwards compatability, setting
072         *             {@link #stopAtStopCodons} to <code>false</code>
073         */
074        @Deprecated
075        public RNAToAminoAcidTranslator(
076                        SequenceCreatorInterface<AminoAcidCompound> creator,
077                        CompoundSet<NucleotideCompound> nucleotides,
078                        CompoundSet<Codon> codons,
079                        CompoundSet<AminoAcidCompound> aminoAcids, Table table,
080                        boolean trimStops, boolean initMetOnly, boolean translateNCodons) {
081
082                super(creator, nucleotides, aminoAcids);
083                this.trimStops = trimStops;
084                this.initMetOnly = initMetOnly;
085                this.translateNCodons = translateNCodons;
086
087                quickLookup = new HashMap<Table.CaseInsensitiveTriplet, Codon>(codons
088                                .getAllCompounds().size());
089                aminoAcidToCodon = new HashMap<AminoAcidCompound, List<Codon>>();
090
091                List<Codon> codonList = table.getCodons(nucleotides, aminoAcids);
092                for (Codon codon : codonList) {
093                        quickLookup.put(codon.getTriplet(), codon);
094                        codonArray[codon.getTriplet().intValue()] = codon;
095
096                        List<Codon> codonL = aminoAcidToCodon.get(codon.getAminoAcid());
097                        if (codonL == null) {
098                                codonL = new ArrayList<Codon>();
099                                aminoAcidToCodon.put(codon.getAminoAcid(), codonL);
100                        }
101                        codonL.add(codon);
102
103                }
104                unknownAminoAcidCompound = aminoAcids.getCompoundForString("X");
105                methionineAminoAcidCompound = aminoAcids.getCompoundForString("M");
106                // Set to false for backwards compatability
107                stopAtStopCodons = false;
108                waitForStartCodon = false;
109        }
110
111        @Deprecated
112        public RNAToAminoAcidTranslator(
113                        SequenceCreatorInterface<AminoAcidCompound> creator,
114                        CompoundSet<NucleotideCompound> nucleotides,
115                        CompoundSet<Codon> codons,
116                        CompoundSet<AminoAcidCompound> aminoAcids, Table table,
117                        boolean trimStops, boolean initMetOnly, boolean translateNCodons,
118                        boolean stopAtStopCodons) {
119
120                super(creator, nucleotides, aminoAcids);
121                this.trimStops = trimStops;
122                this.initMetOnly = initMetOnly;
123                this.translateNCodons = translateNCodons;
124
125                quickLookup = new HashMap<Table.CaseInsensitiveTriplet, Codon>(codons
126                                .getAllCompounds().size());
127                aminoAcidToCodon = new HashMap<AminoAcidCompound, List<Codon>>();
128
129                List<Codon> codonList = table.getCodons(nucleotides, aminoAcids);
130                for (Codon codon : codonList) {
131                        quickLookup.put(codon.getTriplet(), codon);
132                        codonArray[codon.getTriplet().intValue()] = codon;
133
134                        List<Codon> codonL = aminoAcidToCodon.get(codon.getAminoAcid());
135                        if (codonL == null) {
136                                codonL = new ArrayList<Codon>();
137                                aminoAcidToCodon.put(codon.getAminoAcid(), codonL);
138                        }
139                        codonL.add(codon);
140
141                }
142                unknownAminoAcidCompound = aminoAcids.getCompoundForString("X");
143                methionineAminoAcidCompound = aminoAcids.getCompoundForString("M");
144                this.stopAtStopCodons = stopAtStopCodons;
145                // Set for backwards compatibility
146                waitForStartCodon = false;
147        }
148
149        public RNAToAminoAcidTranslator(
150                        SequenceCreatorInterface<AminoAcidCompound> creator,
151                        CompoundSet<NucleotideCompound> nucleotides,
152                        CompoundSet<Codon> codons,
153                        CompoundSet<AminoAcidCompound> aminoAcids, Table table,
154                        boolean trimStops, boolean initMetOnly, boolean translateNCodons,
155                        boolean stopAtStopCodons, boolean waitForStartCodon) {
156
157                super(creator, nucleotides, aminoAcids);
158                this.trimStops = trimStops;
159                this.initMetOnly = initMetOnly;
160                this.translateNCodons = translateNCodons;
161
162                quickLookup = new HashMap<Table.CaseInsensitiveTriplet, Codon>(codons
163                                .getAllCompounds().size());
164                aminoAcidToCodon = new HashMap<AminoAcidCompound, List<Codon>>();
165
166                List<Codon> codonList = table.getCodons(nucleotides, aminoAcids);
167                for (Codon codon : codonList) {
168                        quickLookup.put(codon.getTriplet(), codon);
169                        codonArray[codon.getTriplet().intValue()] = codon;
170
171                        List<Codon> codonL = aminoAcidToCodon.get(codon.getAminoAcid());
172                        if (codonL == null) {
173                                codonL = new ArrayList<Codon>();
174                                aminoAcidToCodon.put(codon.getAminoAcid(), codonL);
175                        }
176                        codonL.add(codon);
177
178                }
179                unknownAminoAcidCompound = aminoAcids.getCompoundForString("X");
180                methionineAminoAcidCompound = aminoAcids.getCompoundForString("M");
181                this.stopAtStopCodons = stopAtStopCodons;
182                this.waitForStartCodon = waitForStartCodon;
183        }
184
185        /**
186         * Performs the core conversion of RNA to Peptide. It does this by walking a
187         * windowed version of the given sequence. Any trailing DNA base pairs are
188         * ignored according to the specification of {@link WindowedSequence}.
189         */
190
191        @Override
192        public List<Sequence<AminoAcidCompound>> createSequences(
193                        Sequence<NucleotideCompound> originalSequence) {
194
195                List<List<AminoAcidCompound>> workingList = new ArrayList<List<AminoAcidCompound>>();
196
197                Iterable<SequenceView<NucleotideCompound>> iter = new WindowedSequence<NucleotideCompound>(
198                                originalSequence, 3);
199
200                boolean first = true;
201
202                // If not waiting for a start codon, start translating immediately
203                boolean doTranslate = !waitForStartCodon;
204
205                for (SequenceView<NucleotideCompound> element : iter) {
206                        AminoAcidCompound aminoAcid = null;
207
208                        int i = 1;
209                        Table.CaseInsensitiveTriplet triplet = new Table.CaseInsensitiveTriplet(
210                                        element.getCompoundAt(i++), element.getCompoundAt(i++),
211                                        element.getCompoundAt(i++));
212
213                        Codon target = null;
214
215                        target = quickLookup.get(triplet);
216
217                        // Check for a start
218                        if (doTranslate == false && target.isStart()) {
219                                doTranslate = true;
220                        }
221
222                        if (doTranslate) {
223                                if (target != null)
224                                        aminoAcid = target.getAminoAcid();
225                                if (aminoAcid == null && translateNCodons()) {
226                                        aminoAcid = unknownAminoAcidCompound;
227                                } else {
228                                        if (first && initMetOnly && target.isStart()) {
229                                                aminoAcid = methionineAminoAcidCompound;
230                                        }
231                                }
232
233                                addCompoundsToList(Arrays.asList(aminoAcid), workingList);
234                        }
235
236                        if (doTranslate && stopAtStopCodons && target.isStop()) {
237                                // Check if we need to stop, but dont stop until started!
238                                break;
239                        }
240
241                        first = false;
242                }
243                postProcessCompoundLists(workingList);
244
245                return workingListToSequences(workingList);
246        }
247
248        /**
249         * Performs the trimming of stop codons and the conversion of a valid start
250         * amino acid to M
251         */
252        @Override
253        protected void postProcessCompoundLists(
254                        List<List<AminoAcidCompound>> compoundLists) {
255                for (List<AminoAcidCompound> compounds : compoundLists) {
256                        if (trimStops) {
257                                trimStop(compounds);
258                        }
259                }
260        }
261
262        /**
263         * Imperfect code. Checks the last amino acid to see if a codon could have
264         * translated a stop for it. Left in for the moment
265         */
266        protected void trimStop(List<AminoAcidCompound> sequence) {
267                AminoAcidCompound stop = sequence.get(sequence.size() - 1);
268                boolean isStop = false;
269                if (aminoAcidToCodon.containsKey(stop)) {
270                        for (Codon c : aminoAcidToCodon.get(stop)) {
271                                if (c.isStop()) {
272                                        isStop = true;
273                                        break;
274                                }
275                        }
276                }
277
278                if (isStop) {
279                        sequence.remove(sequence.size() - 1);
280                }
281        }
282
283        /**
284         * Indicates if we want to force exact translation of compounds or not i.e.
285         * those with internal N RNA bases. This will cause a translation to an X
286         * amino acid
287         */
288        public boolean translateNCodons() {
289                return translateNCodons;
290        }
291}