001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022package org.biojava.nbio.core.sequence.transcription;
023
024import org.biojava.nbio.core.sequence.RNASequence;
025import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
026import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
027import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
028import org.biojava.nbio.core.sequence.template.AbstractCompoundTranslator;
029import org.biojava.nbio.core.sequence.template.CompoundSet;
030import org.biojava.nbio.core.sequence.template.Sequence;
031import org.biojava.nbio.core.sequence.template.SequenceView;
032import org.biojava.nbio.core.sequence.transcription.Table.Codon;
033import org.biojava.nbio.core.sequence.views.WindowedSequence;
034
035import java.util.*;
036
037/**
038 * Takes a {@link Sequence} of {@link NucleotideCompound} which should represent
039 * an RNA sequence ({@link RNASequence} is good for this) and returns a list of
040 * {@link Sequence} which hold {@link AminoAcidCompound}. The translator can
041 * also trim stop codons as well as changing any valid start codon to an
042 * initiating met.
043 *
044 * @author ayates
045 */
046public class RNAToAminoAcidTranslator extends
047                AbstractCompoundTranslator<NucleotideCompound, AminoAcidCompound> {
048
049        private final boolean trimStops;
050        private final boolean initMetOnly;
051        private final Map<Table.CaseInsensitiveTriplet, Codon> quickLookup;
052        private final Map<AminoAcidCompound, List<Codon>> aminoAcidToCodon;
053        // Cheeky lookup which uses a hashing value; key is to switch to using this
054        // all the time
055        private final Codon[] codonArray = new Codon[64000];
056        private final AminoAcidCompound unknownAminoAcidCompound;
057        private final AminoAcidCompound methionineAminoAcidCompound;
058        private final boolean translateNCodons;
059
060        // If true, then translation will stop at the first stop codon encountered
061        // in the reading frame (the stop codon will be included as the last residue
062        // in the resulting ProteinSequence, unless removed by #trimStops)
063        private final boolean stopAtStopCodons;
064
065        // If true, then translation will not start until the first start codon
066        // encountered in the reading frame. The start codon will be included as the
067        // first residue in the resulting ProteinSequence
068        private final boolean waitForStartCodon;
069
070
071        public RNAToAminoAcidTranslator(
072                        SequenceCreatorInterface<AminoAcidCompound> creator,
073                        CompoundSet<NucleotideCompound> nucleotides,
074                        CompoundSet<Codon> codons,
075                        CompoundSet<AminoAcidCompound> aminoAcids, Table table,
076                        boolean trimStops, boolean initMetOnly, boolean translateNCodons,
077                        boolean stopAtStopCodons, boolean waitForStartCodon) {
078
079                super(creator, nucleotides, aminoAcids);
080                this.trimStops = trimStops;
081                this.initMetOnly = initMetOnly;
082                this.translateNCodons = translateNCodons;
083
084                quickLookup = new HashMap<Table.CaseInsensitiveTriplet, Codon>(codons
085                                .getAllCompounds().size());
086                aminoAcidToCodon = new HashMap<AminoAcidCompound, List<Codon>>();
087
088                List<Codon> codonList = table.getCodons(nucleotides, aminoAcids);
089                for (Codon codon : codonList) {
090                        quickLookup.put(codon.getTriplet(), codon);
091                        codonArray[codon.getTriplet().intValue()] = codon;
092
093                        List<Codon> codonL = aminoAcidToCodon.get(codon.getAminoAcid());
094                        if (codonL == null) {
095                                codonL = new ArrayList<Codon>();
096                                aminoAcidToCodon.put(codon.getAminoAcid(), codonL);
097                        }
098                        codonL.add(codon);
099
100                }
101                unknownAminoAcidCompound = aminoAcids.getCompoundForString("X");
102                methionineAminoAcidCompound = aminoAcids.getCompoundForString("M");
103                this.stopAtStopCodons = stopAtStopCodons;
104                this.waitForStartCodon = waitForStartCodon;
105        }
106
107        /**
108         * Performs the core conversion of RNA to Peptide. It does this by walking a
109         * windowed version of the given sequence. Any trailing DNA base pairs are
110         * ignored according to the specification of {@link WindowedSequence}.
111         */
112
113        @Override
114        public List<Sequence<AminoAcidCompound>> createSequences(
115                        Sequence<NucleotideCompound> originalSequence) {
116
117                List<List<AminoAcidCompound>> workingList = new ArrayList<List<AminoAcidCompound>>();
118
119                Iterable<SequenceView<NucleotideCompound>> iter = new WindowedSequence<NucleotideCompound>(
120                                originalSequence, 3);
121
122                boolean first = true;
123
124                // If not waiting for a start codon, start translating immediately
125                boolean doTranslate = !waitForStartCodon;
126
127                for (SequenceView<NucleotideCompound> element : iter) {
128                        AminoAcidCompound aminoAcid = null;
129
130                        int i = 1;
131                        Table.CaseInsensitiveTriplet triplet = new Table.CaseInsensitiveTriplet(
132                                        element.getCompoundAt(i++), element.getCompoundAt(i++),
133                                        element.getCompoundAt(i++));
134
135                        Codon target = null;
136
137                        target = quickLookup.get(triplet);
138
139                        // Check for a start
140                        if (!doTranslate && target.isStart()) {
141                                doTranslate = true;
142                        }
143
144                        if (doTranslate) {
145                                if (target != null)
146                                        aminoAcid = target.getAminoAcid();
147                                if (aminoAcid == null && translateNCodons()) {
148                                        aminoAcid = unknownAminoAcidCompound;
149                                } else {
150                                        if (first && initMetOnly && target.isStart()) {
151                                                aminoAcid = methionineAminoAcidCompound;
152                                        }
153                                }
154
155                                addCompoundsToList(Arrays.asList(aminoAcid), workingList);
156                        }
157
158                        if (doTranslate && stopAtStopCodons && target.isStop()) {
159                                // Check if we need to stop, but dont stop until started!
160                                break;
161                        }
162
163                        first = false;
164                }
165                postProcessCompoundLists(workingList);
166
167                return workingListToSequences(workingList);
168        }
169
170        /**
171         * Performs the trimming of stop codons and the conversion of a valid start
172         * amino acid to M
173         */
174        @Override
175        protected void postProcessCompoundLists(
176                        List<List<AminoAcidCompound>> compoundLists) {
177                for (List<AminoAcidCompound> compounds : compoundLists) {
178                        if (trimStops) {
179                                trimStop(compounds);
180                        }
181                }
182        }
183
184        /**
185         * Imperfect code. Checks the last amino acid to see if a codon could have
186         * translated a stop for it. Left in for the moment
187         */
188        protected void trimStop(List<AminoAcidCompound> sequence) {
189                AminoAcidCompound stop = sequence.get(sequence.size() - 1);
190                boolean isStop = false;
191                if (aminoAcidToCodon.containsKey(stop)) {
192                        for (Codon c : aminoAcidToCodon.get(stop)) {
193                                if (c.isStop()) {
194                                        isStop = true;
195                                        break;
196                                }
197                        }
198                }
199
200                if (isStop) {
201                        sequence.remove(sequence.size() - 1);
202                }
203        }
204
205        /**
206         * Indicates if we want to force exact translation of compounds or not i.e.
207         * those with internal N RNA bases. This will cause a translation to an X
208         * amino acid
209         */
210        public boolean translateNCodons() {
211                return translateNCodons;
212        }
213}