001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 01-21-2010 021 */ 022package org.biojava.nbio.core.sequence.transcription; 023 024import org.biojava.nbio.core.sequence.RNASequence; 025import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 026import org.biojava.nbio.core.sequence.compound.NucleotideCompound; 027import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface; 028import org.biojava.nbio.core.sequence.template.AbstractCompoundTranslator; 029import org.biojava.nbio.core.sequence.template.CompoundSet; 030import org.biojava.nbio.core.sequence.template.Sequence; 031import org.biojava.nbio.core.sequence.template.SequenceView; 032import org.biojava.nbio.core.sequence.transcription.Table.Codon; 033import org.biojava.nbio.core.sequence.views.WindowedSequence; 034 035import java.util.*; 036 037/** 038 * Takes a {@link Sequence} of {@link NucleotideCompound} which should represent 039 * an RNA sequence ({@link RNASequence} is good for this) and returns a list of 040 * {@link Sequence} which hold {@link AminoAcidCompound}. The translator can 041 * also trim stop codons as well as changing any valid start codon to an 042 * initiating met. 043 * 044 * @author ayates 045 */ 046public class RNAToAminoAcidTranslator extends 047 AbstractCompoundTranslator<NucleotideCompound, AminoAcidCompound> { 048 049 private final boolean trimStops; 050 private final boolean initMetOnly; 051 private final Map<Table.CaseInsensitiveTriplet, Codon> quickLookup; 052 private final Map<AminoAcidCompound, List<Codon>> aminoAcidToCodon; 053 // Cheeky lookup which uses a hashing value; key is to switch to using this 054 // all the time 055 private final Codon[] codonArray = new Codon[64000]; 056 private final AminoAcidCompound unknownAminoAcidCompound; 057 private final AminoAcidCompound methionineAminoAcidCompound; 058 private final boolean translateNCodons; 059 060 // If true, then translation will stop at the first stop codon encountered 061 // in the reading frame (the stop codon will be included as the last residue 062 // in the resulting ProteinSequence, unless removed by #trimStops) 063 private final boolean stopAtStopCodons; 064 065 // If true, then translation will not start until the first start codon 066 // encountered in the reading frame. The start codon will be included as the 067 // first residue in the resulting ProteinSequence 068 private final boolean waitForStartCodon; 069 070 071 public RNAToAminoAcidTranslator( 072 SequenceCreatorInterface<AminoAcidCompound> creator, 073 CompoundSet<NucleotideCompound> nucleotides, 074 CompoundSet<Codon> codons, 075 CompoundSet<AminoAcidCompound> aminoAcids, Table table, 076 boolean trimStops, boolean initMetOnly, boolean translateNCodons, 077 boolean stopAtStopCodons, boolean waitForStartCodon) { 078 079 super(creator, nucleotides, aminoAcids); 080 this.trimStops = trimStops; 081 this.initMetOnly = initMetOnly; 082 this.translateNCodons = translateNCodons; 083 084 quickLookup = new HashMap<Table.CaseInsensitiveTriplet, Codon>(codons 085 .getAllCompounds().size()); 086 aminoAcidToCodon = new HashMap<AminoAcidCompound, List<Codon>>(); 087 088 List<Codon> codonList = table.getCodons(nucleotides, aminoAcids); 089 for (Codon codon : codonList) { 090 quickLookup.put(codon.getTriplet(), codon); 091 codonArray[codon.getTriplet().intValue()] = codon; 092 093 List<Codon> codonL = aminoAcidToCodon.get(codon.getAminoAcid()); 094 if (codonL == null) { 095 codonL = new ArrayList<Codon>(); 096 aminoAcidToCodon.put(codon.getAminoAcid(), codonL); 097 } 098 codonL.add(codon); 099 100 } 101 unknownAminoAcidCompound = aminoAcids.getCompoundForString("X"); 102 methionineAminoAcidCompound = aminoAcids.getCompoundForString("M"); 103 this.stopAtStopCodons = stopAtStopCodons; 104 this.waitForStartCodon = waitForStartCodon; 105 } 106 107 /** 108 * Performs the core conversion of RNA to Peptide. It does this by walking a 109 * windowed version of the given sequence. Any trailing DNA base pairs are 110 * ignored according to the specification of {@link WindowedSequence}. 111 */ 112 113 @Override 114 public List<Sequence<AminoAcidCompound>> createSequences( 115 Sequence<NucleotideCompound> originalSequence) { 116 117 List<List<AminoAcidCompound>> workingList = new ArrayList<List<AminoAcidCompound>>(); 118 119 Iterable<SequenceView<NucleotideCompound>> iter = new WindowedSequence<NucleotideCompound>( 120 originalSequence, 3); 121 122 boolean first = true; 123 124 // If not waiting for a start codon, start translating immediately 125 boolean doTranslate = !waitForStartCodon; 126 127 for (SequenceView<NucleotideCompound> element : iter) { 128 AminoAcidCompound aminoAcid = null; 129 130 int i = 1; 131 Table.CaseInsensitiveTriplet triplet = new Table.CaseInsensitiveTriplet( 132 element.getCompoundAt(i++), element.getCompoundAt(i++), 133 element.getCompoundAt(i++)); 134 135 Codon target = null; 136 137 target = quickLookup.get(triplet); 138 139 // Check for a start 140 if (!doTranslate && target.isStart()) { 141 doTranslate = true; 142 } 143 144 if (doTranslate) { 145 if (target != null) 146 aminoAcid = target.getAminoAcid(); 147 if (aminoAcid == null && translateNCodons()) { 148 aminoAcid = unknownAminoAcidCompound; 149 } else { 150 if (first && initMetOnly && target.isStart()) { 151 aminoAcid = methionineAminoAcidCompound; 152 } 153 } 154 155 addCompoundsToList(Arrays.asList(aminoAcid), workingList); 156 } 157 158 if (doTranslate && stopAtStopCodons && target.isStop()) { 159 // Check if we need to stop, but dont stop until started! 160 break; 161 } 162 163 first = false; 164 } 165 postProcessCompoundLists(workingList); 166 167 return workingListToSequences(workingList); 168 } 169 170 /** 171 * Performs the trimming of stop codons and the conversion of a valid start 172 * amino acid to M 173 */ 174 @Override 175 protected void postProcessCompoundLists( 176 List<List<AminoAcidCompound>> compoundLists) { 177 for (List<AminoAcidCompound> compounds : compoundLists) { 178 if (trimStops) { 179 trimStop(compounds); 180 } 181 } 182 } 183 184 /** 185 * Imperfect code. Checks the last amino acid to see if a codon could have 186 * translated a stop for it. Left in for the moment 187 */ 188 protected void trimStop(List<AminoAcidCompound> sequence) { 189 AminoAcidCompound stop = sequence.get(sequence.size() - 1); 190 boolean isStop = false; 191 if (aminoAcidToCodon.containsKey(stop)) { 192 for (Codon c : aminoAcidToCodon.get(stop)) { 193 if (c.isStop()) { 194 isStop = true; 195 break; 196 } 197 } 198 } 199 200 if (isStop) { 201 sequence.remove(sequence.size() - 1); 202 } 203 } 204 205 /** 206 * Indicates if we want to force exact translation of compounds or not i.e. 207 * those with internal N RNA bases. This will cause a translation to an X 208 * amino acid 209 */ 210 public boolean translateNCodons() { 211 return translateNCodons; 212 } 213}