001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 01-21-2010 021 */ 022package org.biojava.nbio.core.sequence.transcription; 023 024import org.biojava.nbio.core.sequence.RNASequence; 025import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 026import org.biojava.nbio.core.sequence.compound.NucleotideCompound; 027import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface; 028import org.biojava.nbio.core.sequence.template.AbstractCompoundTranslator; 029import org.biojava.nbio.core.sequence.template.CompoundSet; 030import org.biojava.nbio.core.sequence.template.Sequence; 031import org.biojava.nbio.core.sequence.template.SequenceView; 032import org.biojava.nbio.core.sequence.transcription.Table.Codon; 033import org.biojava.nbio.core.sequence.views.WindowedSequence; 034 035import java.util.*; 036 037/** 038 * Takes a {@link Sequence} of {@link NucleotideCompound} which should represent 039 * an RNA sequence ({@link RNASequence} is good for this) and returns a list of 040 * {@link Sequence} which hold {@link AminoAcidCompound}. The translator can 041 * also trim stop codons as well as changing any valid start codon to an 042 * initiating met. 043 * 044 * @author ayates 045 */ 046public class RNAToAminoAcidTranslator extends 047 AbstractCompoundTranslator<NucleotideCompound, AminoAcidCompound> { 048 049 private final boolean trimStops; 050 private final boolean initMetOnly; 051 private final Map<Table.CaseInsensitiveTriplet, Codon> quickLookup; 052 private final Map<AminoAcidCompound, List<Codon>> aminoAcidToCodon; 053 // Cheeky lookup which uses a hashing value; key is to switch to using this 054 // all the time 055 private final Codon[] codonArray = new Codon[64000]; 056 private final AminoAcidCompound unknownAminoAcidCompound; 057 private final AminoAcidCompound methionineAminoAcidCompound; 058 private final boolean translateNCodons; 059 060 // If true, then translation will stop at the first stop codon encountered 061 // in the reading frame (the stop codon will be included as the last residue 062 // in the resulting ProteinSequence, unless removed by #trimStops) 063 private final boolean stopAtStopCodons; 064 065 // If true, then translation will not start until the first start codon 066 // encountered in the reading frame. The start codon will be included as the 067 // first residue in the resulting ProteinSequence 068 private final boolean waitForStartCodon; 069 070 /** 071 * @deprecated Retained for backwards compatability, setting 072 * {@link #stopAtStopCodons} to <code>false</code> 073 */ 074 @Deprecated 075 public RNAToAminoAcidTranslator( 076 SequenceCreatorInterface<AminoAcidCompound> creator, 077 CompoundSet<NucleotideCompound> nucleotides, 078 CompoundSet<Codon> codons, 079 CompoundSet<AminoAcidCompound> aminoAcids, Table table, 080 boolean trimStops, boolean initMetOnly, boolean translateNCodons) { 081 082 super(creator, nucleotides, aminoAcids); 083 this.trimStops = trimStops; 084 this.initMetOnly = initMetOnly; 085 this.translateNCodons = translateNCodons; 086 087 quickLookup = new HashMap<Table.CaseInsensitiveTriplet, Codon>(codons 088 .getAllCompounds().size()); 089 aminoAcidToCodon = new HashMap<AminoAcidCompound, List<Codon>>(); 090 091 List<Codon> codonList = table.getCodons(nucleotides, aminoAcids); 092 for (Codon codon : codonList) { 093 quickLookup.put(codon.getTriplet(), codon); 094 codonArray[codon.getTriplet().intValue()] = codon; 095 096 List<Codon> codonL = aminoAcidToCodon.get(codon.getAminoAcid()); 097 if (codonL == null) { 098 codonL = new ArrayList<Codon>(); 099 aminoAcidToCodon.put(codon.getAminoAcid(), codonL); 100 } 101 codonL.add(codon); 102 103 } 104 unknownAminoAcidCompound = aminoAcids.getCompoundForString("X"); 105 methionineAminoAcidCompound = aminoAcids.getCompoundForString("M"); 106 // Set to false for backwards compatability 107 stopAtStopCodons = false; 108 waitForStartCodon = false; 109 } 110 111 @Deprecated 112 public RNAToAminoAcidTranslator( 113 SequenceCreatorInterface<AminoAcidCompound> creator, 114 CompoundSet<NucleotideCompound> nucleotides, 115 CompoundSet<Codon> codons, 116 CompoundSet<AminoAcidCompound> aminoAcids, Table table, 117 boolean trimStops, boolean initMetOnly, boolean translateNCodons, 118 boolean stopAtStopCodons) { 119 120 super(creator, nucleotides, aminoAcids); 121 this.trimStops = trimStops; 122 this.initMetOnly = initMetOnly; 123 this.translateNCodons = translateNCodons; 124 125 quickLookup = new HashMap<Table.CaseInsensitiveTriplet, Codon>(codons 126 .getAllCompounds().size()); 127 aminoAcidToCodon = new HashMap<AminoAcidCompound, List<Codon>>(); 128 129 List<Codon> codonList = table.getCodons(nucleotides, aminoAcids); 130 for (Codon codon : codonList) { 131 quickLookup.put(codon.getTriplet(), codon); 132 codonArray[codon.getTriplet().intValue()] = codon; 133 134 List<Codon> codonL = aminoAcidToCodon.get(codon.getAminoAcid()); 135 if (codonL == null) { 136 codonL = new ArrayList<Codon>(); 137 aminoAcidToCodon.put(codon.getAminoAcid(), codonL); 138 } 139 codonL.add(codon); 140 141 } 142 unknownAminoAcidCompound = aminoAcids.getCompoundForString("X"); 143 methionineAminoAcidCompound = aminoAcids.getCompoundForString("M"); 144 this.stopAtStopCodons = stopAtStopCodons; 145 // Set for backwards compatibility 146 waitForStartCodon = false; 147 } 148 149 public RNAToAminoAcidTranslator( 150 SequenceCreatorInterface<AminoAcidCompound> creator, 151 CompoundSet<NucleotideCompound> nucleotides, 152 CompoundSet<Codon> codons, 153 CompoundSet<AminoAcidCompound> aminoAcids, Table table, 154 boolean trimStops, boolean initMetOnly, boolean translateNCodons, 155 boolean stopAtStopCodons, boolean waitForStartCodon) { 156 157 super(creator, nucleotides, aminoAcids); 158 this.trimStops = trimStops; 159 this.initMetOnly = initMetOnly; 160 this.translateNCodons = translateNCodons; 161 162 quickLookup = new HashMap<Table.CaseInsensitiveTriplet, Codon>(codons 163 .getAllCompounds().size()); 164 aminoAcidToCodon = new HashMap<AminoAcidCompound, List<Codon>>(); 165 166 List<Codon> codonList = table.getCodons(nucleotides, aminoAcids); 167 for (Codon codon : codonList) { 168 quickLookup.put(codon.getTriplet(), codon); 169 codonArray[codon.getTriplet().intValue()] = codon; 170 171 List<Codon> codonL = aminoAcidToCodon.get(codon.getAminoAcid()); 172 if (codonL == null) { 173 codonL = new ArrayList<Codon>(); 174 aminoAcidToCodon.put(codon.getAminoAcid(), codonL); 175 } 176 codonL.add(codon); 177 178 } 179 unknownAminoAcidCompound = aminoAcids.getCompoundForString("X"); 180 methionineAminoAcidCompound = aminoAcids.getCompoundForString("M"); 181 this.stopAtStopCodons = stopAtStopCodons; 182 this.waitForStartCodon = waitForStartCodon; 183 } 184 185 /** 186 * Performs the core conversion of RNA to Peptide. It does this by walking a 187 * windowed version of the given sequence. Any trailing DNA base pairs are 188 * ignored according to the specification of {@link WindowedSequence}. 189 */ 190 191 @Override 192 public List<Sequence<AminoAcidCompound>> createSequences( 193 Sequence<NucleotideCompound> originalSequence) { 194 195 List<List<AminoAcidCompound>> workingList = new ArrayList<List<AminoAcidCompound>>(); 196 197 Iterable<SequenceView<NucleotideCompound>> iter = new WindowedSequence<NucleotideCompound>( 198 originalSequence, 3); 199 200 boolean first = true; 201 202 // If not waiting for a start codon, start translating immediately 203 boolean doTranslate = !waitForStartCodon; 204 205 for (SequenceView<NucleotideCompound> element : iter) { 206 AminoAcidCompound aminoAcid = null; 207 208 int i = 1; 209 Table.CaseInsensitiveTriplet triplet = new Table.CaseInsensitiveTriplet( 210 element.getCompoundAt(i++), element.getCompoundAt(i++), 211 element.getCompoundAt(i++)); 212 213 Codon target = null; 214 215 target = quickLookup.get(triplet); 216 217 // Check for a start 218 if (doTranslate == false && target.isStart()) { 219 doTranslate = true; 220 } 221 222 if (doTranslate) { 223 if (target != null) 224 aminoAcid = target.getAminoAcid(); 225 if (aminoAcid == null && translateNCodons()) { 226 aminoAcid = unknownAminoAcidCompound; 227 } else { 228 if (first && initMetOnly && target.isStart()) { 229 aminoAcid = methionineAminoAcidCompound; 230 } 231 } 232 233 addCompoundsToList(Arrays.asList(aminoAcid), workingList); 234 } 235 236 if (doTranslate && stopAtStopCodons && target.isStop()) { 237 // Check if we need to stop, but dont stop until started! 238 break; 239 } 240 241 first = false; 242 } 243 postProcessCompoundLists(workingList); 244 245 return workingListToSequences(workingList); 246 } 247 248 /** 249 * Performs the trimming of stop codons and the conversion of a valid start 250 * amino acid to M 251 */ 252 @Override 253 protected void postProcessCompoundLists( 254 List<List<AminoAcidCompound>> compoundLists) { 255 for (List<AminoAcidCompound> compounds : compoundLists) { 256 if (trimStops) { 257 trimStop(compounds); 258 } 259 } 260 } 261 262 /** 263 * Imperfect code. Checks the last amino acid to see if a codon could have 264 * translated a stop for it. Left in for the moment 265 */ 266 protected void trimStop(List<AminoAcidCompound> sequence) { 267 AminoAcidCompound stop = sequence.get(sequence.size() - 1); 268 boolean isStop = false; 269 if (aminoAcidToCodon.containsKey(stop)) { 270 for (Codon c : aminoAcidToCodon.get(stop)) { 271 if (c.isStop()) { 272 isStop = true; 273 break; 274 } 275 } 276 } 277 278 if (isStop) { 279 sequence.remove(sequence.size() - 1); 280 } 281 } 282 283 /** 284 * Indicates if we want to force exact translation of compounds or not i.e. 285 * those with internal N RNA bases. This will cause a translation to an X 286 * amino acid 287 */ 288 public boolean translateNCodons() { 289 return translateNCodons; 290 } 291}