/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on 01-21-2010
 */
package org.biojava.nbio.core.sequence.io;

import org.biojava.nbio.core.exceptions.ParserException;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
import org.biojava.nbio.core.sequence.io.util.ClasspathResource;
import org.biojava.nbio.core.sequence.io.util.IOUtils;
import org.biojava.nbio.core.sequence.template.AbstractCompoundSet;
import org.biojava.nbio.core.sequence.template.CompoundSet;
import org.biojava.nbio.core.sequence.transcription.Table;

import java.io.InputStream;
import java.util.*;


/**
 * Available translations
 *
 * <ul>
 * <li>1 - UNIVERSAL</li>
 * <li>2 - VERTEBRATE_MITOCHONDRIAL</li>
 * <li>3 - YEAST_MITOCHONDRIAL</li>
 * <li>4 - MOLD_MITOCHONDRIAL</li>
 * <li>5 - INVERTEBRATE_MITOCHONDRIAL</li>
 * <li>6 - CILIATE_NUCLEAR</li>
 * <li>9 - ECHINODERM_MITOCHONDRIAL</li>
 * <li>10 - EUPLOTID_NUCLEAR</li>
 * <li>11 - BACTERIAL</li>
 * <li>12 - ALTERNATIVE_YEAST_NUCLEAR</li>
 * <li>13 - ASCIDIAN_MITOCHONDRIAL</li>
 * <li>14 - FLATWORM_MITOCHONDRIAL</li>
 * <li>15 - BLEPHARISMA_MACRONUCLEAR</li>
 * <li>16 - 2CHLOROPHYCEAN_MITOCHONDRIAL</li>
 * <li>21 - TREMATODE_MITOCHONDRIAL</li>
 * <li>23 - SCENEDESMUS_MITOCHONDRIAL</li>
 * </ul>
 *
 * Taken from <a
 * href="https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c"
 * >NCBI</a> with slight modification and put into the classpath resource.
 *
 * Takes in an ID, name, amino acid string and the locations of amino acids
 * which act as start codons in the translation table. You can give the 3 codon
 * position strings that correspond to the amino acid string or, if you are
 * using the default IUPAC codes, you can use the hardcoded ones which are
 * consistent amongst all <a
 * href="https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c"> codon
 * tables</a>.
 *
 * The generated {@link IUPACTable} objects do not parse the data further until
 * requested so if you do not use a translation table your only penalty is the
 * loading of the IUPAC data from the classpath.
 *
 * <p>The lazily built caches in this class are not synchronized.
 *
 * @author Andy Yates
 */
public class IUPACParser {

    /** Holder for the shared instance; it is created when this class is initialised. */
    private static class IOD {
        public static final IUPACParser INSTANCE = new IUPACParser();
    }

    /**
     * Returns the shared parser that reads the classpath resource at
     * {@link #IUPAC_LOCATION}.
     */
    public static IUPACParser getInstance() {
        return IOD.INSTANCE;
    }

    public static final String IUPAC_LOCATION = "org/biojava/nbio/core/sequence/iupac.txt";

    private final InputStream is;
    private List<IUPACTable> tables;
    private Map<String, IUPACTable> nameLookup;
    private Map<Integer, IUPACTable> idLookup;

    /**
     * Default version and uses the classpath based IUPAC table
     */
    public IUPACParser() {
        //use the preCache version to make sure we don't keep a IO handle open
        is = new ClasspathResource(IUPAC_LOCATION, true).getInputStream();
    }

    /**
     * Allows you to specify a different IUPAC table.
     *
     * @param is stream holding the table definitions; it is read the first
     *           time the tables are requested
     */
    public IUPACParser(InputStream is) {
        this.is = is;
    }

    /**
     * Returns a list of all available IUPAC tables. The stream is parsed on
     * the first call and the resulting list is reused afterwards.
     */
    public List<IUPACTable> getTables() {
        if (tables == null) {
            tables = parseTables();
        }
        return tables;
    }

    /**
     * Returns a table by its name
     *
     * @return the matching table or {@code null} if no table has that name
     */
    public IUPACTable getTable(String name) {
        populateLookups();
        return nameLookup.get(name);
    }

    /**
     * Returns a table by its identifier i.e. 1 means universal codon tables
     *
     * @return the matching table or {@code null} if no table has that id
     */
    public IUPACTable getTable(Integer id) {
        populateLookups();
        return idLookup.get(id);
    }

    /**
     * Builds the name and id indexes on first use.
     *
     * <p>The maps are filled as locals and only stored in the fields once
     * every table has been indexed. If {@link #getTables()} throws, the fields
     * therefore stay {@code null} and the next lookup retries instead of
     * silently answering from empty maps.
     */
    private void populateLookups() {
        if (nameLookup != null && idLookup != null) {
            return;
        }
        Map<String, IUPACTable> byName = new HashMap<>();
        Map<Integer, IUPACTable> byId = new HashMap<>();
        for (IUPACTable table : getTables()) {
            byName.put(table.getName(), table);
            byId.put(table.getId(), table);
        }
        idLookup = byId;
        nameLookup = byName;
    }

    /**
     * Reads every line from the input stream and turns each record into an
     * {@link IUPACTable}.
     *
     * <p>A record is a run of {@code key = value} lines closed by a line
     * holding only {@code //}. The keys {@code AAs}, {@code Starts},
     * {@code Base1}, {@code Base2} and {@code Base3} fill the matching table
     * fields; any other key is taken as the table name with its value parsed
     * as the integer id. Lines are trimmed and blank lines are skipped.
     * Content after the last {@code //} is not turned into a table.
     *
     * @throws ParserException if a non-blank line has no {@code key = value}
     *         form, or a record is closed before a name/id line was seen
     * @throws NumberFormatException if a table id is not an integer
     */
    private List<IUPACTable> parseTables() {
        List<IUPACTable> localTables = new ArrayList<>();
        List<String> lines = IOUtils.getList(is);
        Integer id = null;
        String name = null;
        String aa = null;
        String starts = null;
        String baseone = null;
        String basetwo = null;
        String basethree = null;

        for (String rawLine : lines) {
            String line = rawLine.trim();
            if (line.isEmpty()) {
                continue;
            }

            if ("//".equals(line)) {
                if (id == null) {
                    // Without this check the unboxing of id below fails with a bare NullPointerException
                    throw new ParserException(
                            "Found the record terminator // before any NAME=ID line");
                }
                localTables.add(new IUPACTable(name, id, aa, starts, baseone, basetwo,
                        basethree));
                name = aa = starts = baseone = basetwo = basethree = null;
                id = null;
                continue;
            }

            String[] keyValue = line.split("\\s*=\\s*");
            if (keyValue.length < 2) {
                throw new ParserException("Cannot parse line as key = value: " + line);
            }

            String key = keyValue[0];
            String value = keyValue[1];
            switch (key) {
                case "AAs":
                    aa = value;
                    break;
                case "Starts":
                    starts = value;
                    break;
                case "Base1":
                    baseone = value;
                    break;
                case "Base2":
                    basetwo = value;
                    break;
                case "Base3":
                    basethree = value;
                    break;
                default:
                    // Any unrecognised key is the header line of a record: NAME=ID
                    name = key;
                    id = Integer.parseInt(value);
                    break;
            }
        }

        return localTables;
    }

    /**
     * Holds the concept of a codon table from the IUPAC format
     *
     * @author Andy Yates
     */
    public static class IUPACTable implements Table {

        private final Integer id;
        private final String name;
        private final String aminoAcidString;
        private final String startCodons;
        private final String baseOne;
        private final String baseTwo;
        private final String baseThree;

        private final List<Codon> codons = new ArrayList<>();
        private CompoundSet<Codon> compounds = null;

        /**
         * Creates a table from its raw string description. Position
         * {@code i} of each of the five strings describes codon {@code i}.
         *
         * @param name            table name
         * @param id              table identifier
         * @param aminoAcidString one amino acid character per codon; {@code *} marks a stop
         * @param startCodons     one character per codon; {@code M} marks a start codon
         * @param baseOne         first base of each codon
         * @param baseTwo         second base of each codon
         * @param baseThree       third base of each codon
         */
        public IUPACTable(String name, int id, String aminoAcidString,
                String startCodons, String baseOne, String baseTwo, String baseThree) {
            this.aminoAcidString = aminoAcidString;
            this.startCodons = startCodons;
            this.name = name;
            this.id = id;
            this.baseOne = baseOne;
            this.baseTwo = baseTwo;
            this.baseThree = baseThree;
        }

        /**
         * Constructor which uses the basic IUPAC codon table format. Useful
         * if you need to specify your own IUPAC table with minimal
         * definitions from your side.
         *
         * @param id table identifier; must not be {@code null} because it is
         *           unboxed when delegating to the full constructor
         */
        public IUPACTable(String name, Integer id, String aminoAcidString,
                String startCodons) {
            this(name, id, aminoAcidString, startCodons,
                    "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
                    "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
                    "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG");
        }

        public Integer getId() {
            return id;
        }

        public String getName() {
            return name;
        }

        /**
         * Returns true if the given compound was a start codon in this
         * codon table. This will report true if the compound could ever have
         * been a start codon.
         *
         * @throws IllegalStateException Thrown if
         * {@link #getCodons(CompoundSet, CompoundSet)} was not called first.
         */
        @Override
        public boolean isStart(AminoAcidCompound compound) {
            if (this.codons.isEmpty()) {
                throw new IllegalStateException("Codons are empty; please request getCodons() fist before asking this");
            }
            for (Codon codon : codons) {
                //Only start codons are of interest; then ask if the compound was encoded by it
                if (codon.isStart() && codon.getAminoAcid().equalsIgnoreCase(compound)) {
                    return true;
                }
            }
            return false;
        }

        /**
         * Returns a list of codons where the source and target compounds
         * are the same as those given by the parameters.
         *
         * <p>The list is built on the first call and cached; later calls
         * return the cached list whatever compound sets are passed. The
         * codons are assembled in a local list and only copied into the cache
         * once all of them were created, so a failure part-way through does
         * not leave a truncated table behind.
         *
         * @param nucleotides The nucleotide set to use when building BioJava
         * representations of codons
         * @param aminoAcids The target amino acid compounds objects
         * @throws ParserException if a base of a codon cannot be found in
         * {@code nucleotides}
         */
        @Override
        public List<Codon> getCodons(CompoundSet<NucleotideCompound> nucleotides,
                CompoundSet<AminoAcidCompound> aminoAcids) {

            if (this.codons.isEmpty()) {
                List<String> aminoAcidStrings = aminoAcids();
                List<String> startCodonStrings = startCodons();
                List<List<String>> codonStrings = codonStrings();

                List<Codon> built = new ArrayList<>(aminoAcidStrings.size());
                for (int i = 0; i < aminoAcidStrings.size(); i++) {
                    List<String> codonString = codonStrings.get(i);
                    NucleotideCompound one = getCompound(codonString, 0, nucleotides);
                    NucleotideCompound two = getCompound(codonString, 1, nucleotides);
                    NucleotideCompound three = getCompound(codonString, 2, nucleotides);
                    boolean start = "M".equals(startCodonStrings.get(i));
                    String aminoAcidSymbol = aminoAcidStrings.get(i);
                    boolean stop = "*".equals(aminoAcidSymbol);
                    AminoAcidCompound aminoAcid = aminoAcids
                            .getCompoundForString(aminoAcidSymbol);
                    built.add(new Codon(new CaseInsensitiveTriplet(one, two, three), aminoAcid, start, stop));
                }
                // Commit only after every codon was created successfully
                codons.addAll(built);
            }

            return codons;
        }

        /**
         * Looks up the compound for one base of a codon. If the set has no
         * compound for {@code T}, the lookup is retried with {@code U}.
         *
         * @throws ParserException if no compound exists for any other base
         */
        private NucleotideCompound getCompound(List<String> compounds,
                int position, CompoundSet<NucleotideCompound> nucelotides) {
            String compound = compounds.get(position);
            NucleotideCompound returnCompound = nucelotides
                    .getCompoundForString(compound);
            if (returnCompound != null) {
                return returnCompound;
            }
            if ("T".equalsIgnoreCase(compound)) {
                return nucelotides.getCompoundForString("U");
            }
            throw new ParserException("Cannot find a compound for string "
                    + compound);
        }

        /**
         * Returns the compound set of codons. The set is created on the first
         * call from {@link #getCodons(CompoundSet, CompoundSet)} and cached;
         * later calls return the cached set whatever arguments are passed.
         */
        @Override
        public CompoundSet<Codon> getCodonCompoundSet(
                final CompoundSet<NucleotideCompound> rnaCompounds,
                final CompoundSet<AminoAcidCompound> aminoAcidCompounds) {
            if (compounds == null) {
                compounds = new AbstractCompoundSet<Codon>() {
                    {
                        // Instance initializer: fill the new set with this table's codons
                        for (Codon c : getCodons(rnaCompounds, aminoAcidCompounds)) {
                            addCompound(c);
                        }
                    }
                };
            }
            return compounds;
        }

        /**
         * Combines the three base strings position by position into
         * three-element lists, one list per codon.
         */
        private List<List<String>> codonStrings() {
            List<List<String>> triplets = new ArrayList<>();
            for (int i = 0; i < baseOne.length(); i++) {
                List<String> triplet = Arrays.asList(
                        Character.toString(baseOne.charAt(i)),
                        Character.toString(baseTwo.charAt(i)),
                        Character.toString(baseThree.charAt(i)));
                triplets.add(triplet);
            }
            return triplets;
        }

        private List<String> aminoAcids() {
            return split(aminoAcidString);
        }

        private List<String> startCodons() {
            return split(startCodons);
        }

        /** Breaks a string into a list of its single-character strings. */
        private List<String> split(String string) {
            List<String> split = new ArrayList<>();
            for (int i = 0; i < string.length(); i++) {
                split.add(Character.toString(string.charAt(i)));
            }
            return split;
        }
    }
}