/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */

package org.biojava.bio.symbol;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.biojava.bio.BioException;
import org.biojava.bio.dist.Count;
import org.biojava.bio.dist.Distribution;
import org.biojava.bio.dist.DistributionTools;
import org.biojava.bio.dist.IndexedCount;
import org.biojava.bio.seq.RNATools;
import org.biojava.bio.seq.io.SymbolTokenization;
import org.biojava.utils.ChangeVetoException;
import org.biojava.utils.ClassTools;
import org.biojava.utils.xml.PrettyXMLWriter;
import org.biojava.utils.xml.XMLWriter;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * A utility class for codon preferences.
 * <p>
 * On class initialisation the predefined tables in
 * <code>org/biojava/bio/symbol/CodonPrefTables.xml</code> are read and
 * registered under their names; they can then be fetched with
 * {@link #getCodonPreference(String)} using the organism constants below.
 *
 * @author David Huen
 * @author Mark Schreiber
 * @since 1.3
 */
public class CodonPrefTools
{
    // NOTE(review): the public name constants below are not final and can
    // therefore be reassigned by any caller.  They are left as they are
    // because tightening them would change the public interface.

    /**
     * constants for model organisms
     */
    static String JUNIT = "jUnit use only!!!!";
    /**
     * Drosophila melanogaster codon preferences
     */
    public static String DROSOPHILA_MELANOGASTER_NUCLEAR = "Drosophila melanogaster";
    /**
     * Homo sapiens codon preferences
     */
    public static String MAN_NUCLEAR = "Homo sapiens";
    /**
     * Mus musculus codon preferences
     */
    public static String MOUSE_NUCLEAR = "Mus musculus";
    /**
     * Rattus norvegicus codon preferences
     */
    public static String RAT_NUCLEAR = "Rattus norvegicus";
    /**
     * Takifugu rubripes codon preferences
     */
    public static String FUGU_NUCLEAR = "Takifugu rubripes";
    /**
     * Caenorhabditis elegans codon preferences
     */
    public static String WORM_NUCLEAR = "Caenorhabditis elegans";
    /**
     * Saccharomyces cerevisiae codon preferences
     */
    public static String CEREVISIAE_NUCLEAR = "Saccharomyces cerevisiae";
    /**
     * Schizosaccharomyces pombe codon preferences
     */
    public static String POMBE_NUCLEAR = "Schizosaccharomyces pombe";
    /**
     * Escherichia coli codon preferences
     */
    public static String ECOLI = "Escherichia coli";

    /** classpath location of the predefined codon preference tables */
    private static final String CODON_PREF_RESOURCE = "org/biojava/bio/symbol/CodonPrefTables.xml";

    /**
     * Codons in the order in which readFromCUD() assigns the values of a
     * count line: the n-th value of the line is counted for the n-th codon
     * listed here.
     */
    private static final String [] CUTG_CODON_ORDER = {
        "cga", "cgc", "cgg", "cgu",
        "aga", "agg",
        "cua", "cuc", "cug", "cuu",
        "uua", "uug",
        "uca", "ucc", "ucg", "ucu",
        "agc", "agu",
        "aca", "acc", "acg", "acu",
        "cca", "ccc", "ccg", "ccu",
        "gca", "gcc", "gcg", "gcu",
        "gga", "ggc", "ggg", "ggu",
        "gua", "guc", "gug", "guu",
        "aaa", "aag",
        "aac", "aau",
        "caa", "cag",
        "cac", "cau",
        "gaa", "gag",
        "gac", "gau",
        "uac", "uau",
        "ugc", "ugu",
        "uuc", "uuu",
        "aua", "auc", "auu",
        "aug",
        "ugg",
        "uaa", "uag", "uga"
    };

    /** CodonPref objects loaded at class initialisation, keyed by their name */
    private static Map prefMap;

    /** codon symbols for CUTG_CODON_ORDER, filled in by loadCodonOrder() */
    final private static AtomicSymbol [] cutg = new AtomicSymbol[64];

    static {
        prefMap = new HashMap();

        loadCodonPreferences();

        try {
            loadCodonOrder();
        }
        catch (IllegalSymbolException ise) {
            // Without the codon order table readFromCUD() would hit null
            // entries later on, so fail here where the cause is still known.
            throw new ExceptionInInitializerError(ise);
        }
    }

    /**
     * Filter that accepts every record and stores it in prefMap
     * under the name reported by the CodonPref.
     */
    private static class LoadEverythingSelector implements CodonPrefFilter
    {
        public boolean isRequired(String id) { return true; }
        public void put(CodonPref codonPref)
        {
            prefMap.put(codonPref.getName(), codonPref);
        }
    }

    /**
     * get the specified codon preference.
     *
     * @param id name of the codon preference, e.g. one of the organism
     *           constants of this class
     * @return the CodonPref registered under that name, or null if there
     *         is none
     */
    public static CodonPref getCodonPreference(String id)
    {
        return (CodonPref) prefMap.get(id);
    }

    /**
     * Loads the predefined codon preferences from the classpath resource
     * into prefMap.  Failures are reported on the standard error stream
     * and otherwise ignored, so that the class still initialises.
     */
    private static void loadCodonPreferences()
    {
        InputStream prefStream = null;

        try {
            // parse the predefined codon preferences
            prefStream = ClassTools.getClassLoader(CodonPrefTools.class).getResourceAsStream(
                CODON_PREF_RESOURCE
            );

            // getResourceAsStream() signals a missing resource with null
            if (prefStream == null) {
                throw new BioException("Couldn't locate resource " + CODON_PREF_RESOURCE);
            }

            CodonPrefFilter select = new LoadEverythingSelector();
            readFromXML(prefStream, select);
        }
        catch (Exception e) { e.printStackTrace(); }
        finally {
            if (prefStream != null) {
                try {
                    prefStream.close();
                }
                catch (IOException ignored) {
                    // the tables have been read (or the failure has already
                    // been reported); nothing useful can be done here
                }
            }
        }
    }

    /**
     * returns an RNA dinucleotide alphabet.
     * Used to represent the non-wobble bases in WobbleDistribution
     */
    public static FiniteAlphabet getDinucleotideAlphabet()
    {
        return (FiniteAlphabet)AlphabetManager.generateCrossProductAlphaFromName("(RNA x RNA)");
    }

    /**
     * write out a specified CodonPref object in XML format.
     * The record is wrapped in a CodonPrefs element and the writer is
     * flushed afterwards; it is not closed.
     *
     * @param codonPref the codon preference to write
     * @param writer    destination of the XML
     * @throws NullPointerException if codonPref is null
     */
    public static void writeToXML(CodonPref codonPref, PrintWriter writer)
        throws NullPointerException, IOException, IllegalSymbolException, BioException
    {
        XMLWriter xw = new PrettyXMLWriter(writer);

        dumpToXML(codonPref, xw, true);

        writer.flush();
    }

    /**
     * reads a specified CodonPref from a stream in the XML format.
     *
     * @param prefStream stream with the XML document
     * @param name name of organism
     * @return whatever the by-name filter collected for that name
     * @throws BioException if the stream cannot be parsed or holds invalid data
     */
    public static CodonPref readFromXML(InputStream prefStream, String name)
        throws BioException
    {
        CodonPrefFilter.ByName filter = new CodonPrefFilter.ByName(name);

        readFromXML(prefStream, filter);

        return filter.getCodonPref();
    }

    /**
     * reads all CodonPref records from a stream in the XML format.
     *
     * @param prefStream stream with the XML document
     * @return the records collected by the accept-all filter
     * @throws BioException if the stream cannot be parsed or holds invalid data
     */
    public static CodonPref[] readFromXML(InputStream prefStream) throws BioException{
        CodonPrefFilter.AcceptAll filter = new CodonPrefFilter.AcceptAll();
        readFromXML(prefStream, filter);

        List l = filter.getCodonPrefs();
        CodonPref[] cp = new CodonPref[l.size()];
        return (CodonPref[])l.toArray(cp);
    }

    /**
     * read a CodonPref XML stream and handle it with a CodonPrefFilter object.
     * <p>
     * Every CodonPref element directly below the document element is offered
     * to the filter by its id; for the ones the filter requires, the codon
     * frequencies are read, turned into a Distribution and handed to
     * <code>filter.put()</code>.
     *
     * @param prefStream stream with the XML document
     * @param filter     decides which records are read and receives them
     * @throws BioException if parsing fails or a record holds invalid data
     */
    public static void readFromXML(InputStream prefStream, CodonPrefFilter filter)
        throws BioException
    {
        try {
            DocumentBuilder parser = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            Document doc = parser.parse(prefStream);

            // get tables for each species
            NodeList children = doc.getDocumentElement().getChildNodes();

            for (int i=0; i<children.getLength(); i++) {
                Node cnode = children.item(i);

                if (!(cnode instanceof Element)) continue;

                Element child = (Element) cnode;

                String name = child.getNodeName();

                // the node must be a CodonPref record
                if (!name.equals("CodonPref")) continue;

                // pick up the id and genetic code
                String codonPrefId = child.getAttribute("id");
                String geneticCodeId = child.getAttribute("geneticCodeId");

                // is this entry one we want?
                if (!filter.isRequired(codonPrefId)) continue;

                // now handle each codon frequency entry
                NodeList freqs = child.getChildNodes();

                // create a Count object for the job
                Count freqCounts = new IndexedCount(RNATools.getCodonAlphabet());

                for (int j=0; j < freqs.getLength(); j++) {
                    // load each entry
                    Node freq = freqs.item(j);

                    if (!(freq instanceof Element)) continue;

                    Element freqElement = (Element) freq;

                    // get attributes
                    String codonString = freqElement.getAttribute("codon");
                    String freqString = freqElement.getAttribute("value");

                    // create codon
                    SymbolList codonSL = RNATools.createRNA(codonString);

                    if (codonSL.length() !=3) throw new BioException("'" + codonString + "' is not a valid codon!");

                    AtomicSymbol codon = (AtomicSymbol) RNATools.getCodonAlphabet().getSymbol(codonSL.toList());

                    // recover frequency value too
                    double freqValue = Double.parseDouble(freqString);
                    freqCounts.increaseCount(codon, freqValue);
                }

                // turn the Counts into a Distribution
                Distribution freqDistribution = DistributionTools.countToDistribution(freqCounts);

                // create a CodonPref object
                CodonPref newCodonPref = new SimpleCodonPref(geneticCodeId, freqDistribution, codonPrefId);

                filter.put(newCodonPref);
            }
        }
        catch (BioException be) {
            // already the declared type: pass it on as it is instead of
            // wrapping it in a second BioException
            throw be;
        }
        catch (Exception e) {
            throw new BioException(e);
        }
    }

    /**
     * reads in a file in Codon Usage Database format and
     * translate it into our XML format
     * These can be obtained from the
     * <a href="http://www.kazusa.or.jp/codon/">Codon Usage Database</a>.
     * <p>
     * Note that the output assumes that the universal genetic code is
     * used as that is not encoded in the CUD files. Edit the output appropriately
     * to modify the genetic code if necessary.
     *
     * @param input  stream with the Codon Usage Database records
     * @param output stream that receives the XML
     * @throws IOException if reading fails or a record cannot be converted
     */
    public static void translateCUD(InputStream input, OutputStream output)
        throws IOException
    {
        // create a BufferedReader for the job
        BufferedReader rdr = new BufferedReader(new InputStreamReader(input));

        // create a PrintWriter for the job
        PrintWriter pw = new PrintWriter(output);
        CodonPrefFilter.EverythingToXML filter = new CodonPrefFilter.EverythingToXML(pw);

        try {
            // now invoke the CUD reader and stream its output to the XML writer
            readFromCUD(rdr, filter);
        }
        finally {
            // close the filter even when reading failed part way through
            filter.close();
        }
    }


    /**
     * converts a String representation of a codon to its Symbol
     */
    private static AtomicSymbol getCodon(String codonString)
        throws IllegalSymbolException
    {
        return (AtomicSymbol) RNATools.getCodonAlphabet().getSymbol(RNATools.createRNA(codonString).toList());
    }

    /**
     * fills cutg with the codon symbols for CUTG_CODON_ORDER
     */
    private static void loadCodonOrder()
        throws IllegalSymbolException
    {
        for (int i = 0; i < CUTG_CODON_ORDER.length; i++) {
            cutg[i] = getCodon(CUTG_CODON_ORDER[i]);
        }
    }

    /**
     * converts a codon symbol to a string made of the RNA tokens
     * of its three component symbols
     */
    private static String stringifyCodon(BasisSymbol codon)
        throws IllegalSymbolException, BioException
    {
        // get the component symbols
        List codonList = codon.getSymbols();

        // get a tokenizer
        SymbolTokenization toke = RNATools.getRNA().getTokenization("token");

        String tokenizedCodon = toke.tokenizeSymbol((Symbol) codonList.get(0))
            + toke.tokenizeSymbol((Symbol) codonList.get(1))
            + toke.tokenizeSymbol((Symbol) codonList.get(2));

        return tokenizedCodon;
    }

    /**
     * writes out a CodonPref object in XML form
     *
     * @param codonPref    the codon preference to write
     * @param xw           the XML writer to write to
     * @param writeWrapper if true the record is enclosed in a CodonPrefs element
     * @throws NullPointerException if codonPref or xw is null
     */
    static void dumpToXML(CodonPref codonPref, XMLWriter xw, boolean writeWrapper)
        throws NullPointerException, IOException, IllegalSymbolException, BioException
    {
        // validate both objects first
        if ((codonPref == null) || (xw == null))
            throw new NullPointerException();

        // get the CodonPref Distribution
        Distribution codonDist = codonPref.getFrequency();

        // start <CodonPrefs>
        if (writeWrapper) xw.openTag("CodonPrefs");

        xw.openTag("CodonPref");
        xw.attribute("id", codonPref.getName());
        xw.attribute("geneticCodeId", codonPref.getGeneticCodeName());

        // loop over all codons, writing out the stats
        for (Iterator codonI = RNATools.getCodonAlphabet().iterator(); codonI.hasNext(); ) {
            BasisSymbol codon = (BasisSymbol) codonI.next();

            xw.openTag("frequency");

            // convert codon to a three letter string
            xw.attribute("codon", stringifyCodon(codon));
            xw.attribute("value", Double.toString(codonDist.getWeight(codon)));

            xw.closeTag("frequency");
        }

        xw.closeTag("CodonPref");

        if (writeWrapper) xw.closeTag("CodonPrefs");
    }

    /**
     * reads in records in CUD format
     * <p>
     * A record is a pair of lines: the text before the first ':' of the first
     * line is taken as the id, the second line holds whitespace separated
     * values, one per codon in CUTG_CODON_ORDER.  Records the filter does not
     * require and records that do not hold exactly one value per codon are
     * skipped.  A header line without a following count line ends the reading.
     *
     * @param rdr    source of the records
     * @param filter decides which records are processed and receives them
     * @throws IOException if reading fails or a record cannot be turned into
     *                     a CodonPref; in the latter case the original
     *                     exception is attached as the cause
     */
    private static void readFromCUD(BufferedReader rdr, CodonPrefFilter filter)
        throws IOException
    {
        String headerLine;
        while ((headerLine = rdr.readLine()) != null) {

            // process comment line
            StringTokenizer toke = new StringTokenizer(headerLine, ":");
            if (!toke.hasMoreTokens()) continue;

            // get id string
            String id = (toke.nextToken()).trim();

            try {
                // read the codon count
                String countLine = rdr.readLine();
                if (countLine == null) break;

                // do we even want to process this record?
                if (!filter.isRequired(id)) continue;

                toke = new StringTokenizer(countLine);

                // Exactly one value per codon is expected.  Checking the
                // number up front replaces a check made while reading the
                // values, which never consumed the surplus token and
                // therefore never terminated on over-long lines.
                if (toke.countTokens() != cutg.length) continue;

                IndexedCount count = new IndexedCount(RNATools.getCodonAlphabet());
                for (int idx = 0; idx < cutg.length; idx++) {
                    count.increaseCount(cutg[idx], Double.parseDouble(toke.nextToken()));
                }

                // ok, I now have the counts and the name, let's stash it
                Distribution codonDist = DistributionTools.countToDistribution(count);

                CodonPref codonPref = new SimpleCodonPref("UNIVERSAL", codonDist, id);
                filter.put(codonPref);
            }
            catch (IllegalSymbolException ise) { throw cudFailure(id, ise); }
            catch (IllegalAlphabetException iae) { throw cudFailure(id, iae); }
            catch (ChangeVetoException cve) { throw cudFailure(id, cve); }
            catch (BioException be) { throw cudFailure(id, be); }
        }
    }

    /**
     * builds the IOException that reports a record which could not be
     * converted, keeping the original exception as its cause
     */
    private static IOException cudFailure(String id, Exception cause)
    {
        IOException failure = new IOException(
            "Could not process Codon Usage Database record '" + id + "': " + cause.getMessage()
        );
        failure.initCause(cause);
        return failure;
    }
}