001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.symbol;
023
024import java.io.BufferedReader;
025import java.io.IOException;
026import java.io.InputStream;
027import java.io.InputStreamReader;
028import java.io.OutputStream;
029import java.io.PrintWriter;
030import java.util.HashMap;
031import java.util.Iterator;
032import java.util.List;
033import java.util.Map;
034import java.util.StringTokenizer;
035
036import javax.xml.parsers.DocumentBuilder;
037import javax.xml.parsers.DocumentBuilderFactory;
038
039import org.biojava.bio.BioException;
040import org.biojava.bio.dist.Count;
041import org.biojava.bio.dist.Distribution;
042import org.biojava.bio.dist.DistributionTools;
043import org.biojava.bio.dist.IndexedCount;
044import org.biojava.bio.seq.RNATools;
045import org.biojava.bio.seq.io.SymbolTokenization;
046import org.biojava.utils.ChangeVetoException;
047import org.biojava.utils.ClassTools;
048import org.biojava.utils.xml.PrettyXMLWriter;
049import org.biojava.utils.xml.XMLWriter;
050import org.w3c.dom.Document;
051import org.w3c.dom.Element;
052import org.w3c.dom.Node;
053import org.w3c.dom.NodeList;
054
055/**
056 * An utility class for codon preferences
057 *
058 * @author David Huen
059 * @author Mark Schreiber
060 * @since 1.3
061 */
062public class CodonPrefTools
063{
064    /**
065     * constants for model organisms
066     */
067    static String JUNIT = "jUnit use only!!!!";
068    /**
069     * Drosophila melanogaster codon preferences
070     */
071    public static String DROSOPHILA_MELANOGASTER_NUCLEAR = "Drosophila melanogaster";
072    /**
073     * Homo sapiens codon preferences
074     */
075    public static String MAN_NUCLEAR = "Homo sapiens";
076    /**
077     * Mus musculus codon preferences
078     */
079    public static String MOUSE_NUCLEAR = "Mus musculus";
080    /**
081     * Rattus norvegicus codon preferences
082     */
083    public static String RAT_NUCLEAR = "Rattus norvegicus";
084    /**
085     * Takifugu rubripes codon preferences
086     */
087    public static String FUGU_NUCLEAR = "Takifugu rubripes";
088    /**
089     * Caenorhabditis elegans codon preferences
090     */
091    public static String WORM_NUCLEAR = "Caenorhabditis elegans";
092    /**
093     * Saccharomyces cerevisiae codon preferences
094     */
095    public static String CEREVISIAE_NUCLEAR = "Saccharomyces cerevisiae";
096    /**
097     * Schizosaccharomyces pombe codon preferences
098     */
099    public static String POMBE_NUCLEAR = "Schizosaccharomyces pombe";
100    /**
101     * Escherichia coli codon preferences
102     */
103    public static String ECOLI = "Escherichia coli";
104
105    private static Map prefMap;
106
107    final private static AtomicSymbol [] cutg = new AtomicSymbol[64];
108
109    static {
110        prefMap = new HashMap();
111
112        loadCodonPreferences();
113
114        try {
115            loadCodonOrder();
116        }
117        catch (IllegalSymbolException ise) {}
118    }
119
120    private static class LoadEverythingSelector implements CodonPrefFilter
121    {
122        public boolean isRequired(String id) { return true; }
123        public void put(CodonPref codonPref)
124        {
125            prefMap.put(codonPref.getName(), codonPref);
126        }
127    }
128
129    /**
130     * get the specified codon preference.
131     */
132    public static CodonPref getCodonPreference(String id)
133    {
134        return (CodonPref) prefMap.get(id);
135    }
136
137    private static void loadCodonPreferences()
138    {
139        try {
140            // parse the predefined codon preferences
141            InputStream prefStream = ClassTools.getClassLoader(CodonPrefTools.class).getResourceAsStream(
142                "org/biojava/bio/symbol/CodonPrefTables.xml"
143            );
144
145            CodonPrefFilter select = new LoadEverythingSelector();
146            readFromXML(prefStream, select);
147        }
148        catch (Exception e) { e.printStackTrace(); }
149    }
150
151    /**
152     * returns an RNA dinucleotide alphabet.
153     * Used to represent the non-wobble bases in WobbleDistribution
154     */
155    public static FiniteAlphabet getDinucleotideAlphabet()
156    {
157        return (FiniteAlphabet)AlphabetManager.generateCrossProductAlphaFromName("(RNA x RNA)");
158    }
159
160    /**
161     * write out a specified CodonPref object in XML format.
162     */
163    public static void writeToXML(CodonPref codonPref, PrintWriter writer)
164        throws NullPointerException, IOException, IllegalSymbolException, BioException
165    {
166        XMLWriter xw = new PrettyXMLWriter(writer);
167
168        dumpToXML(codonPref, xw, true);
169
170        writer.flush();
171    }
172
173    /**
174     * reads a specified CodonPref from an file.
175     * @param name name of organism
176     */
177    public static CodonPref readFromXML(InputStream prefStream, String name)
178        throws BioException
179    {
180        CodonPrefFilter.ByName filter = new CodonPrefFilter.ByName(name);
181
182        readFromXML(prefStream, filter);
183
184        return filter.getCodonPref();
185    }
186
187    public static CodonPref[] readFromXML(InputStream prefStream) throws BioException{
188      CodonPrefFilter.AcceptAll filter = new CodonPrefFilter.AcceptAll();
189      readFromXML(prefStream, filter);
190
191      List l = filter.getCodonPrefs();
192      CodonPref[] cp = new CodonPref[l.size()];
193      return (CodonPref[])l.toArray(cp);
194    }
195
196    /**
197     * read an CodonPref XML stream and handle it with a CodonPrefFilter object.
198     */
199    public static void readFromXML(InputStream prefStream, CodonPrefFilter filter)
200        throws BioException
201    {
202        try {
203            DocumentBuilder parser = DocumentBuilderFactory.newInstance().newDocumentBuilder();
204            Document doc = parser.parse(prefStream);
205
206            // get tables for each species
207            NodeList children = doc.getDocumentElement().getChildNodes();
208
209            for (int i=0; i<children.getLength(); i++) {
210                Node cnode = children.item(i);
211
212                if (!(cnode instanceof Element)) continue;
213
214                Element child = (Element) cnode;
215
216                String name = child.getNodeName();
217
218                // the node must be a CodonPref record
219                if (!name.equals("CodonPref")) continue;
220
221                // pick up the id and genetic code
222                String codonPrefId = child.getAttribute("id");
223                String geneticCodeId = child.getAttribute("geneticCodeId");
224
225                // is this entry one we want?
226                if (!filter.isRequired(codonPrefId)) continue;
227
228                // now handle each codon frequency entry
229                NodeList freqs = child.getChildNodes();
230
231                // create a Count object for the job
232                Count freqCounts = new IndexedCount(RNATools.getCodonAlphabet());
233
234                for (int j=0; j < freqs.getLength(); j++) {
235                    // load each entry
236                    Node freq = freqs.item(j);
237
238                    if (!(freq instanceof Element)) continue;
239
240                    Element freqElement = (Element) freq;
241
242                    // get attributes
243                    String codonString = freqElement.getAttribute("codon");
244                    String freqString = freqElement.getAttribute("value");
245
246                    // create codon
247                    SymbolList codonSL = RNATools.createRNA(codonString);
248
249                    if (codonSL.length() !=3) throw new BioException("'" + codonString + "' is not a valid codon!");
250
251                    AtomicSymbol codon = (AtomicSymbol) RNATools.getCodonAlphabet().getSymbol(codonSL.toList());
252
253                    // recover frequency value too
254                    double freqValue = Double.parseDouble(freqString);
255                    freqCounts.increaseCount(codon, freqValue);
256
257                }
258
259                // turn the Counts into a Distribution
260                Distribution freqDistribution = DistributionTools.countToDistribution(freqCounts);
261
262                // create a CodonPref object
263                CodonPref newCodonPref = new SimpleCodonPref(geneticCodeId, freqDistribution, codonPrefId);
264
265                filter.put(newCodonPref);
266            }
267        }
268        catch (Exception e) {
269            throw new BioException(e);
270        }
271    }
272
273    /**
274     * reads in a file in Codon Usage Database format and
275     * translate it into our XML format
276     * These can be obtained from the
277     * <a href="http://www.kazusa.or.jp/codon/">Codon Usage Database</a>.
278     * <p>
279     * Note that the output assumes that the universal genetic code is
280     * used as that is not encoded in the CUD files.  Edit the output appropriately
281     * to modify the genetic code if necessary.
282     */
283    public static void translateCUD(InputStream input, OutputStream output)
284        throws IOException
285    {
286        // create a BufferedReader for the job
287        BufferedReader rdr = new BufferedReader(new InputStreamReader(input));
288
289        // create a PrintWriter for the job
290        PrintWriter pw = new PrintWriter(output);
291        CodonPrefFilter.EverythingToXML filter = new CodonPrefFilter.EverythingToXML(pw);
292
293        // now invoke the CUD reader and stream its output to the XML writer
294        readFromCUD(rdr, filter);
295
296        filter.close();
297    }
298
299
300    /**
301     * converts a String representation of a codon to its Symbol
302     */
303    private static AtomicSymbol getCodon(String codonString)
304        throws IllegalSymbolException
305    {
306        return (AtomicSymbol) RNATools.getCodonAlphabet().getSymbol(RNATools.createRNA(codonString).toList());
307    }
308
309    private static void loadCodonOrder()
310        throws IllegalSymbolException
311    {
312        cutg[0] = getCodon("cga");
313        cutg[1] = getCodon("cgc");
314        cutg[2] = getCodon("cgg");
315        cutg[3] = getCodon("cgu");
316
317        cutg[4] = getCodon("aga");
318        cutg[5] = getCodon("agg");
319
320        cutg[6] = getCodon("cua");
321        cutg[7] = getCodon("cuc");
322        cutg[8] = getCodon("cug");
323        cutg[9] = getCodon("cuu");
324
325        cutg[10] = getCodon("uua");
326        cutg[11] = getCodon("uug");
327
328        cutg[12] = getCodon("uca");
329        cutg[13] = getCodon("ucc");
330        cutg[14] = getCodon("ucg");
331        cutg[15] = getCodon("ucu");
332
333        cutg[16] = getCodon("agc");
334        cutg[17] = getCodon("agu");
335
336        cutg[18] = getCodon("aca");
337        cutg[19] = getCodon("acc");
338        cutg[20] = getCodon("acg");
339        cutg[21] = getCodon("acu");
340
341        cutg[22] = getCodon("cca");
342        cutg[23] = getCodon("ccc");
343        cutg[24] = getCodon("ccg");
344        cutg[25] = getCodon("ccu");
345
346        cutg[26] = getCodon("gca");
347        cutg[27] = getCodon("gcc");
348        cutg[28] = getCodon("gcg");
349        cutg[29] = getCodon("gcu");
350
351        cutg[30] = getCodon("gga");
352        cutg[31] = getCodon("ggc");
353        cutg[32] = getCodon("ggg");
354        cutg[33] = getCodon("ggu");
355
356        cutg[34] = getCodon("gua");
357        cutg[35] = getCodon("guc");
358        cutg[36] = getCodon("gug");
359        cutg[37] = getCodon("guu");
360
361        cutg[38] = getCodon("aaa");
362        cutg[39] = getCodon("aag");
363
364        cutg[40] = getCodon("aac");
365        cutg[41] = getCodon("aau");
366
367        cutg[42] = getCodon("caa");
368        cutg[43] = getCodon("cag");
369
370        cutg[44] = getCodon("cac");
371        cutg[45] = getCodon("cau");
372
373        cutg[46] = getCodon("gaa");
374        cutg[47] = getCodon("gag");
375
376        cutg[48] = getCodon("gac");
377        cutg[49] = getCodon("gau");
378
379        cutg[50] = getCodon("uac");
380        cutg[51] = getCodon("uau");
381
382        cutg[52] = getCodon("ugc");
383        cutg[53] = getCodon("ugu");
384
385        cutg[54] = getCodon("uuc");
386        cutg[55] = getCodon("uuu");
387
388        cutg[56] = getCodon("aua");
389        cutg[57] = getCodon("auc");
390        cutg[58] = getCodon("auu");
391
392        cutg[59] = getCodon("aug");
393
394        cutg[60] = getCodon("ugg");
395
396        cutg[61] = getCodon("uaa");
397        cutg[62] = getCodon("uag");
398        cutg[63] = getCodon("uga");
399    }
400
401    private static String stringifyCodon(BasisSymbol codon)
402        throws IllegalSymbolException, BioException
403    {
404        // get the component symbols
405        List codonList = codon.getSymbols();
406
407        // get a tokenizer
408        SymbolTokenization toke = RNATools.getRNA().getTokenization("token");
409
410        String tokenizedCodon = toke.tokenizeSymbol((Symbol) codonList.get(0))
411            + toke.tokenizeSymbol((Symbol) codonList.get(1))
412            + toke.tokenizeSymbol((Symbol) codonList.get(2));
413
414        return tokenizedCodon;
415    }
416
417    /**
418     * writes out a CodonPref object in XML form
419     */
420    static void dumpToXML(CodonPref codonPref, XMLWriter xw, boolean writeWrapper)
421        throws NullPointerException, IOException, IllegalSymbolException, BioException
422    {
423        // validate both objects first
424        if ((codonPref == null) || (xw == null))
425            throw new NullPointerException();
426
427        // get the CodonPref Distribution
428        Distribution codonDist = codonPref.getFrequency();
429
430        // start <CodonPrefs>
431        if (writeWrapper) xw.openTag("CodonPrefs");
432
433        xw.openTag("CodonPref");
434        xw.attribute("id", codonPref.getName());
435        xw.attribute("geneticCodeId", codonPref.getGeneticCodeName());
436
437        // loop over all codons, writing out the stats
438        for (Iterator codonI = RNATools.getCodonAlphabet().iterator(); codonI.hasNext(); ) {
439            BasisSymbol codon = (BasisSymbol) codonI.next();
440
441            xw.openTag("frequency");
442
443            // convert codon to a three letter string
444            xw.attribute("codon", stringifyCodon(codon));
445            xw.attribute("value", Double.toString(codonDist.getWeight(codon)));
446
447            xw.closeTag("frequency");
448        }
449
450        xw.closeTag("CodonPref");
451
452        if (writeWrapper) xw.closeTag("CodonPrefs");
453    }
454
455    /**
456     * reads in records in CUD format
457     */
458    private static void readFromCUD(BufferedReader rdr, CodonPrefFilter filter)
459    {
460        try {
461            String currLine;
462            while ((currLine = rdr.readLine()) != null) {
463
464                // process comment line
465                StringTokenizer toke = new StringTokenizer(currLine, ":");
466                if (toke.hasMoreTokens()) {
467                    // get id string
468                    String id = (toke.nextToken()).trim();
469
470                    // read the codon count
471                    currLine = rdr.readLine();
472                    if (currLine == null) break;
473
474                    // do we even want to process this record?
475                    if (filter.isRequired(id)) {
476                        toke = new StringTokenizer(currLine);
477
478                        int idx = 0;
479                        IndexedCount count = new IndexedCount(RNATools.getCodonAlphabet());
480                        while (toke.hasMoreTokens()) {
481                            // check that I haven't read too many values!
482                            if (idx > 63) continue;
483                            count.increaseCount(cutg[idx], Double.parseDouble(toke.nextToken()));
484                            idx++;
485                        }
486
487                        if (idx != 64) continue;
488
489                        // ok, I now have the counts and the name, let's stash it
490                        Distribution codonDist = DistributionTools.countToDistribution(count);
491
492                        CodonPref codonPref = new SimpleCodonPref("UNIVERSAL", codonDist, id);
493                        filter.put(codonPref);
494                    }
495                }
496            }
497        }
498        catch (IOException ioe) {}
499        catch (IllegalSymbolException ise) {}
500        catch (IllegalAlphabetException iae) {}
501        catch (ChangeVetoException cve) {}
502        catch (BioException be) {}
503    }
504}
505