001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.genome.parsers.geneid;
022
023import org.biojava.nbio.core.sequence.AccessionID;
024import org.biojava.nbio.core.sequence.DNASequence;
025import org.biojava.nbio.core.sequence.ProteinSequence;
026import org.biojava.nbio.core.sequence.io.FastaWriterHelper;
027import org.biojava.nbio.core.util.XMLHelper;
028import org.slf4j.Logger;
029import org.slf4j.LoggerFactory;
030import org.w3c.dom.Document;
031import org.w3c.dom.Element;
032
033import java.io.File;
034import java.util.ArrayList;
035import java.util.LinkedHashMap;
036
037/**
038 *
039 * @author Scooter Willis <willishf at gmail dot com>
040 */
041public class GeneIDXMLReader {
042
043        private static final Logger logger = LoggerFactory.getLogger(GeneIDXMLReader.class);
044
045        Document geneidDoc = null;
046
047        public GeneIDXMLReader(String geneidXMLFile) throws Exception {
048                logger.info("Start read of {}", geneidXMLFile);
049                geneidDoc = XMLHelper.loadXML(geneidXMLFile);
050                logger.info("Read finished");
051        }
052
053        public LinkedHashMap<String, ProteinSequence> getProteinSequences() throws Exception {
054                LinkedHashMap<String, ProteinSequence> proteinSequenceList = new LinkedHashMap<String, ProteinSequence>();
055                ArrayList<Element> elementList = XMLHelper.selectElements(geneidDoc.getDocumentElement(), "prediction/gene/protein");
056                logger.info("{} hits", elementList.size());
057
058                for (Element proteinElement : elementList) {
059                        Element geneElement = (Element) proteinElement.getParentNode();
060                        String sequence = proteinElement.getTextContent().replaceAll("\\W","");
061                        ProteinSequence proteinSequence = new ProteinSequence(sequence);
062                        String idGene = geneElement.getAttribute("idGene");
063                        proteinSequence.setAccession(new AccessionID(idGene));
064                        proteinSequenceList.put(idGene, proteinSequence);
065                }
066
067                return proteinSequenceList;
068        }
069
070        public LinkedHashMap<String, DNASequence> getDNACodingSequences() throws Exception {
071                LinkedHashMap<String, DNASequence> dnaSequenceList = new LinkedHashMap<String, DNASequence>();
072                ArrayList<Element> elementList = XMLHelper.selectElements(geneidDoc.getDocumentElement(), "prediction/gene/cDNA");
073                logger.info("{} hits", elementList.size());
074
075                for (Element dnaElement : elementList) {
076                        Element geneElement = (Element) dnaElement.getParentNode();
077                        String sequence = dnaElement.getTextContent().replaceAll("\\W","");
078                        DNASequence dnaSequence = new DNASequence(sequence);
079                        String idGene = geneElement.getAttribute("idGene");
080                        dnaSequence.setAccession(new AccessionID(idGene));
081                        dnaSequenceList.put(idGene, dnaSequence);
082                }
083
084                return dnaSequenceList;
085        }
086
087        public static void main(String[] args) {
088                try {
089                        GeneIDXMLReader geneIDXMLReader = new GeneIDXMLReader("/Users/Scooter/scripps/dyadic/geneid/geneid/c1_geneid.xml");
090                        LinkedHashMap<String, ProteinSequence> proteinSequenceHashMap = geneIDXMLReader.getProteinSequences();
091                        FastaWriterHelper.writeProteinSequence(new File("/Users/Scooter/scripps/dyadic/geneid/geneid/c1_geneid.faa"), proteinSequenceHashMap.values());
092
093                        LinkedHashMap<String, DNASequence> dnaSequenceHashMap = geneIDXMLReader.getDNACodingSequences();
094                        FastaWriterHelper.writeNucleotideSequence(new File("/Users/Scooter/scripps/dyadic/geneid/geneid/c1_geneid.fna"), dnaSequenceHashMap.values());
095
096                } catch (Exception e) {
097                        logger.error("Exception: ", e);
098                }
099        }
100}