001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.genome.parsers.geneid;
022
import org.biojava.nbio.core.sequence.AccessionID;
import org.biojava.nbio.core.sequence.DNASequence;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.io.FastaWriterHelper;
import org.biojava.nbio.core.util.XMLHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

import java.io.File;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.regex.Pattern;
037
038/**
039 *
040 * @author Scooter Willis 
041 */
042public class GeneIDXMLReader {
043
044        private static final Logger logger = LoggerFactory.getLogger(GeneIDXMLReader.class);
045
046        Document geneidDoc = null;
047
048        public GeneIDXMLReader(String geneidXMLFile) throws Exception {
049                logger.info("Start read of {}", geneidXMLFile);
050                geneidDoc = XMLHelper.loadXML(geneidXMLFile);
051                logger.info("Read finished");
052        }
053
054        public LinkedHashMap<String, ProteinSequence> getProteinSequences() throws Exception {
055                LinkedHashMap<String, ProteinSequence> proteinSequenceList = new LinkedHashMap<>();
056                List<Element> elementList = XMLHelper.selectElements(geneidDoc.getDocumentElement(), "prediction/gene/protein");
057                logger.info("{} hits", elementList.size());
058
059                for (Element proteinElement : elementList) {
060                        Element geneElement = (Element) proteinElement.getParentNode();
061                        String sequence = proteinElement.getTextContent().replaceAll("\\W","");
062                        ProteinSequence proteinSequence = new ProteinSequence(sequence);
063                        String idGene = geneElement.getAttribute("idGene");
064                        proteinSequence.setAccession(new AccessionID(idGene));
065                        proteinSequenceList.put(idGene, proteinSequence);
066                }
067
068                return proteinSequenceList;
069        }
070
071        public LinkedHashMap<String, DNASequence> getDNACodingSequences() throws Exception {
072                LinkedHashMap<String, DNASequence> dnaSequenceList = new LinkedHashMap<>();
073                List<Element> elementList = XMLHelper.selectElements(geneidDoc.getDocumentElement(), "prediction/gene/cDNA");
074                logger.info("{} hits", elementList.size());
075
076                for (Element dnaElement : elementList) {
077                        Element geneElement = (Element) dnaElement.getParentNode();
078                        String sequence = dnaElement.getTextContent().replaceAll("\\W","");
079                        DNASequence dnaSequence = new DNASequence(sequence);
080                        String idGene = geneElement.getAttribute("idGene");
081                        dnaSequence.setAccession(new AccessionID(idGene));
082                        dnaSequenceList.put(idGene, dnaSequence);
083                }
084
085                return dnaSequenceList;
086        }
087
088        public static void main(String[] args) {
089                try {
090                        GeneIDXMLReader geneIDXMLReader = new GeneIDXMLReader("/Users/Scooter/scripps/dyadic/geneid/geneid/c1_geneid.xml");
091                        LinkedHashMap<String, ProteinSequence> proteinSequenceHashMap = geneIDXMLReader.getProteinSequences();
092                        FastaWriterHelper.writeProteinSequence(new File("/Users/Scooter/scripps/dyadic/geneid/geneid/c1_geneid.faa"), proteinSequenceHashMap.values());
093
094                        LinkedHashMap<String, DNASequence> dnaSequenceHashMap = geneIDXMLReader.getDNACodingSequences();
095                        FastaWriterHelper.writeNucleotideSequence(new File("/Users/Scooter/scripps/dyadic/geneid/geneid/c1_geneid.fna"), dnaSequenceHashMap.values());
096
097                } catch (Exception e) {
098                        logger.error("Exception: ", e);
099                }
100        }
101}