/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.genome.parsers.geneid;

import org.biojava.nbio.core.sequence.AccessionID;
import org.biojava.nbio.core.sequence.DNASequence;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.io.FastaWriterHelper;
import org.biojava.nbio.core.util.XMLHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

import java.io.File;
import java.util.ArrayList;
import java.util.LinkedHashMap;

/**
 * Loads an XML document and extracts the protein and cDNA sequences stored
 * under its {@code prediction/gene} elements, keyed by each gene's
 * {@code idGene} attribute.
 * <p>
 * NOTE(review): the class and package names suggest the input is the XML
 * output of the GeneID gene predictor; the code itself only relies on the
 * element paths and the {@code idGene} attribute named above.
 *
 * @author Scooter Willis &lt;willishf at gmail dot com&gt;
 */
public class GeneIDXMLReader {

    private static final Logger logger = LoggerFactory.getLogger(GeneIDXMLReader.class);

    /** Paths used by {@link #main(String[])} when no arguments are supplied. */
    private static final String DEFAULT_XML_PATH = "/Users/Scooter/scripps/dyadic/geneid/geneid/c1_geneid.xml";
    private static final String DEFAULT_PROTEIN_FASTA_PATH = "/Users/Scooter/scripps/dyadic/geneid/geneid/c1_geneid.faa";
    private static final String DEFAULT_DNA_FASTA_PATH = "/Users/Scooter/scripps/dyadic/geneid/geneid/c1_geneid.fna";

    /** Parsed document; assigned once in the constructor. */
    Document geneidDoc = null;

    /**
     * Parses the given XML file and keeps the resulting DOM document for the
     * sequence getters.
     *
     * @param geneidXMLFile path of the XML file to load
     * @throws Exception if {@code XMLHelper.loadXML} fails to read or parse the file
     */
    public GeneIDXMLReader(String geneidXMLFile) throws Exception {
        logger.info("Start read of {}", geneidXMLFile);
        geneidDoc = XMLHelper.loadXML(geneidXMLFile);
        logger.info("Read finished");
    }

    /**
     * Collects every {@code prediction/gene/protein} element of the document
     * as a {@link ProteinSequence}.
     * <p>
     * The sequence text has all non-word characters (whitespace, line breaks,
     * punctuation) removed. The enclosing gene's {@code idGene} attribute is
     * used both as the accession and as the map key; when the same key occurs
     * more than once the later sequence replaces the earlier one and a warning
     * is logged.
     *
     * @return the protein sequences in document order, keyed by {@code idGene}
     * @throws Exception if the element selection or sequence construction fails
     */
    public LinkedHashMap<String, ProteinSequence> getProteinSequences() throws Exception {
        LinkedHashMap<String, ProteinSequence> proteinSequenceList = new LinkedHashMap<>();
        ArrayList<Element> elementList = XMLHelper.selectElements(geneidDoc.getDocumentElement(), "prediction/gene/protein");
        logger.info("{} hits", elementList.size());

        for (Element proteinElement : elementList) {
            Element geneElement = (Element) proteinElement.getParentNode();
            ProteinSequence proteinSequence = new ProteinSequence(cleanSequenceText(proteinElement));
            String idGene = geneElement.getAttribute("idGene");
            proteinSequence.setAccession(new AccessionID(idGene));
            // put() returns the value it displaced; surface collisions instead of hiding them.
            if (proteinSequenceList.put(idGene, proteinSequence) != null) {
                logger.warn("Duplicate idGene '{}': earlier protein sequence replaced", idGene);
            }
        }

        return proteinSequenceList;
    }

    /**
     * Collects every {@code prediction/gene/cDNA} element of the document as a
     * {@link DNASequence}.
     * <p>
     * The sequence text has all non-word characters (whitespace, line breaks,
     * punctuation) removed. The enclosing gene's {@code idGene} attribute is
     * used both as the accession and as the map key; when the same key occurs
     * more than once the later sequence replaces the earlier one and a warning
     * is logged.
     *
     * @return the cDNA sequences in document order, keyed by {@code idGene}
     * @throws Exception if the element selection or sequence construction fails
     */
    public LinkedHashMap<String, DNASequence> getDNACodingSequences() throws Exception {
        LinkedHashMap<String, DNASequence> dnaSequenceList = new LinkedHashMap<>();
        ArrayList<Element> elementList = XMLHelper.selectElements(geneidDoc.getDocumentElement(), "prediction/gene/cDNA");
        logger.info("{} hits", elementList.size());

        for (Element dnaElement : elementList) {
            Element geneElement = (Element) dnaElement.getParentNode();
            DNASequence dnaSequence = new DNASequence(cleanSequenceText(dnaElement));
            String idGene = geneElement.getAttribute("idGene");
            dnaSequence.setAccession(new AccessionID(idGene));
            // put() returns the value it displaced; surface collisions instead of hiding them.
            if (dnaSequenceList.put(idGene, dnaSequence) != null) {
                logger.warn("Duplicate idGene '{}': earlier cDNA sequence replaced", idGene);
            }
        }

        return dnaSequenceList;
    }

    /**
     * Returns the text content of a sequence element with every non-word
     * character stripped, leaving only letters, digits and underscores.
     */
    private static String cleanSequenceText(Element sequenceElement) {
        return sequenceElement.getTextContent().replaceAll("\\W", "");
    }

    /**
     * Returns {@code args[index]} when present, otherwise {@code fallback}.
     * A {@code null} array is treated like an empty one.
     */
    private static String argOrDefault(String[] args, int index, String fallback) {
        return (args != null && args.length > index) ? args[index] : fallback;
    }

    /**
     * Converts an XML file into a protein FASTA file and a nucleotide FASTA
     * file.
     * <p>
     * Any failure is logged rather than propagated.
     *
     * @param args optional: {@code [0]} input XML path, {@code [1]} protein
     *             FASTA output path, {@code [2]} nucleotide FASTA output path;
     *             each missing argument falls back to the original built-in path
     */
    public static void main(String[] args) {
        String xmlPath = argOrDefault(args, 0, DEFAULT_XML_PATH);
        String proteinFastaPath = argOrDefault(args, 1, DEFAULT_PROTEIN_FASTA_PATH);
        String dnaFastaPath = argOrDefault(args, 2, DEFAULT_DNA_FASTA_PATH);

        try {
            GeneIDXMLReader geneIDXMLReader = new GeneIDXMLReader(xmlPath);
            LinkedHashMap<String, ProteinSequence> proteinSequenceHashMap = geneIDXMLReader.getProteinSequences();
            FastaWriterHelper.writeProteinSequence(new File(proteinFastaPath), proteinSequenceHashMap.values());

            LinkedHashMap<String, DNASequence> dnaSequenceHashMap = geneIDXMLReader.getDNACodingSequences();
            FastaWriterHelper.writeNucleotideSequence(new File(dnaFastaPath), dnaSequenceHashMap.values());

        } catch (Exception e) {
            logger.error("Exception: ", e);
        }
    }
}