001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.genome.parsers.geneid; 022 023import org.biojava.nbio.core.sequence.AccessionID; 024import org.biojava.nbio.core.sequence.DNASequence; 025import org.biojava.nbio.core.sequence.ProteinSequence; 026import org.biojava.nbio.core.sequence.io.FastaWriterHelper; 027import org.biojava.nbio.core.util.XMLHelper; 028import org.slf4j.Logger; 029import org.slf4j.LoggerFactory; 030import org.w3c.dom.Document; 031import org.w3c.dom.Element; 032 033import java.io.File; 034import java.util.ArrayList; 035import java.util.LinkedHashMap; 036import java.util.List; 037 038/** 039 * 040 * @author Scooter Willis 041 */ 042public class GeneIDXMLReader { 043 044 private static final Logger logger = LoggerFactory.getLogger(GeneIDXMLReader.class); 045 046 Document geneidDoc = null; 047 048 public GeneIDXMLReader(String geneidXMLFile) throws Exception { 049 logger.info("Start read of {}", geneidXMLFile); 050 geneidDoc = XMLHelper.loadXML(geneidXMLFile); 051 logger.info("Read finished"); 052 } 053 054 public LinkedHashMap<String, ProteinSequence> getProteinSequences() throws Exception { 055 LinkedHashMap<String, ProteinSequence> proteinSequenceList = new LinkedHashMap<>(); 056 List<Element> elementList = XMLHelper.selectElements(geneidDoc.getDocumentElement(), "prediction/gene/protein"); 057 logger.info("{} hits", elementList.size()); 058 059 for (Element proteinElement : elementList) { 060 Element geneElement = (Element) proteinElement.getParentNode(); 061 String sequence = proteinElement.getTextContent().replaceAll("\\W",""); 062 ProteinSequence proteinSequence = new ProteinSequence(sequence); 063 String idGene = geneElement.getAttribute("idGene"); 064 proteinSequence.setAccession(new AccessionID(idGene)); 065 proteinSequenceList.put(idGene, proteinSequence); 066 } 067 068 return proteinSequenceList; 069 } 070 071 public LinkedHashMap<String, DNASequence> getDNACodingSequences() throws Exception { 072 LinkedHashMap<String, DNASequence> dnaSequenceList = new LinkedHashMap<>(); 073 List<Element> elementList = XMLHelper.selectElements(geneidDoc.getDocumentElement(), "prediction/gene/cDNA"); 074 logger.info("{} hits", elementList.size()); 075 076 for (Element dnaElement : elementList) { 077 Element geneElement = (Element) dnaElement.getParentNode(); 078 String sequence = dnaElement.getTextContent().replaceAll("\\W",""); 079 DNASequence dnaSequence = new DNASequence(sequence); 080 String idGene = geneElement.getAttribute("idGene"); 081 dnaSequence.setAccession(new AccessionID(idGene)); 082 dnaSequenceList.put(idGene, dnaSequence); 083 } 084 085 return dnaSequenceList; 086 } 087 088 public static void main(String[] args) { 089 try { 090 GeneIDXMLReader geneIDXMLReader = new GeneIDXMLReader("/Users/Scooter/scripps/dyadic/geneid/geneid/c1_geneid.xml"); 091 LinkedHashMap<String, ProteinSequence> proteinSequenceHashMap = geneIDXMLReader.getProteinSequences(); 092 FastaWriterHelper.writeProteinSequence(new File("/Users/Scooter/scripps/dyadic/geneid/geneid/c1_geneid.faa"), proteinSequenceHashMap.values()); 093 094 LinkedHashMap<String, DNASequence> dnaSequenceHashMap = geneIDXMLReader.getDNACodingSequences(); 095 FastaWriterHelper.writeNucleotideSequence(new File("/Users/Scooter/scripps/dyadic/geneid/geneid/c1_geneid.fna"), dnaSequenceHashMap.values()); 096 097 } catch (Exception e) { 098 logger.error("Exception: ", e); 099 } 100 } 101}