001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.core.search.io.blast; 022 023 024import org.biojava.nbio.core.search.io.Hit; 025import org.biojava.nbio.core.search.io.Hsp; 026import org.biojava.nbio.core.search.io.Result; 027import org.biojava.nbio.core.search.io.ResultFactory; 028import java.io.File; 029import java.io.IOException; 030import java.text.ParseException; 031import java.util.ArrayList; 032import java.util.HashMap; 033import java.util.List; 034import java.util.Map; 035import javax.xml.parsers.ParserConfigurationException; 036import javax.xml.xpath.XPathException; 037import org.biojava.nbio.core.sequence.template.Sequence; 038import org.biojava.nbio.core.util.XMLHelper; 039import org.slf4j.LoggerFactory; 040import org.w3c.dom.Document; 041import org.w3c.dom.Element; 042import org.xml.sax.SAXException; 043 044/** 045 * Re-designed by Paolo Pavan on the footprint of: 046 * org.biojava.nbio.genome.query.BlastXMLQuery by Scooter Willis <willishf at gmail dot com> 047 * 048 * You may want to find my contacts on Github and LinkedIn for code info 049 * or discuss major changes. 050 * https://github.com/paolopavan 051 * 052 * 053 * @author Paolo Pavan 054 */ 055public class BlastXMLParser implements ResultFactory { 056 private static final org.slf4j.Logger logger = LoggerFactory.getLogger(Hsp.class); 057 Document blastDoc = null; 058 private File targetFile; 059 private List<Sequence> queryReferences, databaseReferences; 060 private Map<String,Sequence> queryReferencesMap, databaseReferencesMap; 061 062 public BlastXMLParser() { 063 064 } 065 @Override 066 public void setFile(File f){ 067 targetFile = f; 068 } 069 070 private void readFile(String blastFile) throws IOException, ParseException{ 071 logger.info("Start reading " + blastFile); 072 try { 073 blastDoc = XMLHelper.loadXML(blastFile); 074 } catch (SAXException ex) { 075 logger.error("A parsing error has occurred while reading XML blast file"); 076 throw new ParseException(ex.getMessage(),0); 077 } catch (ParserConfigurationException ex) { 078 logger.error("Internal XML parser non properly configured"); 079 throw new ParseException(ex.getMessage(),0); 080 } 081 logger.info("Read finished"); 082 } 083 084 @Override 085 public List<Result> createObjects(double maxEScore) throws IOException, ParseException { 086 if (targetFile == null) throw new IllegalStateException("File to be parsed not specified."); 087 088 // getAbsolutePath throws SecurityException 089 readFile(targetFile.getAbsolutePath()); 090 // create mappings between sequences and blast id 091 mapIds(); 092 093 ArrayList<Result> resultsCollection; 094 ArrayList<Hit> hitsCollection; 095 ArrayList<Hsp> hspsCollection; 096 097 try { 098 // select top level elements 099 String program = XMLHelper.selectSingleElement(blastDoc.getDocumentElement(),"BlastOutput_program").getTextContent(); 100 String version = XMLHelper.selectSingleElement(blastDoc.getDocumentElement(),"BlastOutput_version").getTextContent(); 101 String reference = XMLHelper.selectSingleElement(blastDoc.getDocumentElement(),"BlastOutput_reference").getTextContent(); 102 String dbFile = XMLHelper.selectSingleElement(blastDoc.getDocumentElement(),"BlastOutput_db").getTextContent(); 103 104 logger.info("Query for hits in "+ targetFile); 105 ArrayList<Element> IterationsList = XMLHelper.selectElements(blastDoc.getDocumentElement(), "BlastOutput_iterations/Iteration[Iteration_hits]"); 106 logger.info(IterationsList.size() + " results"); 107 108 resultsCollection = new ArrayList<Result>(); 109 for (Element element : IterationsList) { 110 BlastResultBuilder resultBuilder = new BlastResultBuilder(); 111 // will add BlastOutput* key sections in the result object 112 resultBuilder 113 .setProgram(program) 114 .setVersion(version) 115 .setReference(reference) 116 .setDbFile(dbFile); 117 118 // Iteration* section keys: 119 resultBuilder 120 .setIterationNumber(new Integer(XMLHelper.selectSingleElement(element,"Iteration_iter-num").getTextContent())) 121 .setQueryID(XMLHelper.selectSingleElement(element,"Iteration_query-ID").getTextContent()) 122 .setQueryDef(XMLHelper.selectSingleElement(element, "Iteration_query-def").getTextContent()) 123 .setQueryLength(new Integer(XMLHelper.selectSingleElement(element,"Iteration_query-len").getTextContent())); 124 125 if (queryReferences != null) resultBuilder.setQuerySequence(queryReferencesMap.get( 126 XMLHelper.selectSingleElement(element,"Iteration_query-ID").getTextContent() 127 )); 128 129 130 131 Element iterationHitsElement = XMLHelper.selectSingleElement(element, "Iteration_hits"); 132 ArrayList<Element> hitList = XMLHelper.selectElements(iterationHitsElement, "Hit"); 133 134 hitsCollection = new ArrayList<Hit>(); 135 for (Element hitElement : hitList) { 136 BlastHitBuilder blastHitBuilder = new BlastHitBuilder(); 137 blastHitBuilder 138 .setHitNum(new Integer(XMLHelper.selectSingleElement(hitElement, "Hit_num").getTextContent())) 139 .setHitId(XMLHelper.selectSingleElement(hitElement, "Hit_id").getTextContent()) 140 .setHitDef(XMLHelper.selectSingleElement(hitElement, "Hit_def").getTextContent()) 141 .setHitAccession(XMLHelper.selectSingleElement(hitElement, "Hit_accession").getTextContent()) 142 .setHitLen(new Integer(XMLHelper.selectSingleElement(hitElement, "Hit_len").getTextContent())); 143 144 if (databaseReferences != null) blastHitBuilder.setHitSequence(databaseReferencesMap.get( 145 XMLHelper.selectSingleElement(hitElement, "Hit_id").getTextContent() 146 )); 147 148 Element hithspsElement = XMLHelper.selectSingleElement(hitElement, "Hit_hsps"); 149 ArrayList<Element> hspList = XMLHelper.selectElements(hithspsElement, "Hsp"); 150 151 hspsCollection = new ArrayList<Hsp>(); 152 for (Element hspElement : hspList) { 153 Double evalue = new Double(XMLHelper.selectSingleElement(hspElement, "Hsp_evalue").getTextContent()); 154 155 // add the new hsp only if it pass the specified threshold. It can save lot of memory and some parsing time 156 if (evalue <= maxEScore) { 157 BlastHspBuilder blastHspBuilder = new BlastHspBuilder(); 158 blastHspBuilder 159 .setHspNum(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_num").getTextContent())) 160 .setHspBitScore(new Double(XMLHelper.selectSingleElement(hspElement, "Hsp_bit-score").getTextContent())) 161 .setHspScore(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_score").getTextContent())) 162 .setHspEvalue(evalue) 163 .setHspQueryFrom(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_query-from").getTextContent())) 164 .setHspQueryTo(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_query-to").getTextContent())) 165 .setHspHitFrom(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_hit-from").getTextContent())) 166 .setHspHitTo(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_hit-to").getTextContent())) 167 .setHspQueryFrame(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_query-frame").getTextContent())) 168 .setHspHitFrame(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_hit-frame").getTextContent())) 169 .setHspIdentity(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_identity").getTextContent())) 170 .setHspPositive(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_positive").getTextContent())) 171 .setHspGaps(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_gaps").getTextContent())) 172 .setHspAlignLen(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_align-len").getTextContent())) 173 .setHspQseq(XMLHelper.selectSingleElement(hspElement, "Hsp_qseq").getTextContent()) 174 .setHspHseq(XMLHelper.selectSingleElement(hspElement, "Hsp_hseq").getTextContent()) 175 .setHspIdentityString(XMLHelper.selectSingleElement(hspElement, "Hsp_midline").getTextContent()); 176 177 hspsCollection.add(blastHspBuilder.createBlastHsp()); 178 } 179 } 180 // finally set the computed hsp collection and create Hit object 181 blastHitBuilder.setHsps(hspsCollection); 182 hitsCollection.add(blastHitBuilder.createBlastHit()); 183 } 184 // finally set the computed Hit collection to the result 185 resultBuilder.setHits(hitsCollection); 186 resultsCollection.add(resultBuilder.createBlastResult()); 187 } 188 } catch (XPathException e) { 189 throw new ParseException(e.getMessage(),0); 190 } 191 logger.info("Parsing of "+targetFile+" finished."); 192 193 return resultsCollection; 194 } 195 196 @Override 197 public List<String> getFileExtensions(){ 198 ArrayList<String> extensions = new ArrayList<String>(1); 199 extensions.add("blastxml"); 200 return extensions; 201 } 202 203 @Override 204 public void setQueryReferences(List<Sequence> sequences) { 205 queryReferences = sequences; 206 } 207 208 @Override 209 public void setDatabaseReferences(List<Sequence> sequences) { 210 databaseReferences = sequences; 211 } 212 213 /** 214 * fill the map association between sequences an a unique id 215 */ 216 private void mapIds() { 217 if (queryReferences != null) { 218 queryReferencesMap = new HashMap<String,Sequence>(queryReferences.size()); 219 for (int counter=0; counter < queryReferences.size() ; counter ++){ 220 String id = "Query_"+(counter+1); 221 queryReferencesMap.put(id, queryReferences.get(counter)); 222 } 223 } 224 225 if (databaseReferences != null) { 226 databaseReferencesMap = new HashMap<String,Sequence>(databaseReferences.size()); 227 for (int counter=0; counter < databaseReferences.size() ; counter ++){ 228 // this is strange: while Query_id are 1 based, Hit (database) id are 0 based 229 String id = "gnl|BL_ORD_ID|"+(counter); 230 databaseReferencesMap.put(id, databaseReferences.get(counter)); 231 } 232 } 233 } 234 235 @Override 236 public void storeObjects(List<Result> results) throws IOException, ParseException { 237 throw new UnsupportedOperationException("This parser does not support writing yet."); 238 } 239} 240 241 242class BlastHsp extends org.biojava.nbio.core.search.io.Hsp { 243 public BlastHsp(int hspNum, double hspBitScore, int hspScore, double hspEvalue, int hspQueryFrom, int hspQueryTo, int hspHitFrom, int hspHitTo, int hspQueryFrame, int hspHitFrame, int hspIdentity, int hspPositive, int hspGaps, int hspAlignLen, String hspQseq, String hspHseq, String hspIdentityString, Double percentageIdentity, Integer mismatchCount) { 244 super(hspNum, hspBitScore, hspScore, hspEvalue, hspQueryFrom, hspQueryTo, hspHitFrom, hspHitTo, hspQueryFrame, hspHitFrame, hspIdentity, hspPositive, hspGaps, hspAlignLen, hspQseq, hspHseq, hspIdentityString, percentageIdentity, mismatchCount); 245 } 246 247} 248 249class BlastHit extends org.biojava.nbio.core.search.io.Hit { 250 public BlastHit(int hitNum, String hitId, String hitDef, String hitAccession, int hitLen, List<Hsp> hitHsps, Sequence hitSequence) { 251 super(hitNum, hitId, hitDef, hitAccession, hitLen, hitHsps, hitSequence); 252 } 253 254}