001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.core.search.io.blast;
022
023
024import org.biojava.nbio.core.search.io.Hit;
025import org.biojava.nbio.core.search.io.Hsp;
026import org.biojava.nbio.core.search.io.Result;
027import org.biojava.nbio.core.search.io.ResultFactory;
028import java.io.File;
029import java.io.IOException;
030import java.text.ParseException;
031import java.util.ArrayList;
032import java.util.HashMap;
033import java.util.List;
034import java.util.Map;
035import javax.xml.parsers.ParserConfigurationException;
036import javax.xml.xpath.XPathException;
037import org.biojava.nbio.core.sequence.template.Sequence;
038import org.biojava.nbio.core.util.XMLHelper;
039import org.slf4j.LoggerFactory;
040import org.w3c.dom.Document;
041import org.w3c.dom.Element;
042import org.xml.sax.SAXException;
043
044/**
045 * Re-designed by Paolo Pavan on the footprint of:
046 * org.biojava.nbio.genome.query.BlastXMLQuery by Scooter Willis <willishf at gmail dot com>
047 *
048 * You may want to find my contacts on Github and LinkedIn for code info
049 * or discuss major changes.
050 * https://github.com/paolopavan
051 *
052 *
053 * @author Paolo Pavan
054 */
055public class BlastXMLParser implements ResultFactory {
056        private static final org.slf4j.Logger logger = LoggerFactory.getLogger(Hsp.class);
057        Document blastDoc = null;
058        private File targetFile;
059        private List<Sequence> queryReferences, databaseReferences;
060        private Map<String,Sequence> queryReferencesMap, databaseReferencesMap;
061
062        public BlastXMLParser() {
063
064        }
065        @Override
066        public void setFile(File f){
067                targetFile = f;
068        }
069
070        private void readFile(String blastFile) throws IOException, ParseException{
071                logger.info("Start reading " + blastFile);
072                try {
073                        blastDoc = XMLHelper.loadXML(blastFile);
074                } catch (SAXException ex) {
075                        logger.error("A parsing error has occurred while reading XML blast file");
076                        throw new ParseException(ex.getMessage(),0);
077                } catch (ParserConfigurationException ex) {
078                        logger.error("Internal XML parser non properly configured");
079                        throw new ParseException(ex.getMessage(),0);
080                }
081                logger.info("Read finished");
082        }
083
084        @Override
085        public List<Result> createObjects(double maxEScore) throws IOException, ParseException {
086                if (targetFile == null) throw new IllegalStateException("File to be parsed not specified.");
087
088                // getAbsolutePath throws SecurityException
089                readFile(targetFile.getAbsolutePath());
090                // create mappings between sequences and blast id
091                mapIds();
092
093                ArrayList<Result> resultsCollection;
094                ArrayList<Hit> hitsCollection;
095                ArrayList<Hsp> hspsCollection;
096
097                try {
098                        // select top level elements
099                        String program = XMLHelper.selectSingleElement(blastDoc.getDocumentElement(),"BlastOutput_program").getTextContent();
100                        String version = XMLHelper.selectSingleElement(blastDoc.getDocumentElement(),"BlastOutput_version").getTextContent();
101                        String reference = XMLHelper.selectSingleElement(blastDoc.getDocumentElement(),"BlastOutput_reference").getTextContent();
102                        String dbFile = XMLHelper.selectSingleElement(blastDoc.getDocumentElement(),"BlastOutput_db").getTextContent();
103
104                        logger.info("Query for hits in "+ targetFile);
105                        ArrayList<Element> IterationsList = XMLHelper.selectElements(blastDoc.getDocumentElement(), "BlastOutput_iterations/Iteration[Iteration_hits]");
106                        logger.info(IterationsList.size() + " results");
107
108                        resultsCollection = new ArrayList<Result>();
109                        for (Element element : IterationsList) {
110                                BlastResultBuilder resultBuilder = new BlastResultBuilder();
111                                // will add BlastOutput* key sections in the result object
112                                resultBuilder
113                                        .setProgram(program)
114                                        .setVersion(version)
115                                        .setReference(reference)
116                                        .setDbFile(dbFile);
117
118                                // Iteration* section keys:
119                                resultBuilder
120                                        .setIterationNumber(new Integer(XMLHelper.selectSingleElement(element,"Iteration_iter-num").getTextContent()))
121                                        .setQueryID(XMLHelper.selectSingleElement(element,"Iteration_query-ID").getTextContent())
122                                        .setQueryDef(XMLHelper.selectSingleElement(element, "Iteration_query-def").getTextContent())
123                                        .setQueryLength(new Integer(XMLHelper.selectSingleElement(element,"Iteration_query-len").getTextContent()));
124
125                                if (queryReferences != null) resultBuilder.setQuerySequence(queryReferencesMap.get(
126                                                XMLHelper.selectSingleElement(element,"Iteration_query-ID").getTextContent()
127                                ));
128
129
130
131                                Element iterationHitsElement = XMLHelper.selectSingleElement(element, "Iteration_hits");
132                                ArrayList<Element> hitList = XMLHelper.selectElements(iterationHitsElement, "Hit");
133
134                                hitsCollection = new ArrayList<Hit>();
135                                for (Element hitElement : hitList) {
136                                        BlastHitBuilder blastHitBuilder = new BlastHitBuilder();
137                                        blastHitBuilder
138                                                .setHitNum(new Integer(XMLHelper.selectSingleElement(hitElement, "Hit_num").getTextContent()))
139                                                .setHitId(XMLHelper.selectSingleElement(hitElement, "Hit_id").getTextContent())
140                                                .setHitDef(XMLHelper.selectSingleElement(hitElement, "Hit_def").getTextContent())
141                                                .setHitAccession(XMLHelper.selectSingleElement(hitElement, "Hit_accession").getTextContent())
142                                                .setHitLen(new Integer(XMLHelper.selectSingleElement(hitElement, "Hit_len").getTextContent()));
143
144                                        if (databaseReferences != null) blastHitBuilder.setHitSequence(databaseReferencesMap.get(
145                                                XMLHelper.selectSingleElement(hitElement, "Hit_id").getTextContent()
146                                        ));
147
148                                        Element hithspsElement = XMLHelper.selectSingleElement(hitElement, "Hit_hsps");
149                                        ArrayList<Element> hspList = XMLHelper.selectElements(hithspsElement, "Hsp");
150
151                                        hspsCollection = new ArrayList<Hsp>();
152                                        for (Element hspElement : hspList) {
153                                                Double evalue = new Double(XMLHelper.selectSingleElement(hspElement, "Hsp_evalue").getTextContent());
154
155                                                // add the new hsp only if it pass the specified threshold. It can save lot of memory and some parsing time
156                                                if (evalue <= maxEScore) {
157                                                        BlastHspBuilder blastHspBuilder = new BlastHspBuilder();
158                                                        blastHspBuilder
159                                                                .setHspNum(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_num").getTextContent()))
160                                                                .setHspBitScore(new Double(XMLHelper.selectSingleElement(hspElement, "Hsp_bit-score").getTextContent()))
161                                                                .setHspScore(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_score").getTextContent()))
162                                                                .setHspEvalue(evalue)
163                                                                .setHspQueryFrom(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_query-from").getTextContent()))
164                                                                .setHspQueryTo(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_query-to").getTextContent()))
165                                                                .setHspHitFrom(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_hit-from").getTextContent()))
166                                                                .setHspHitTo(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_hit-to").getTextContent()))
167                                                                .setHspQueryFrame(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_query-frame").getTextContent()))
168                                                                .setHspHitFrame(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_hit-frame").getTextContent()))
169                                                                .setHspIdentity(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_identity").getTextContent()))
170                                                                .setHspPositive(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_positive").getTextContent()))
171                                                                .setHspGaps(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_gaps").getTextContent()))
172                                                                .setHspAlignLen(new Integer(XMLHelper.selectSingleElement(hspElement, "Hsp_align-len").getTextContent()))
173                                                                .setHspQseq(XMLHelper.selectSingleElement(hspElement, "Hsp_qseq").getTextContent())
174                                                                .setHspHseq(XMLHelper.selectSingleElement(hspElement, "Hsp_hseq").getTextContent())
175                                                                .setHspIdentityString(XMLHelper.selectSingleElement(hspElement, "Hsp_midline").getTextContent());
176
177                                                        hspsCollection.add(blastHspBuilder.createBlastHsp());
178                                                }
179                                        }
180                                        // finally set the computed hsp collection and create Hit object
181                                        blastHitBuilder.setHsps(hspsCollection);
182                                        hitsCollection.add(blastHitBuilder.createBlastHit());
183                                }
184                                // finally set the computed Hit collection to the result
185                                resultBuilder.setHits(hitsCollection);
186                                resultsCollection.add(resultBuilder.createBlastResult());
187                        }
188                } catch (XPathException e) {
189                        throw new ParseException(e.getMessage(),0);
190                }
191                logger.info("Parsing of "+targetFile+" finished.");
192
193                return resultsCollection;
194        }
195
196        @Override
197        public List<String> getFileExtensions(){
198                ArrayList<String> extensions = new ArrayList<String>(1);
199                extensions.add("blastxml");
200                return extensions;
201        }
202
203        @Override
204        public void setQueryReferences(List<Sequence> sequences) {
205                queryReferences = sequences;
206        }
207
208        @Override
209        public void setDatabaseReferences(List<Sequence> sequences) {
210                databaseReferences = sequences;
211        }
212
213        /**
214         * fill the map association between sequences an a unique id
215         */
216        private void mapIds() {
217                if (queryReferences != null) {
218                        queryReferencesMap = new HashMap<String,Sequence>(queryReferences.size());
219                        for (int counter=0; counter < queryReferences.size() ; counter ++){
220                                String id = "Query_"+(counter+1);
221                                queryReferencesMap.put(id, queryReferences.get(counter));
222                        }
223                }
224
225                if (databaseReferences != null) {
226                        databaseReferencesMap = new HashMap<String,Sequence>(databaseReferences.size());
227                        for (int counter=0; counter < databaseReferences.size() ; counter ++){
228                                // this is strange: while Query_id are 1 based, Hit (database) id are 0 based
229                                String id = "gnl|BL_ORD_ID|"+(counter);
230                                databaseReferencesMap.put(id, databaseReferences.get(counter));
231                        }
232                }
233        }
234
235        @Override
236        public void storeObjects(List<Result> results) throws IOException, ParseException {
237                throw new UnsupportedOperationException("This parser does not support writing yet.");
238        }
239}
240
241
242class BlastHsp extends org.biojava.nbio.core.search.io.Hsp {
243        public BlastHsp(int hspNum, double hspBitScore, int hspScore, double hspEvalue, int hspQueryFrom, int hspQueryTo, int hspHitFrom, int hspHitTo, int hspQueryFrame, int hspHitFrame, int hspIdentity, int hspPositive, int hspGaps, int hspAlignLen, String hspQseq, String hspHseq, String hspIdentityString, Double percentageIdentity, Integer mismatchCount) {
244                super(hspNum, hspBitScore, hspScore, hspEvalue, hspQueryFrom, hspQueryTo, hspHitFrom, hspHitTo, hspQueryFrame, hspHitFrame, hspIdentity, hspPositive, hspGaps, hspAlignLen, hspQseq, hspHseq, hspIdentityString, percentageIdentity, mismatchCount);
245        }
246
247}
248
249class BlastHit extends org.biojava.nbio.core.search.io.Hit {
250        public BlastHit(int hitNum, String hitId, String hitDef, String hitAccession, int hitLen, List<Hsp> hitHsps, Sequence hitSequence) {
251                super(hitNum, hitId, hitDef, hitAccession, hitLen, hitHsps, hitSequence);
252        }
253
254}