Source code

001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.core.search.io.blast;
022
023import java.io.File;
024import java.io.FileInputStream;
025import java.io.FileReader;
026import java.io.IOException;
027import java.io.LineNumberReader;
028import java.text.ParseException;
029import java.util.ArrayList;
030import java.util.HashMap;
031import java.util.List;
032import java.util.Scanner;
033import org.biojava.nbio.core.search.io.Hit;
034import org.biojava.nbio.core.search.io.Hsp;
035import org.biojava.nbio.core.search.io.Result;
036import org.biojava.nbio.core.search.io.ResultFactory;
037import org.biojava.nbio.core.sequence.template.Sequence;
038import org.slf4j.Logger;
039import org.slf4j.LoggerFactory;
040import java.util.Map;
041
042/**
043 * Designed by Paolo Pavan.
044 * You may want to find my contacts on Github and LinkedIn for code info
045 * or discuss major changes.
046 * https://github.com/paolopavan
047 *
048 * @author Paolo Pavan
049 */
050
051public class BlastTabularParser implements ResultFactory {
052        private final String blastReference =
053                        "Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), A greedy algorithm for aligning DNA sequences&quot;, J Comput Biol 2000; 7(1-2):203-14.";
054        /**
055         * Tries to define a different level of consistency during parsing.
056         * LITERAL is intended a strict parsing much tight to the report.
057         * IMPROVED consistency tries to import data much tight to the data model
058         * (I hope you got the idea, if not, have a look to the code.
059         * I suggest to use improved unless you have reasons to do not)
060         */
061        private enum PARSING_CONSISTENCY {
062                IMPROVED,
063                LITERAL
064        }
065        private static final Logger log = LoggerFactory.getLogger(BlastTabularParser.class);
066
067
068        private File targetFile;
069        private int fileLinesCount;
070        private PARSING_CONSISTENCY parsingConsistency = PARSING_CONSISTENCY.IMPROVED;
071
072
073        // data imported private:
074        int queryIdNumber = 0;
075        Map<String, String> queryIdMapping = new HashMap<>();
076        String programName=null, queryName = null, databaseFile = null;
077        private String queryId      ;
078        private String subjectId    ;
079        private String percIdentity ;
080        private String alnLength    ;
081        private String mismatchCount;
082        private String gapOpenCount ;
083        private String queryStart   ;
084        private String queryEnd     ;
085        private String subjectStart ;
086        private String subjectEnd   ;
087        private String evalue       ;
088        private String bitScore     ;
089
090
091        @Override
092        public List<String> getFileExtensions() {
093                List<String> l = new ArrayList<>();
094                l.add("blasttabular");
095                l.add("blasttxt");
096                return l;
097        }
098
099        @Override
100        public void setFile(File f) {
101                targetFile = f;
102        }
103
104        @Override
105        public List<Result> createObjects(double maxEScore) throws IOException, ParseException {
106                List<Result> results = new ArrayList<>();
107
108                log.info("Query for hits");
109                LineNumberReader  lnr = new LineNumberReader(new FileReader(targetFile));
110                lnr.skip(Long.MAX_VALUE);
111                fileLinesCount = lnr.getLineNumber();
112                log.info(fileLinesCount + " hits approximately in all results");
113                lnr.close();
114
115                FileInputStream fileInputStream = new FileInputStream(targetFile);
116                Scanner scanner = new Scanner(fileInputStream);
117
118                String line = fetchData(scanner);
119                int lineNumber=0;
120                while (lineNumber < fileLinesCount){
121                        try {
122                                BlastResultBuilder resultBuilder = new BlastResultBuilder();
123                                resultBuilder
124                                                .setQueryID(queryId)
125                                                .setDbFile(databaseFile)
126                                                .setProgram(programName)
127                                                .setQueryDef(queryName)
128                                                .setReference(blastReference);
129
130                                List<Hit> hits = new ArrayList<>();
131
132                                String currentQueryId = queryId;
133                                while (currentQueryId.equals(queryId) && lineNumber < fileLinesCount){
134                                        BlastHitBuilder hitBuilder = new BlastHitBuilder();
135
136                                        List<Hsp> hsps = new ArrayList<>();
137
138                                        String currentSubjectId=subjectId;
139                                        while (currentSubjectId.equals(subjectId) && lineNumber < fileLinesCount){
140                                                if (Double.valueOf(evalue) > maxEScore) {
141                                                        line = fetchData(scanner);
142                                                        lineNumber++;
143                                                        continue;
144                                                }
145                                                BlastHspBuilder hspBuilder = new BlastHspBuilder();
146                                                hspBuilder
147                                                        .setHspAlignLen(Integer.valueOf(alnLength))
148                                                        .setHspGaps(Integer.valueOf(gapOpenCount))
149                                                        .setHspQueryFrom(Integer.valueOf(queryStart))
150                                                        .setHspQueryTo(Integer.valueOf(queryEnd))
151                                                        .setHspHitFrom(Integer.valueOf(subjectStart))
152                                                        .setHspHitTo(Integer.valueOf(subjectEnd))
153                                                        .setHspEvalue(Double.valueOf(evalue))
154                                                        .setHspBitScore(Double.valueOf(bitScore))
155                                                        .setPercentageIdentity(Double.valueOf(percIdentity)/100)
156                                                        .setMismatchCount(Integer.valueOf(mismatchCount));
157                                                hsps.add(hspBuilder.createBlastHsp());
158                                                if (scanner.hasNext()) line = fetchData(scanner);
159                                                lineNumber++;
160                                        }
161                                        hits.add(hitBuilder.setHsps(hsps).createBlastHit());
162                                }
163                                results.add(resultBuilder.setHits(hits).createBlastResult());
164                        } catch (NumberFormatException e) {
165                                throw new ParseException("Invalid numeric value met at line "+ lineNumber+" in:\n"+line,0);
166                        }
167                }
168                return results;
169        }
170
171        private String fetchData(Scanner scanner){
172                String line;
173                String[] split;
174
175                line = scanner.nextLine();
176                while (line.startsWith("#")){
177                        // blast tabular with header options contains some more informations
178                        if (line.matches("#\\s.?BLAST.+")) programName = line.replace("#\\s","");
179                        if (line.startsWith("# Query:")) queryName = line.replace("# Query: ","");
180                        if (line.startsWith("# Database:")) databaseFile = line.replace("# Database: ","");
181
182                        // needed because blast report can end with a comment...
183                        if (!scanner.hasNext()) return null;
184                        line = scanner.nextLine();
185                }
186
187                // Here, programName != null checks if there was a header in the file
188                boolean headerFound = programName != null;
189
190                split = line.split("\\t");
191                queryId      =split[0];
192                subjectId    =split[1];
193                percIdentity =split[2];
194                alnLength    =split[3];
195                mismatchCount=split[4];
196                gapOpenCount =split[5];
197                queryStart   =split[6];
198                queryEnd     =split[7];
199                subjectStart =split[8];
200                subjectEnd   =split[9];
201                evalue       =split[10];
202                bitScore     =split[11];
203
204                // blast tabular reports only the first word of the query name.
205                // If it was specified in the header it is better to use that definition
206                if (parsingConsistency == PARSING_CONSISTENCY.IMPROVED && headerFound) {
207                        if (queryIdMapping.get(queryId)==null) {
208                                queryIdNumber ++;
209                                queryIdMapping.put(queryId,"Query_" + queryIdNumber);
210                        }
211                        // If a complete definition of the query name was readed, than we can use
212                        // a queryID schema that is consistent with blast xml report
213                        queryId = queryIdMapping.get(queryId);
214                }
215                if (!headerFound) queryName = queryId;
216
217                return line;
218        }
219
220        @Override
221        public void storeObjects(List<Result> results) throws IOException, ParseException {
222                throw new UnsupportedOperationException("Not supported yet.");
223        }
224
225        /**
226         * Intended for use with run module.
227         * Although possible, does not make a lot of sense to have it with limited
228         * information such those in tabular report
229         * @param sequences
230         */
231        @Override
232        public void setQueryReferences(List<Sequence> sequences) {
233                throw new UnsupportedOperationException("Not supported for this parser.");
234        }
235        /**
236         * Intended for use with run module.
237         * Although possible, does not make a lot of sense to have it with limited
238         * information such those in tabular report
239         * @param sequences
240         */
241        @Override
242        public void setDatabaseReferences(List<Sequence> sequences) {
243                throw new UnsupportedOperationException("Not supported for this parser.");
244        }
245         /**
246         * Tries to define a different level of consistency during parsing.
247         * LITERAL is intended a strict parsing much tight to the report.
248         * IMPROVED consistency tries to import data much tight to the data model
249         * (I hope you got the idea, if not, have a look to the code.
250         * I suggest to use improved unless you have reasons to do not)
251         */
252        public void setParsingConsistency(PARSING_CONSISTENCY parsingConsistency) {
253                this.parsingConsistency = parsingConsistency;
254        }
255
256}