001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.core.search.io.blast;
022
023import java.io.File;
024import java.io.FileInputStream;
025import java.io.FileReader;
026import java.io.IOException;
027import java.io.LineNumberReader;
028import java.text.ParseException;
029import java.util.ArrayList;
030import java.util.HashMap;
031import java.util.List;
032import java.util.Scanner;
033import java.util.logging.Logger;
034import org.biojava.nbio.core.search.io.Hit;
035import org.biojava.nbio.core.search.io.Hsp;
036import org.biojava.nbio.core.search.io.Result;
037import org.biojava.nbio.core.search.io.ResultFactory;
038import org.biojava.nbio.core.sequence.template.Sequence;
039
040/**
041 * Designed by Paolo Pavan.
042 * You may want to find my contacts on Github and LinkedIn for code info
043 * or discuss major changes.
044 * https://github.com/paolopavan
045 *
046 * @author Paolo Pavan
047 */
048
049public class BlastTabularParser implements ResultFactory {
050        private final String blastReference =
051                        "Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), A greedy algorithm for aligning DNA sequences", J Comput Biol 2000; 7(1-2):203-14.";
052        /**
053         * Tries to define a different level of consistency during parsing.
054         * LITERAL is intended a strict parsing much tight to the report.
055         * IMPROVED consistency tries to import data much tight to the data model
056         * (I hope you got the idea, if not, have a look to the code.
057         * I suggest to use improved unless you have reasons to do not)
058         */
059        private enum PARSING_CONSISTENCY {
060                IMPROVED,
061                LITERAL
062        }
063        private static final Logger log = Logger.getLogger(BlastTabularParser.class.getName());
064
065
066        private File targetFile;
067        private int fileLinesCount;
068        private PARSING_CONSISTENCY parsingConsistency = PARSING_CONSISTENCY.IMPROVED;
069
070
071        // data imported private:
072        int queryIdNumber = 0;
073        HashMap<String,String> queryIdMapping = new HashMap<String,String>();
074        String programName=null, queryName = null, databaseFile = null;
075        private String queryId      ;
076        private String subjectId    ;
077        private String percIdentity ;
078        private String alnLength    ;
079        private String mismatchCount;
080        private String gapOpenCount ;
081        private String queryStart   ;
082        private String queryEnd     ;
083        private String subjectStart ;
084        private String subjectEnd   ;
085        private String evalue       ;
086        private String bitScore     ;
087
088
089        @Override
090        public List<String> getFileExtensions() {
091                List<String> l = new ArrayList<String>();
092                l.add("blasttabular");
093                l.add("blasttxt");
094                return l;
095        }
096
097        @Override
098        public void setFile(File f) {
099                targetFile = f;
100        }
101
102        @Override
103        public List<Result> createObjects(double maxEScore) throws IOException, ParseException {
104                List<Result> results = new ArrayList<Result>();
105
106                log.info("Query for hits");
107                LineNumberReader  lnr = new LineNumberReader(new FileReader(targetFile));
108                lnr.skip(Long.MAX_VALUE);
109                fileLinesCount = lnr.getLineNumber();
110                log.info(fileLinesCount + " hits approximately in all results");
111                lnr.close();
112
113                FileInputStream fileInputStream = new FileInputStream(targetFile);
114                Scanner scanner = new Scanner(fileInputStream);
115
116                String line = fetchData(scanner);
117                int lineNumber=0;
118                while (lineNumber < fileLinesCount){
119                        try {
120                                BlastResultBuilder resultBuilder = new BlastResultBuilder();
121                                resultBuilder
122                                                .setQueryID(queryId)
123                                                .setDbFile(databaseFile)
124                                                .setProgram(programName)
125                                                .setQueryDef(queryName)
126                                                .setReference(blastReference);
127
128                                List<Hit> hits = new ArrayList<Hit>();
129
130                                String currentQueryId = queryId;
131                                while (currentQueryId.equals(queryId) && lineNumber < fileLinesCount){
132                                        BlastHitBuilder hitBuilder = new BlastHitBuilder();
133
134                                        List<Hsp> hsps = new ArrayList<Hsp>();
135
136                                        String currentSubjectId=subjectId;
137                                        while (currentSubjectId.equals(subjectId) && lineNumber < fileLinesCount){
138                                                if (new Double(evalue) > maxEScore) {
139                                                        line = fetchData(scanner);
140                                                        lineNumber++;
141                                                        continue;
142                                                }
143                                                BlastHspBuilder hspBuilder = new BlastHspBuilder();
144                                                hspBuilder
145                                                        .setHspAlignLen(new Integer(alnLength))
146                                                        .setHspGaps(new Integer(gapOpenCount))
147                                                        .setHspQueryFrom(new Integer(queryStart))
148                                                        .setHspQueryTo(new Integer(queryEnd))
149                                                        .setHspHitFrom(new Integer(subjectStart))
150                                                        .setHspHitTo(new Integer(subjectEnd))
151                                                        .setHspEvalue(new Double(evalue))
152                                                        .setHspBitScore(new Double(bitScore))
153                                                        .setPercentageIdentity(new Double(percIdentity)/100)
154                                                        .setMismatchCount(new Integer(mismatchCount));
155                                                hsps.add(hspBuilder.createBlastHsp());
156                                                if (scanner.hasNext()) line = fetchData(scanner);
157                                                lineNumber++;
158                                        }
159                                        hits.add(hitBuilder.setHsps(hsps).createBlastHit());
160                                }
161                                results.add(resultBuilder.setHits(hits).createBlastResult());
162                        } catch (NumberFormatException e) {
163                                throw new ParseException("Invalid numeric value met at line "+ lineNumber+" in:\n"+line,0);
164                        }
165                }
166                return results;
167        }
168
169        private String fetchData(Scanner scanner){
170                String line;
171                String[] split;
172
173                line = scanner.nextLine();
174                while (line.startsWith("#")){
175                        // blast tabular with header options contains some more informations
176                        if (line.matches("#\\s.?BLAST.+")) programName = line.replace("#\\s","");
177                        if (line.startsWith("# Query:")) queryName = line.replace("# Query: ","");
178                        if (line.startsWith("# Database:")) databaseFile = line.replace("# Database: ","");
179
180                        // needed because blast report can end with a comment...
181                        if (!scanner.hasNext()) return null;
182                        line = scanner.nextLine();
183                }
184
185                // Here, programName != null checks if there was a header in the file
186                boolean headerFound = programName != null;
187
188                split = line.split("\\t");
189                queryId      =split[0];
190                subjectId    =split[1];
191                percIdentity =split[2];
192                alnLength    =split[3];
193                mismatchCount=split[4];
194                gapOpenCount =split[5];
195                queryStart   =split[6];
196                queryEnd     =split[7];
197                subjectStart =split[8];
198                subjectEnd   =split[9];
199                evalue       =split[10];
200                bitScore     =split[11];
201
202                // blast tabular reports only the first word of the query name.
203                // If it was specified in the header it is better to use that definition
204                if (parsingConsistency == PARSING_CONSISTENCY.IMPROVED && headerFound) {
205                        if (queryIdMapping.get(queryId)==null) {
206                                queryIdNumber ++;
207                                queryIdMapping.put(queryId,"Query_" + queryIdNumber);
208                        }
209                        // If a complete definition of the query name was readed, than we can use
210                        // a queryID schema that is consistent with blast xml report
211                        queryId = queryIdMapping.get(queryId);
212                }
213                if (!headerFound) queryName = queryId;
214
215                return line;
216        }
217
218        @Override
219        public void storeObjects(List<Result> results) throws IOException, ParseException {
220                throw new UnsupportedOperationException("Not supported yet.");
221        }
222
223        /**
224         * Intended for use with run module.
225         * Although possible, does not make a lot of sense to have it with limited
226         * information such those in tabular report
227         * @param sequences
228         */
229        @Override
230        public void setQueryReferences(List<Sequence> sequences) {
231                throw new UnsupportedOperationException("Not supported for this parser.");
232        }
233        /**
234         * Intended for use with run module.
235         * Although possible, does not make a lot of sense to have it with limited
236         * information such those in tabular report
237         * @param sequences
238         */
239        @Override
240        public void setDatabaseReferences(List<Sequence> sequences) {
241                throw new UnsupportedOperationException("Not supported for this parser.");
242        }
243         /**
244         * Tries to define a different level of consistency during parsing.
245         * LITERAL is intended a strict parsing much tight to the report.
246         * IMPROVED consistency tries to import data much tight to the data model
247         * (I hope you got the idea, if not, have a look to the code.
248         * I suggest to use improved unless you have reasons to do not)
249         */
250        public void setParsingConsistency(PARSING_CONSISTENCY parsingConsistency) {
251                this.parsingConsistency = parsingConsistency;
252        }
253
254}