001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.core.search.io.blast;
022
023import java.io.File;
024import java.io.FileInputStream;
025import java.io.FileReader;
026import java.io.IOException;
027import java.io.LineNumberReader;
028import java.text.ParseException;
029import java.util.ArrayList;
030import java.util.HashMap;
031import java.util.List;
032import java.util.Scanner;
033import org.biojava.nbio.core.search.io.Hit;
034import org.biojava.nbio.core.search.io.Hsp;
035import org.biojava.nbio.core.search.io.Result;
036import org.biojava.nbio.core.search.io.ResultFactory;
037import org.biojava.nbio.core.sequence.template.Sequence;
038import org.slf4j.Logger;
039import org.slf4j.LoggerFactory;
040
041/**
042 * Designed by Paolo Pavan.
043 * You may want to find my contacts on Github and LinkedIn for code info
044 * or discuss major changes.
045 * https://github.com/paolopavan
046 *
047 * @author Paolo Pavan
048 */
049
050public class BlastTabularParser implements ResultFactory {
051        private final String blastReference =
052                        "Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), A greedy algorithm for aligning DNA sequences", J Comput Biol 2000; 7(1-2):203-14.";
053        /**
054         * Tries to define a different level of consistency during parsing.
055         * LITERAL is intended a strict parsing much tight to the report.
056         * IMPROVED consistency tries to import data much tight to the data model
057         * (I hope you got the idea, if not, have a look to the code.
058         * I suggest to use improved unless you have reasons to do not)
059         */
060        private enum PARSING_CONSISTENCY {
061                IMPROVED,
062                LITERAL
063        }
064        private static final Logger log = LoggerFactory.getLogger(BlastTabularParser.class);
065
066
067        private File targetFile;
068        private int fileLinesCount;
069        private PARSING_CONSISTENCY parsingConsistency = PARSING_CONSISTENCY.IMPROVED;
070
071
072        // data imported private:
073        int queryIdNumber = 0;
074        HashMap<String,String> queryIdMapping = new HashMap<String,String>();
075        String programName=null, queryName = null, databaseFile = null;
076        private String queryId      ;
077        private String subjectId    ;
078        private String percIdentity ;
079        private String alnLength    ;
080        private String mismatchCount;
081        private String gapOpenCount ;
082        private String queryStart   ;
083        private String queryEnd     ;
084        private String subjectStart ;
085        private String subjectEnd   ;
086        private String evalue       ;
087        private String bitScore     ;
088
089
090        @Override
091        public List<String> getFileExtensions() {
092                List<String> l = new ArrayList<String>();
093                l.add("blasttabular");
094                l.add("blasttxt");
095                return l;
096        }
097
098        @Override
099        public void setFile(File f) {
100                targetFile = f;
101        }
102
103        @Override
104        public List<Result> createObjects(double maxEScore) throws IOException, ParseException {
105                List<Result> results = new ArrayList<Result>();
106
107                log.info("Query for hits");
108                LineNumberReader  lnr = new LineNumberReader(new FileReader(targetFile));
109                lnr.skip(Long.MAX_VALUE);
110                fileLinesCount = lnr.getLineNumber();
111                log.info(fileLinesCount + " hits approximately in all results");
112                lnr.close();
113
114                FileInputStream fileInputStream = new FileInputStream(targetFile);
115                Scanner scanner = new Scanner(fileInputStream);
116
117                String line = fetchData(scanner);
118                int lineNumber=0;
119                while (lineNumber < fileLinesCount){
120                        try {
121                                BlastResultBuilder resultBuilder = new BlastResultBuilder();
122                                resultBuilder
123                                                .setQueryID(queryId)
124                                                .setDbFile(databaseFile)
125                                                .setProgram(programName)
126                                                .setQueryDef(queryName)
127                                                .setReference(blastReference);
128
129                                List<Hit> hits = new ArrayList<Hit>();
130
131                                String currentQueryId = queryId;
132                                while (currentQueryId.equals(queryId) && lineNumber < fileLinesCount){
133                                        BlastHitBuilder hitBuilder = new BlastHitBuilder();
134
135                                        List<Hsp> hsps = new ArrayList<Hsp>();
136
137                                        String currentSubjectId=subjectId;
138                                        while (currentSubjectId.equals(subjectId) && lineNumber < fileLinesCount){
139                                                if (new Double(evalue) > maxEScore) {
140                                                        line = fetchData(scanner);
141                                                        lineNumber++;
142                                                        continue;
143                                                }
144                                                BlastHspBuilder hspBuilder = new BlastHspBuilder();
145                                                hspBuilder
146                                                        .setHspAlignLen(new Integer(alnLength))
147                                                        .setHspGaps(new Integer(gapOpenCount))
148                                                        .setHspQueryFrom(new Integer(queryStart))
149                                                        .setHspQueryTo(new Integer(queryEnd))
150                                                        .setHspHitFrom(new Integer(subjectStart))
151                                                        .setHspHitTo(new Integer(subjectEnd))
152                                                        .setHspEvalue(new Double(evalue))
153                                                        .setHspBitScore(new Double(bitScore))
154                                                        .setPercentageIdentity(new Double(percIdentity)/100)
155                                                        .setMismatchCount(new Integer(mismatchCount));
156                                                hsps.add(hspBuilder.createBlastHsp());
157                                                if (scanner.hasNext()) line = fetchData(scanner);
158                                                lineNumber++;
159                                        }
160                                        hits.add(hitBuilder.setHsps(hsps).createBlastHit());
161                                }
162                                results.add(resultBuilder.setHits(hits).createBlastResult());
163                        } catch (NumberFormatException e) {
164                                throw new ParseException("Invalid numeric value met at line "+ lineNumber+" in:\n"+line,0);
165                        }
166                }
167                return results;
168        }
169
170        private String fetchData(Scanner scanner){
171                String line;
172                String[] split;
173
174                line = scanner.nextLine();
175                while (line.startsWith("#")){
176                        // blast tabular with header options contains some more informations
177                        if (line.matches("#\\s.?BLAST.+")) programName = line.replace("#\\s","");
178                        if (line.startsWith("# Query:")) queryName = line.replace("# Query: ","");
179                        if (line.startsWith("# Database:")) databaseFile = line.replace("# Database: ","");
180
181                        // needed because blast report can end with a comment...
182                        if (!scanner.hasNext()) return null;
183                        line = scanner.nextLine();
184                }
185
186                // Here, programName != null checks if there was a header in the file
187                boolean headerFound = programName != null;
188
189                split = line.split("\\t");
190                queryId      =split[0];
191                subjectId    =split[1];
192                percIdentity =split[2];
193                alnLength    =split[3];
194                mismatchCount=split[4];
195                gapOpenCount =split[5];
196                queryStart   =split[6];
197                queryEnd     =split[7];
198                subjectStart =split[8];
199                subjectEnd   =split[9];
200                evalue       =split[10];
201                bitScore     =split[11];
202
203                // blast tabular reports only the first word of the query name.
204                // If it was specified in the header it is better to use that definition
205                if (parsingConsistency == PARSING_CONSISTENCY.IMPROVED && headerFound) {
206                        if (queryIdMapping.get(queryId)==null) {
207                                queryIdNumber ++;
208                                queryIdMapping.put(queryId,"Query_" + queryIdNumber);
209                        }
210                        // If a complete definition of the query name was readed, than we can use
211                        // a queryID schema that is consistent with blast xml report
212                        queryId = queryIdMapping.get(queryId);
213                }
214                if (!headerFound) queryName = queryId;
215
216                return line;
217        }
218
219        @Override
220        public void storeObjects(List<Result> results) throws IOException, ParseException {
221                throw new UnsupportedOperationException("Not supported yet.");
222        }
223
224        /**
225         * Intended for use with run module.
226         * Although possible, does not make a lot of sense to have it with limited
227         * information such those in tabular report
228         * @param sequences
229         */
230        @Override
231        public void setQueryReferences(List<Sequence> sequences) {
232                throw new UnsupportedOperationException("Not supported for this parser.");
233        }
234        /**
235         * Intended for use with run module.
236         * Although possible, does not make a lot of sense to have it with limited
237         * information such those in tabular report
238         * @param sequences
239         */
240        @Override
241        public void setDatabaseReferences(List<Sequence> sequences) {
242                throw new UnsupportedOperationException("Not supported for this parser.");
243        }
244         /**
245         * Tries to define a different level of consistency during parsing.
246         * LITERAL is intended a strict parsing much tight to the report.
247         * IMPROVED consistency tries to import data much tight to the data model
248         * (I hope you got the idea, if not, have a look to the code.
249         * I suggest to use improved unless you have reasons to do not)
250         */
251        public void setParsingConsistency(PARSING_CONSISTENCY parsingConsistency) {
252                this.parsingConsistency = parsingConsistency;
253        }
254
255}