/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.core.search.io.blast;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Scanner;
import org.biojava.nbio.core.search.io.Hit;
import org.biojava.nbio.core.search.io.Hsp;
import org.biojava.nbio.core.search.io.Result;
import org.biojava.nbio.core.search.io.ResultFactory;
import org.biojava.nbio.core.sequence.template.Sequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Parser for BLAST reports written in tab-separated ("tabular") form, with or
 * without the {@code #}-prefixed header comments.
 * <p>
 * Consecutive records that share a query id are grouped into one
 * {@link Result}; within a result, consecutive records that share a subject id
 * are grouped into one {@link Hit}; every record that passes the e-value
 * filter becomes one {@link Hsp}.
 * <p>
 * Instances keep per-parse state in fields and are therefore not safe for
 * concurrent use.
 * <p>
 * Designed by Paolo Pavan.
 * You may want to find my contacts on Github and LinkedIn for code info
 * or to discuss major changes.
 * https://github.com/paolopavan
 *
 * @author Paolo Pavan
 */

public class BlastTabularParser implements ResultFactory {
	private final String blastReference =
			"Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), \"A greedy algorithm for aligning DNA sequences\", J Comput Biol 2000; 7(1-2):203-14.";

	/** Number of tab-separated fields read from every data record. */
	private static final int EXPECTED_COLUMNS = 12;

	/**
	 * Level of consistency applied during parsing.
	 * {@code LITERAL} keeps the query id exactly as written in the report.
	 * {@code IMPROVED} replaces it, when the report has a header, with a
	 * generated {@code Query_N} identifier so that ids follow the same schema
	 * as the BLAST XML report. {@code IMPROVED} is the default and is the
	 * suggested choice unless there is a reason not to use it.
	 */
	public enum PARSING_CONSISTENCY {
		IMPROVED,
		LITERAL
	}
	private static final Logger log = LoggerFactory.getLogger(BlastTabularParser.class);


	private File targetFile;
	private int fileLinesCount;
	private PARSING_CONSISTENCY parsingConsistency = PARSING_CONSISTENCY.IMPROVED;
	/** 1-based number of the last physical line read from the report. */
	private int lineNumber;


	// data imported from the report:
	int queryIdNumber = 0;
	HashMap<String,String> queryIdMapping = new HashMap<String,String>();
	String programName=null, queryName = null, databaseFile = null;
	private String queryId      ;
	private String subjectId    ;
	private String percIdentity ;
	private String alnLength    ;
	private String mismatchCount;
	private String gapOpenCount ;
	private String queryStart   ;
	private String queryEnd     ;
	private String subjectStart ;
	private String subjectEnd   ;
	private String evalue       ;
	private String bitScore     ;


	/**
	 * @return the file extensions this parser declares: {@code blasttabular}
	 *         and {@code blasttxt}
	 */
	@Override
	public List<String> getFileExtensions() {
		List<String> l = new ArrayList<String>();
		l.add("blasttabular");
		l.add("blasttxt");
		return l;
	}

	/**
	 * Sets the report file read by the next {@link #createObjects(double)} call.
	 *
	 * @param f the tabular BLAST report
	 */
	@Override
	public void setFile(File f) {
		targetFile = f;
	}

	/**
	 * Parses the file set with {@link #setFile(File)}.
	 * <p>
	 * Records whose e-value is greater than {@code maxEScore} do not produce
	 * an HSP; the hit and result they belong to are still created, possibly
	 * with an empty HSP list.
	 *
	 * @param maxEScore largest e-value an HSP may have to be kept
	 * @return one result per run of consecutive records sharing a query id;
	 *         empty when the file holds no data record
	 * @throws IOException if the file cannot be opened or read
	 * @throws ParseException if a record has fewer than
	 *         {@value #EXPECTED_COLUMNS} fields or a non-numeric value where a
	 *         number is required
	 */
	@Override
	public List<Result> createObjects(double maxEScore) throws IOException, ParseException {
		List<Result> results = new ArrayList<Result>();

		// Forget whatever a previous parse left behind, otherwise a header seen in an
		// earlier file would be attributed to a header-less one.
		queryIdNumber = 0;
		queryIdMapping.clear();
		programName = null;
		queryName = null;
		databaseFile = null;
		lineNumber = 0;

		log.info("Query for hits");
		fileLinesCount = countLines();
		log.info("{} hits approximately in all results", fileLinesCount);

		FileInputStream fileInputStream = new FileInputStream(targetFile);
		Scanner scanner = new Scanner(fileInputStream);
		// Current data record; null once the report is exhausted.
		String line = null;
		try {
			line = fetchData(scanner);
			while (line != null) {
				BlastResultBuilder resultBuilder = new BlastResultBuilder();
				resultBuilder
						.setQueryID(queryId)
						.setDbFile(databaseFile)
						.setProgram(programName)
						.setQueryDef(queryName)
						.setReference(blastReference);

				List<Hit> hits = new ArrayList<Hit>();

				String currentQueryId = queryId;
				while (line != null && currentQueryId.equals(queryId)) {
					BlastHitBuilder hitBuilder = new BlastHitBuilder();

					List<Hsp> hsps = new ArrayList<Hsp>();

					String currentSubjectId = subjectId;
					// The query id is checked as well: two consecutive queries may hit the
					// same subject and their HSPs must not end up in the same hit.
					while (line != null
							&& currentQueryId.equals(queryId)
							&& currentSubjectId.equals(subjectId)) {
						if (Double.parseDouble(evalue) <= maxEScore) {
							hsps.add(buildHsp());
						}
						line = fetchData(scanner);
					}
					hits.add(hitBuilder.setHsps(hsps).createBlastHit());
				}
				results.add(resultBuilder.setHits(hits).createBlastResult());
			}

			// Scanner swallows read errors; surface them instead of returning a
			// silently truncated result list.
			IOException readFailure = scanner.ioException();
			if (readFailure != null) {
				throw readFailure;
			}
		} catch (NumberFormatException e) {
			ParseException parseException = new ParseException(
					"Invalid numeric value met at line "+ lineNumber+" in:\n"+line,0);
			parseException.initCause(e);
			throw parseException;
		} finally {
			// Closing the scanner also closes the wrapped file stream.
			scanner.close();
		}
		return results;
	}

	/**
	 * Counts the line terminators of the target file. The value is only used
	 * as a rough estimate of the number of hits for logging.
	 */
	private int countLines() throws IOException {
		LineNumberReader lnr = new LineNumberReader(new FileReader(targetFile));
		try {
			lnr.skip(Long.MAX_VALUE);
			return lnr.getLineNumber();
		} finally {
			lnr.close();
		}
	}

	/**
	 * Builds an HSP from the fields of the current record.
	 *
	 * @throws NumberFormatException if a numeric field cannot be parsed
	 */
	private Hsp buildHsp() {
		BlastHspBuilder hspBuilder = new BlastHspBuilder();
		hspBuilder
				.setHspAlignLen(Integer.valueOf(alnLength))
				// NOTE(review): the report column holds gap openings, stored here as gaps.
				.setHspGaps(Integer.valueOf(gapOpenCount))
				.setHspQueryFrom(Integer.valueOf(queryStart))
				.setHspQueryTo(Integer.valueOf(queryEnd))
				.setHspHitFrom(Integer.valueOf(subjectStart))
				.setHspHitTo(Integer.valueOf(subjectEnd))
				.setHspEvalue(Double.valueOf(evalue))
				.setHspBitScore(Double.valueOf(bitScore))
				// the report gives a percentage, the model stores a fraction
				.setPercentageIdentity(Double.parseDouble(percIdentity)/100)
				.setMismatchCount(Integer.valueOf(mismatchCount));
		return hspBuilder.createBlastHsp();
	}

	/**
	 * Advances to the next data record and loads its columns into the record
	 * fields. Header comments met on the way update {@code programName},
	 * {@code queryName} and {@code databaseFile}; blank lines are skipped.
	 *
	 * @param scanner source of the report lines
	 * @return the raw text of the record, or {@code null} when no data record
	 *         is left (a report may end with a comment)
	 * @throws ParseException if the record has fewer than
	 *         {@value #EXPECTED_COLUMNS} tab-separated fields
	 */
	private String fetchData(Scanner scanner) throws ParseException {
		while (scanner.hasNextLine()) {
			String line = scanner.nextLine();
			lineNumber++;

			if (line.startsWith("#")) {
				// blast tabular with header options contains some more information
				if (line.matches("#\\s.?BLAST.+")) {
					// replaceFirst, not replace: the prefix is a regular expression
					programName = line.replaceFirst("#\\s","");
				}
				if (line.startsWith("# Query:")) {
					queryName = line.replace("# Query: ","");
				}
				if (line.startsWith("# Database:")) {
					databaseFile = line.replace("# Database: ","");
				}
				continue;
			}
			if (line.trim().isEmpty()) {
				continue;
			}

			loadRecord(line);
			return line;
		}
		return null;
	}

	/**
	 * Splits a data record on tabs and stores its first
	 * {@value #EXPECTED_COLUMNS} columns in the record fields.
	 */
	private void loadRecord(String line) throws ParseException {
		String[] split = line.split("\\t");
		if (split.length < EXPECTED_COLUMNS) {
			throw new ParseException("Expected at least " + EXPECTED_COLUMNS
					+ " tab separated values but found " + split.length
					+ " at line " + lineNumber + " in:\n" + line, 0);
		}

		// Here, programName != null checks if there was a header in the file
		boolean headerFound = programName != null;

		queryId      =split[0];
		subjectId    =split[1];
		percIdentity =split[2];
		alnLength    =split[3];
		mismatchCount=split[4];
		gapOpenCount =split[5];
		queryStart   =split[6];
		queryEnd     =split[7];
		subjectStart =split[8];
		subjectEnd   =split[9];
		evalue       =split[10];
		bitScore     =split[11];

		// blast tabular reports only the first word of the query name.
		// If it was specified in the header it is better to use that definition
		if (parsingConsistency == PARSING_CONSISTENCY.IMPROVED && headerFound) {
			if (queryIdMapping.get(queryId)==null) {
				queryIdNumber ++;
				queryIdMapping.put(queryId,"Query_" + queryIdNumber);
			}
			// If a complete definition of the query name was read, then we can use
			// a queryID schema that is consistent with the blast xml report
			queryId = queryIdMapping.get(queryId);
		}
		if (!headerFound) {
			queryName = queryId;
		}
	}

	/**
	 * Writing tabular reports is not implemented.
	 *
	 * @throws UnsupportedOperationException always
	 */
	@Override
	public void storeObjects(List<Result> results) throws IOException, ParseException {
		throw new UnsupportedOperationException("Not supported yet.");
	}

	/**
	 * Intended for use with the run module.
	 * Although possible, it does not make a lot of sense to have it with the
	 * limited information available in a tabular report.
	 *
	 * @param sequences ignored
	 * @throws UnsupportedOperationException always
	 */
	@Override
	public void setQueryReferences(List<Sequence> sequences) {
		throw new UnsupportedOperationException("Not supported for this parser.");
	}
	/**
	 * Intended for use with the run module.
	 * Although possible, it does not make a lot of sense to have it with the
	 * limited information available in a tabular report.
	 *
	 * @param sequences ignored
	 * @throws UnsupportedOperationException always
	 */
	@Override
	public void setDatabaseReferences(List<Sequence> sequences) {
		throw new UnsupportedOperationException("Not supported for this parser.");
	}
	/**
	 * Selects the level of consistency applied during parsing; see
	 * {@link PARSING_CONSISTENCY}. {@code IMPROVED} is suggested unless there
	 * is a reason not to use it.
	 *
	 * @param parsingConsistency the consistency level used by later parses
	 */
	public void setParsingConsistency(PARSING_CONSISTENCY parsingConsistency) {
		this.parsingConsistency = parsingConsistency;
	}

}