/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.core.search.io.blast;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Scanner;
import org.biojava.nbio.core.search.io.Hit;
import org.biojava.nbio.core.search.io.Hsp;
import org.biojava.nbio.core.search.io.Result;
import org.biojava.nbio.core.search.io.ResultFactory;
import org.biojava.nbio.core.sequence.template.Sequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Map;

/**
 * Parser for tab-separated BLAST reports.
 * <p>
 * Every data row must carry at least twelve tab-separated columns, read in
 * this order: query id, subject id, percentage identity, alignment length,
 * mismatch count, gap-open count, query start, query end, subject start,
 * subject end, e-value and bit score. Lines starting with {@code #} are
 * treated as header comments; the program name, query name and database are
 * picked up from them when present.
 * <p>
 * Consecutive rows sharing a query id are grouped into one {@link Result};
 * within a result, consecutive rows sharing a subject id are grouped into
 * one {@link Hit}; every row becomes one {@link Hsp}.
 * <p>
 * Designed by Paolo Pavan.
 * You may want to find my contacts on Github and LinkedIn for code info
 * or discuss major changes.
 * https://github.com/paolopavan
 *
 * @author Paolo Pavan
 */
public class BlastTabularParser implements ResultFactory {

    /** Number of tab-separated columns read from every data row. */
    private static final int EXPECTED_COLUMNS = 12;

    // NOTE(review): the quotes around the article title were unescaped (and the
    // opening one was missing) in the text this literal was restored from, which
    // made the declaration uncompilable. Confirm against the upstream source
    // whether the title should be delimited by \" or by an entity such as &quot;.
    private static final String BLAST_REFERENCE =
            "Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), "
            + "\"A greedy algorithm for aligning DNA sequences\", "
            + "J Comput Biol 2000; 7(1-2):203-14.";

    /**
     * Selects how closely the parsed data should follow the report.
     * <ul>
     * <li>{@code LITERAL}: the query id is kept exactly as written in the
     * report.</li>
     * <li>{@code IMPROVED}: when the report has a header, each distinct query
     * id is replaced by a generated {@code Query_N} identifier, so that ids
     * follow the schema of the BLAST XML report. This is the default and the
     * suggested setting unless there is a reason not to use it.</li>
     * </ul>
     */
    public enum PARSING_CONSISTENCY {
        IMPROVED,
        LITERAL
    }

    private static final Logger log = LoggerFactory.getLogger(BlastTabularParser.class);

    private File targetFile;
    // Total number of physical lines in the file; used only for the informational log message.
    private int fileLinesCount;
    // Number of physical lines consumed so far; used to locate parse errors.
    private int currentLineNumber;
    private PARSING_CONSISTENCY parsingConsistency = PARSING_CONSISTENCY.IMPROVED;

    // State shared between fetchData and createObjects: header information ...
    int queryIdNumber = 0;
    Map<String, String> queryIdMapping = new HashMap<>();
    String programName = null, queryName = null, databaseFile = null;
    // ... and the raw column values of the most recently read data row.
    private String queryId;
    private String subjectId;
    private String percIdentity;
    private String alnLength;
    private String mismatchCount;
    private String gapOpenCount;
    private String queryStart;
    private String queryEnd;
    private String subjectStart;
    private String subjectEnd;
    private String evalue;
    private String bitScore;

    /**
     * Returns the file extensions handled by this parser.
     *
     * @return a new, modifiable list containing {@code "blasttabular"} and
     *         {@code "blasttxt"}
     */
    @Override
    public List<String> getFileExtensions() {
        List<String> l = new ArrayList<>();
        l.add("blasttabular");
        l.add("blasttxt");
        return l;
    }

    /**
     * Sets the report file that {@link #createObjects(double)} will read.
     *
     * @param f the tabular report to parse
     */
    @Override
    public void setFile(File f) {
        targetFile = f;
    }

    /**
     * Parses the file set with {@link #setFile(File)}.
     * <p>
     * Rows whose e-value is greater than {@code maxEScore} do not produce an
     * HSP; the hit they belong to is still reported, possibly with an empty
     * HSP list. A file without any data row yields an empty list.
     *
     * @param maxEScore the highest e-value an HSP may have to be retained
     * @return one result per run of consecutive rows sharing the same query id
     * @throws IOException if the file cannot be opened or read
     * @throws ParseException if a data row has fewer than twelve columns or
     *         holds a value that is not a valid number
     */
    @Override
    public List<Result> createObjects(double maxEScore) throws IOException, ParseException {
        // Start from a clean slate so that a previous run cannot leak its
        // header information or query numbering into this one.
        resetParsingState();

        log.info("Query for hits");
        fileLinesCount = countLines(targetFile);
        log.info("{} hits approximately in all results", fileLinesCount);

        List<Result> results = new ArrayList<>();
        try (FileInputStream fileInputStream = new FileInputStream(targetFile);
                Scanner scanner = new Scanner(fileInputStream)) {

            // The loops are driven by the availability of data rows
            // (line != null) rather than by a line count: comment lines
            // inflate the physical line count and must not cause the last
            // row to be processed again.
            String line = fetchData(scanner);
            try {
                while (line != null) {
                    BlastResultBuilder resultBuilder = new BlastResultBuilder();
                    resultBuilder
                            .setQueryID(queryId)
                            .setDbFile(databaseFile)
                            .setProgram(programName)
                            .setQueryDef(queryName)
                            .setReference(BLAST_REFERENCE);

                    List<Hit> hits = new ArrayList<>();

                    String currentQueryId = queryId;
                    while (line != null && currentQueryId.equals(queryId)) {
                        BlastHitBuilder hitBuilder = new BlastHitBuilder();

                        List<Hsp> hsps = new ArrayList<>();

                        String currentSubjectId = subjectId;
                        // The query id is checked here as well: two consecutive
                        // queries may hit the same subject and their rows must
                        // not end up in the same hit.
                        while (line != null
                                && currentQueryId.equals(queryId)
                                && currentSubjectId.equals(subjectId)) {
                            boolean filteredOut = Double.parseDouble(evalue) > maxEScore;
                            if (!filteredOut) {
                                hsps.add(buildHsp());
                            }
                            line = fetchData(scanner);
                        }
                        hits.add(hitBuilder.setHsps(hsps).createBlastHit());
                    }
                    results.add(resultBuilder.setHits(hits).createBlastResult());
                }
            } catch (NumberFormatException e) {
                ParseException parseException = new ParseException(
                        "Invalid numeric value met at line " + currentLineNumber + " in:\n" + line, 0);
                parseException.initCause(e);
                throw parseException;
            }

            // Scanner swallows read errors and simply reports "no more input";
            // surface them instead of returning a silently truncated result.
            IOException readFailure = scanner.ioException();
            if (readFailure != null) {
                throw readFailure;
            }
        }
        return results;
    }

    /** Clears the header information and query numbering left by a previous parse. */
    private void resetParsingState() {
        queryIdNumber = 0;
        queryIdMapping.clear();
        programName = null;
        queryName = null;
        databaseFile = null;
        currentLineNumber = 0;
    }

    /** Counts the lines of the given file; the reader is closed even on failure. */
    private int countLines(File file) throws IOException {
        try (LineNumberReader lnr = new LineNumberReader(new FileReader(file))) {
            lnr.skip(Long.MAX_VALUE);
            return lnr.getLineNumber();
        }
    }

    /** Builds an HSP from the column values of the most recently read data row. */
    private Hsp buildHsp() {
        BlastHspBuilder hspBuilder = new BlastHspBuilder();
        hspBuilder
                .setHspAlignLen(Integer.valueOf(alnLength))
                .setHspGaps(Integer.valueOf(gapOpenCount))
                .setHspQueryFrom(Integer.valueOf(queryStart))
                .setHspQueryTo(Integer.valueOf(queryEnd))
                .setHspHitFrom(Integer.valueOf(subjectStart))
                .setHspHitTo(Integer.valueOf(subjectEnd))
                .setHspEvalue(Double.valueOf(evalue))
                .setHspBitScore(Double.valueOf(bitScore))
                // the report expresses identity as a percentage; store it as a fraction
                .setPercentageIdentity(Double.valueOf(percIdentity) / 100)
                .setMismatchCount(Integer.valueOf(mismatchCount));
        return hspBuilder.createBlastHsp();
    }

    /**
     * Advances to the next data row and loads its columns into the instance
     * fields. Header comments met on the way update the program name, query
     * name and database; blank lines are skipped.
     *
     * @param scanner the scanner positioned on the report
     * @return the raw text of the data row, or {@code null} when the report
     *         holds no further data row (a report may end with a comment)
     * @throws ParseException if the row has fewer than twelve columns
     */
    private String fetchData(Scanner scanner) throws ParseException {
        while (scanner.hasNextLine()) {
            String line = scanner.nextLine();
            currentLineNumber++;

            if (line.startsWith("#")) {
                // blast tabular with header options contains some more information
                readHeaderLine(line);
                continue;
            }
            if (line.trim().isEmpty()) {
                continue;
            }
            readDataRow(line);
            return line;
        }
        return null;
    }

    /** Extracts the program name, query name or database from a header comment line. */
    private void readHeaderLine(String line) {
        if (line.matches("#\\s.?BLAST.+")) {
            // replaceFirst, not replace: the prefix is a regular expression
            programName = line.replaceFirst("#\\s", "");
        }
        if (line.startsWith("# Query:")) {
            queryName = line.substring("# Query:".length()).trim();
        }
        if (line.startsWith("# Database:")) {
            databaseFile = line.substring("# Database:".length()).trim();
        }
    }

    /** Splits a data row into its columns and stores them in the instance fields. */
    private void readDataRow(String line) throws ParseException {
        String[] split = line.split("\\t");
        if (split.length < EXPECTED_COLUMNS) {
            throw new ParseException(
                    "Expected at least " + EXPECTED_COLUMNS + " tab-separated fields but found "
                    + split.length + " at line " + currentLineNumber + " in:\n" + line, 0);
        }

        // Here, programName != null checks if there was a header in the file
        boolean headerFound = programName != null;

        queryId       = split[0];
        subjectId     = split[1];
        percIdentity  = split[2];
        alnLength     = split[3];
        mismatchCount = split[4];
        gapOpenCount  = split[5];
        queryStart    = split[6];
        queryEnd      = split[7];
        subjectStart  = split[8];
        subjectEnd    = split[9];
        evalue        = split[10];
        bitScore      = split[11];

        // The query id column of a tabular report holds only the first word of
        // the query name. If a header was read, the complete definition is
        // already in queryName, so the id can follow a Query_N schema that is
        // consistent with the blast xml report.
        if (parsingConsistency == PARSING_CONSISTENCY.IMPROVED && headerFound) {
            if (!queryIdMapping.containsKey(queryId)) {
                queryIdNumber++;
                queryIdMapping.put(queryId, "Query_" + queryIdNumber);
            }
            queryId = queryIdMapping.get(queryId);
        }
        // Without a header the id is the only query description available.
        if (!headerFound) {
            queryName = queryId;
        }
    }

    /**
     * Writing tabular reports is not implemented.
     *
     * @param results ignored
     * @throws UnsupportedOperationException always
     */
    @Override
    public void storeObjects(List<Result> results) throws IOException, ParseException {
        throw new UnsupportedOperationException("Not supported yet.");
    }

    /**
     * Intended for use with the run module.
     * Although possible, it does not make much sense to support it with the
     * limited information available in a tabular report.
     *
     * @param sequences ignored
     * @throws UnsupportedOperationException always
     */
    @Override
    public void setQueryReferences(List<Sequence> sequences) {
        throw new UnsupportedOperationException("Not supported for this parser.");
    }

    /**
     * Intended for use with the run module.
     * Although possible, it does not make much sense to support it with the
     * limited information available in a tabular report.
     *
     * @param sequences ignored
     * @throws UnsupportedOperationException always
     */
    @Override
    public void setDatabaseReferences(List<Sequence> sequences) {
        throw new UnsupportedOperationException("Not supported for this parser.");
    }

    /**
     * Chooses how closely the parsed data should follow the report; see
     * {@link PARSING_CONSISTENCY} for the meaning of each level.
     *
     * @param parsingConsistency the level to apply to subsequent parsing
     */
    public void setParsingConsistency(PARSING_CONSISTENCY parsingConsistency) {
        this.parsingConsistency = parsingConsistency;
    }

}