/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.core.search.io.blast;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Scanner;
import java.util.logging.Logger;
import org.biojava.nbio.core.search.io.Hit;
import org.biojava.nbio.core.search.io.Hsp;
import org.biojava.nbio.core.search.io.Result;
import org.biojava.nbio.core.search.io.ResultFactory;
import org.biojava.nbio.core.sequence.template.Sequence;

/**
 * Parser for BLAST tabular reports, with or without the {@code #} comment
 * header lines. Each data row must carry at least the twelve tab-separated
 * columns read by this class: query id, subject id, percentage identity,
 * alignment length, mismatches, gap openings, query start, query end,
 * subject start, subject end, e-value and bit score.
 * <p>
 * Consecutive rows sharing the same query id are grouped into one
 * {@link Result}; within a query, consecutive rows sharing the same subject id
 * are grouped into one {@link Hit}, each row contributing one {@link Hsp}.
 * <p>
 * An instance keeps parsing state in its fields while
 * {@link #createObjects(double)} runs.
 * <p>
 * Designed by Paolo Pavan.
 * You may want to find my contacts on Github and LinkedIn for code info
 * or discuss major changes.
 * https://github.com/paolopavan
 *
 * @author Paolo Pavan
 */
public class BlastTabularParser implements ResultFactory {
    /** Number of leading tab-separated columns read from every data row. */
    private static final int EXPECTED_COLUMNS = 12;

    // The embedded double quote must be escaped, otherwise the literal ends early.
    private final String blastReference =
            "Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), A greedy algorithm for aligning DNA sequences\", J Comput Biol 2000; 7(1-2):203-14.";

    /**
     * Level of consistency applied while parsing.
     * <ul>
     * <li>{@code LITERAL}: query ids are kept exactly as written in the report.</li>
     * <li>{@code IMPROVED}: when the report has a comment header, query ids are
     * replaced by sequential {@code Query_N} identifiers.</li>
     * </ul>
     * {@code IMPROVED} is the default.
     */
    // Public because it is the parameter type of the public setParsingConsistency method.
    public enum PARSING_CONSISTENCY {
        IMPROVED,
        LITERAL
    }

    private static final Logger log = Logger.getLogger(BlastTabularParser.class.getName());

    private File targetFile;
    private PARSING_CONSISTENCY parsingConsistency = PARSING_CONSISTENCY.IMPROVED;

    // Parsing state, refreshed by fetchData for every data row.
    // The next five fields keep their original (package-private) visibility.
    int queryIdNumber = 0;
    HashMap<String,String> queryIdMapping = new HashMap<String,String>();
    String programName = null, queryName = null, databaseFile = null;
    private String queryId;
    private String subjectId;
    private String percIdentity;
    private String alnLength;
    private String mismatchCount;
    private String gapOpenCount;
    private String queryStart;
    private String queryEnd;
    private String subjectStart;
    private String subjectEnd;
    private String evalue;
    private String bitScore;

    /**
     * @return the file extensions handled by this parser
     */
    @Override
    public List<String> getFileExtensions() {
        List<String> l = new ArrayList<String>();
        l.add("blasttabular");
        l.add("blasttxt");
        return l;
    }

    /**
     * Sets the report to be read by {@link #createObjects(double)}.
     *
     * @param f the tabular report file
     */
    @Override
    public void setFile(File f) {
        targetFile = f;
    }

    /**
     * Parses the file given to {@link #setFile(File)} and returns one
     * {@link Result} per run of consecutive rows sharing a query id.
     * <p>
     * Rows whose e-value is greater than {@code maxEScore} do not produce an
     * Hsp. A hit whose rows are all discarded this way is still returned, with
     * an empty Hsp list.
     *
     * @param maxEScore highest e-value accepted for an Hsp
     * @return the parsed results, empty when the file holds no data row
     * @throws IOException if the file cannot be opened or read
     * @throws ParseException if a row has fewer than twelve columns or holds
     *         a value that is not a valid number
     * @throws IllegalStateException if no file has been set
     */
    @Override
    public List<Result> createObjects(double maxEScore) throws IOException, ParseException {
        if (targetFile == null) {
            throw new IllegalStateException("No file set: call setFile(File) before createObjects(double)");
        }
        // Header data and id mapping of a previous run must not leak into this one.
        resetParsingState();

        List<Result> results = new ArrayList<Result>();
        log.info("Query for hits");

        Scanner scanner = new Scanner(new FileInputStream(targetFile));
        try {
            // fetchData returns null once no data row is left; this is the only
            // end-of-input signal, so comment lines and a missing final newline
            // cannot disturb the loops below.
            String line = fetchData(scanner);
            int lineNumber = 0; // number of data rows consumed so far
            while (line != null) {
                try {
                    BlastResultBuilder resultBuilder = new BlastResultBuilder();
                    resultBuilder
                            .setQueryID(queryId)
                            .setDbFile(databaseFile)
                            .setProgram(programName)
                            .setQueryDef(queryName)
                            .setReference(blastReference);

                    List<Hit> hits = new ArrayList<Hit>();

                    String currentQueryId = queryId;
                    while (line != null && currentQueryId.equals(queryId)) {
                        BlastHitBuilder hitBuilder = new BlastHitBuilder();
                        List<Hsp> hsps = new ArrayList<Hsp>();

                        String currentSubjectId = subjectId;
                        // The query id is checked here too: two different queries
                        // hitting the same subject on adjacent rows are distinct hits.
                        while (line != null
                                && currentQueryId.equals(queryId)
                                && currentSubjectId.equals(subjectId)) {
                            boolean aboveThreshold = Double.parseDouble(evalue) > maxEScore;
                            if (!aboveThreshold) {
                                hsps.add(buildHsp());
                            }
                            line = fetchData(scanner);
                            lineNumber++;
                        }
                        // Added even when every row was filtered out, as before.
                        hits.add(hitBuilder.setHsps(hsps).createBlastHit());
                    }
                    results.add(resultBuilder.setHits(hits).createBlastResult());
                } catch (NumberFormatException e) {
                    ParseException failure = new ParseException(
                            "Invalid numeric value met at line "+ lineNumber+" in:\n"+line,0);
                    failure.initCause(e);
                    throw failure;
                }
            }
            // Scanner hides read errors behind "no more input": surface them.
            IOException readFailure = scanner.ioException();
            if (readFailure != null) {
                throw readFailure;
            }
        } finally {
            // Closing the scanner also closes the underlying file stream.
            scanner.close();
        }
        log.info(results.size() + " results parsed");
        return results;
    }

    /**
     * Builds an Hsp from the column values of the row currently held in the
     * fields of this parser.
     *
     * @return the Hsp describing the current row
     * @throws NumberFormatException if a column is not a valid number
     */
    private Hsp buildHsp() {
        BlastHspBuilder hspBuilder = new BlastHspBuilder();
        hspBuilder
                .setHspAlignLen(Integer.valueOf(alnLength))
                .setHspGaps(Integer.valueOf(gapOpenCount))
                .setHspQueryFrom(Integer.valueOf(queryStart))
                .setHspQueryTo(Integer.valueOf(queryEnd))
                .setHspHitFrom(Integer.valueOf(subjectStart))
                .setHspHitTo(Integer.valueOf(subjectEnd))
                .setHspEvalue(Double.valueOf(evalue))
                .setHspBitScore(Double.valueOf(bitScore))
                // the report gives a percentage, the builder receives a fraction
                .setPercentageIdentity(Double.valueOf(percIdentity)/100)
                .setMismatchCount(Integer.valueOf(mismatchCount));
        return hspBuilder.createBlastHsp();
    }

    /**
     * Clears everything a previous {@link #createObjects(double)} call may
     * have left in the parsing fields.
     */
    private void resetParsingState() {
        queryIdNumber = 0;
        queryIdMapping.clear();
        programName = null;
        queryName = null;
        databaseFile = null;
        queryId = null;
        subjectId = null;
        percIdentity = null;
        alnLength = null;
        mismatchCount = null;
        gapOpenCount = null;
        queryStart = null;
        queryEnd = null;
        subjectStart = null;
        subjectEnd = null;
        evalue = null;
        bitScore = null;
    }

    /**
     * Reads lines up to and including the next data row and loads its columns
     * into the fields of this parser. Comment lines (starting with {@code #})
     * met on the way update the program name, query definition and database
     * name; blank lines are skipped.
     *
     * @param scanner source of the report lines
     * @return the data row just loaded, or {@code null} when the input is
     *         exhausted (the fields are then left untouched)
     * @throws ParseException if the row has fewer than twelve columns
     */
    private String fetchData(Scanner scanner) throws ParseException {
        String line = null;
        while (line == null && scanner.hasNextLine()) {
            String candidate = scanner.nextLine();
            if (candidate.startsWith("#")) {
                // blast tabular with header options contains some more information
                // replaceFirst, not replace: the first argument is a regular expression
                if (candidate.matches("#\\s.?BLAST.+")) programName = candidate.replaceFirst("#\\s","");
                if (candidate.startsWith("# Query:")) queryName = candidate.substring("# Query:".length()).trim();
                if (candidate.startsWith("# Database:")) databaseFile = candidate.substring("# Database:".length()).trim();
            } else if (!candidate.trim().isEmpty()) {
                line = candidate;
            }
        }
        // a report may end with a comment, a blank line, or hold no row at all
        if (line == null) {
            return null;
        }

        // Here, programName != null checks if there was a header in the file
        boolean headerFound = programName != null;

        String[] split = line.split("\\t");
        if (split.length < EXPECTED_COLUMNS) {
            throw new ParseException("Expected at least " + EXPECTED_COLUMNS
                    + " tab-separated columns but found " + split.length + " in:\n" + line, 0);
        }
        queryId      = split[0];
        subjectId    = split[1];
        percIdentity = split[2];
        alnLength    = split[3];
        mismatchCount= split[4];
        gapOpenCount = split[5];
        queryStart   = split[6];
        queryEnd     = split[7];
        subjectStart = split[8];
        subjectEnd   = split[9];
        evalue       = split[10];
        bitScore     = split[11];

        // blast tabular reports only the first word of the query name.
        // If it was specified in the header it is better to use that definition
        if (parsingConsistency == PARSING_CONSISTENCY.IMPROVED && headerFound) {
            if (queryIdMapping.get(queryId) == null) {
                queryIdNumber++;
                queryIdMapping.put(queryId, "Query_" + queryIdNumber);
            }
            // If a complete definition of the query name was read, then we can use
            // a queryID schema that is consistent with blast xml report
            queryId = queryIdMapping.get(queryId);
        }
        if (!headerFound) queryName = queryId;

        return line;
    }

    /**
     * Not implemented: this parser only reads reports.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void storeObjects(List<Result> results) throws IOException, ParseException {
        throw new UnsupportedOperationException("Not supported yet.");
    }

    /**
     * Intended for use with run module.
     * Although possible, it does not make a lot of sense to have it with the
     * limited information available in a tabular report.
     *
     * @param sequences ignored
     * @throws UnsupportedOperationException always
     */
    @Override
    public void setQueryReferences(List<Sequence> sequences) {
        throw new UnsupportedOperationException("Not supported for this parser.");
    }

    /**
     * Intended for use with run module.
     * Although possible, it does not make a lot of sense to have it with the
     * limited information available in a tabular report.
     *
     * @param sequences ignored
     * @throws UnsupportedOperationException always
     */
    @Override
    public void setDatabaseReferences(List<Sequence> sequences) {
        throw new UnsupportedOperationException("Not supported for this parser.");
    }

    /**
     * Selects the level of consistency applied while parsing; see
     * {@link PARSING_CONSISTENCY}. {@code IMPROVED} is the default and is
     * suggested unless there is a reason to keep the ids written in the report.
     *
     * @param parsingConsistency the level to apply to later parsing
     */
    public void setParsingConsistency(PARSING_CONSISTENCY parsingConsistency) {
        this.parsingConsistency = parsingConsistency;
    }

}