001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.program.ssbind; 023 024import java.util.ArrayList; 025import java.util.Arrays; 026import java.util.HashMap; 027import java.util.List; 028import java.util.Map; 029 030import org.biojava.bio.Annotation; 031import org.biojava.bio.BioException; 032import org.biojava.bio.alignment.SimpleAlignment; 033import org.biojava.bio.search.SearchBuilder; 034import org.biojava.bio.search.SeqSimilaritySearchHit; 035import org.biojava.bio.search.SeqSimilaritySearchResult; 036import org.biojava.bio.search.SeqSimilaritySearchSubHit; 037import org.biojava.bio.search.SimpleSeqSimilaritySearchHit; 038import org.biojava.bio.search.SimpleSeqSimilaritySearchResult; 039import org.biojava.bio.search.SimpleSeqSimilaritySearchSubHit; 040import org.biojava.bio.seq.Sequence; 041import org.biojava.bio.seq.StrandedFeature; 042import org.biojava.bio.seq.StrandedFeature.Strand; 043import org.biojava.bio.seq.db.SequenceDB; 044import org.biojava.bio.seq.db.SequenceDBInstallation; 045import org.biojava.bio.seq.io.SymbolTokenization; 046import org.biojava.bio.symbol.FiniteAlphabet; 047import org.biojava.bio.symbol.SimpleSymbolList; 048import org.biojava.utils.SmallMap; 049 050/** 051 * <p><code>BlastLikeSearchBuilder</code> will create 052 * <code>SeqSimilaritySearchResult</code>s from SAX events via a 053 * <code>SeqSimilarityAdapter</code>. The SAX events should describe 054 * elements conforming to the BioJava BlastLikeDataSetCollection 055 * DTD. Suitable sources are <code>BlastLikeSAXParser</code> or 056 * <code>FastaSearchSAXParser</code>. The result objects are placed in 057 * the <code>List</code> supplied to the constructor.</p> 058 * 059 * <p>The start/end/strand of <code>SeqSimilaritySearchHit</code>s are 060 * calculated from their constituent 061 * <code>SeqSimilaritySearchSubHit</code>s as follows:</p> 062 * 063 * <ul> 064 * <li>The query start is the lowest query start coordinate of its 065 * sub-hits, regardless of strand</li> 066 * <li>The query end is the highest query end coordinate of its sub-hits, 067 * regardless of strand</li> 068 * <li>The hit start is the lowest hit start coordinate of its sub-hits, 069 * regardless of strand</li> 070 * <li>The hit end is the highest hit end coordinate of its sub-hits, 071 * regardless of strand</li> 072 * <li>The query strand is null for protein sequences. Otherwise it is 073 * equal to the query strand of its sub-hits if they are all on the 074 * same strand, or <code>StrandedFeature.UNKNOWN</code> if the sub-hits 075 * have mixed query strands</li> 076 * <li>The hit strand is null for protein sequences. Otherwise it is 077 * equal to the hit strand of its sub-hits if they are all on the same 078 * strand, or <code>StrandedFeature.UNKNOWN</code> if the sub-hits have 079 * mixed hit strands</li> 080 * </ul> 081 * 082 * <p> 083 * This class has special meanings for particular keys: if you want to 084 * adapt this class for another parser, you will need to be aware of 085 * this. These originate from and are fully described in the 086 * BlastLikeDataSetCollection DTD. 087 * </p> 088 * <table> 089 * <tr> 090 * <th>Key</th> 091 * <th>Meaning</th> 092 * </tr> 093 * <tr> 094 * <td>program</td> 095 * <td>either this value or the subjectSequenceType value must be set. This can take values 096 * acceptable to AlphabetResolver. These are BLASTN, BLASTP, BLASTX, TBLASTN, 097 * TBLASTX, DNA and PROTEIN. </td> 098 * </tr> 099 * 100 * <tr> 101 * <td>databaseId</td> 102 * <td>Identifier of database searched (in SequenceDBInstallation).</td> 103 * </tr> 104 * <tr> 105 * <td>subjectSequenceType</td> 106 * <td>type of sequence that hit is. Can be DNA or PROTEIN.</td> 107 * </tr> 108 * <tr> 109 * <td>subjectId</td> 110 * <td>id of sequence that is hit</td> 111 * </tr> 112 * <tr> 113 * <td>subjectDescription</td> 114 * <td>description of sequence that is hit</td> 115 * </tr> 116 * <tr> 117 * <td>queryStrand</td> 118 * <td>Strandedness of query in alignment. Takes values of "plus" and "minus"</td> 119 * </tr> 120 * <tr> 121 * <td>subjectStrand</td> 122 * <td>Strandedness of query in alignment. Takes values of "plus" and "minus"</td> 123 * </tr> 124 * <tr> 125 * <td>queryFrame</td> 126 * <td>self-evident</td> 127 * </tr> 128 * <tr> 129 * <td>subjectFrame</td> 130 * <td>self-evident</td> 131 * </tr> 132 * <tr> 133 * <td>querySequenceStart</td> 134 * <td>self-evident</td> 135 * </tr> 136 * <tr> 137 * <td>querySequenceEnd</td> 138 * <td>self-evident</td> 139 * </tr> 140 * <tr> 141 * <td>subjectSequenceStart</td> 142 * <td>self-evident</td> 143 * </tr> 144 * <tr> 145 * <td>subjectSequenceEnd</td> 146 * <td>self-evident</td> 147 * </tr> 148 * <tr> 149 * <td>score</td> 150 * <td>self-evident</td> 151 * </tr> 152 * <tr> 153 * <td>expectValue</td> 154 * <td>self-evident</td> 155 * </tr> 156 * <tr> 157 * <td>pValue</td> 158 * <td>self-evident</td> 159 * </tr> 160 * </table> 161 * 162 * @author Keith James 163 * @author Greg Cox 164 * @since 1.2 165 */ 166public class BlastLikeSearchBuilder implements SearchBuilder 167{ 168 // Supplier of instances of searched databases 169 private SequenceDBInstallation subjectDBs; 170 // Holder for all query sequences 171 private SequenceDB querySeqHolder; 172 173 // The ID of the database searched 174 private String databaseID; 175 // The ID of the query sequence 176 private String queryID; 177 178 // Hit and Result annotation 179 private Annotation resultAnnotation; 180 181 // Data holders for search result properties 182 private Map resultPreAnnotation; 183 private Map searchParameters; 184 private Map hitData; 185 private Map subHitData; 186 187 private SymbolTokenization tokenParser; 188 189 private List hits; 190 private List subHits; 191 192 private SeqSimilaritySearchSubHit [] subs; 193 194 // Flag indicating whether there are more results in the stream 195 private boolean moreSearchesAvailable = false; 196 197 // List to accept all results in the stream 198 private List target; 199 200 /** 201 * Creates a new <code>BlastLikeSearchBuilder</code> which will 202 * instantiate results into the <code>List</code> target. 203 * 204 * @param target a <code>List</code>. 205 */ 206 public BlastLikeSearchBuilder(List target) 207 { 208 this.target = target; 209 210 resultPreAnnotation = new HashMap(); 211 searchParameters = new HashMap(); 212 hitData = new HashMap(); 213 subHitData = new HashMap(); 214 } 215 216 /** 217 * Creates a new <code>BlastLikeSearchBuilder</code> which will 218 * instantiate results into the <code>List</code> target. 219 * 220 * @param target a <code>List</code>. 221 * @param querySeqHolder a <code>SequenceDB</code> of query 222 * sequences. 223 * @param subjectDBs a <code>SequenceDBInstallation</code> of 224 * databases searched. 225 */ 226 public BlastLikeSearchBuilder(List target, 227 SequenceDB querySeqHolder, 228 SequenceDBInstallation subjectDBs) 229 { 230 this(target); 231 this.querySeqHolder = querySeqHolder; 232 this.subjectDBs = subjectDBs; 233 } 234 235 public SeqSimilaritySearchResult makeSearchResult() 236 throws BioException 237 { 238 if (querySeqHolder == null) 239 throw new BioException("Running BlastLikeSearchBuilder with null query SequenceDB"); 240 241 if (subjectDBs == null) 242 throw new BioException("Running BlastLikeSearchBuilder with null subject SequenceDB installation"); 243 244 Sequence query = querySeqHolder.getSequence(queryID); 245 if (query == null) 246 throw new BioException("Failed to retrieve query sequence from queryDB using ID '" 247 + queryID 248 + "' (sequence was null)"); 249 250 SequenceDB subjectDB = (SequenceDB) subjectDBs.getSequenceDB(databaseID); 251 if (subjectDB == null) 252 throw new BioException("Failed to retrieve database from installation using ID '" 253 + databaseID 254 + "' (database was null)"); 255 256 return new SimpleSeqSimilaritySearchResult(query, 257 subjectDB, 258 searchParameters, 259 hits, 260 resultAnnotation); 261 } 262 263 /** 264 * <code>setQuerySeqHolder</code> sets the query sequence holder 265 * to a specific database. 266 * 267 * @param querySeqHolder a <code>SequenceDB</code> containing the 268 * query sequence(s). 269 */ 270 public void setQuerySeqHolder(SequenceDB querySeqHolder) 271 { 272 this.querySeqHolder = querySeqHolder; 273 } 274 275 /** 276 * <code>setSubjectDBInstallation</code> sets the subject database 277 * holder to a specific installation. 278 * 279 * @param subjectDBs a <code>SequenceDBInstallation</code> 280 * containing the subject database(s) 281 */ 282 public void setSubjectDBInstallation(SequenceDBInstallation subjectDBs) 283 { 284 this.subjectDBs = subjectDBs; 285 } 286 287 public void setQueryID(String queryID) 288 { 289 this.queryID = queryID; 290 addSearchProperty("queryId", queryID); 291 } 292 293 public void setDatabaseID(String databaseID) 294 { 295 this.databaseID = databaseID; 296 addSearchProperty("databaseId", databaseID); 297 } 298 299 public boolean getMoreSearches() 300 { 301 return moreSearchesAvailable; 302 } 303 304 public void setMoreSearches(boolean value) 305 { 306 moreSearchesAvailable = value; 307 } 308 309 public void startSearch() 310 { 311 hits = new ArrayList(); 312 } 313 314 public void endSearch() 315 { 316 try 317 { 318 resultAnnotation = AnnotationFactory.makeAnnotation(resultPreAnnotation); 319 target.add(makeSearchResult()); 320 } 321 catch (BioException be) 322 { 323 System.err.println("Failed to build SeqSimilaritySearchResult:"); 324 be.printStackTrace(); 325 } 326 } 327 328 public void startHeader() 329 { 330 resultPreAnnotation.clear(); 331 searchParameters.clear(); 332 } 333 334 public void endHeader() { } 335 336 public void startHit() 337 { 338 hitData.clear(); 339 subHits = new ArrayList(); 340 } 341 342 public void endHit() 343 { 344 hits.add(makeHit()); 345 } 346 347 public void startSubHit() 348 { 349 subHitData.clear(); 350 } 351 352 public void endSubHit() 353 { 354 try 355 { 356 subHits.add(makeSubHit()); 357 } 358 catch (BioException be) 359 { 360 be.printStackTrace(); 361 } 362 } 363 364 public void addSearchProperty(Object key, Object value) 365 { 366 resultPreAnnotation.put(key, value); 367 } 368 369 public void addHitProperty(Object key, Object value) 370 { 371 hitData.put(key, value); 372 } 373 374 public void addSubHitProperty(Object key, Object value) 375 { 376 subHitData.put(key, value); 377 } 378 379 /** 380 * <code>makeHit</code> creates a new hit. The hit's strand data 381 * is the same as that of the highest-scoring sub-hit. The hit's 382 * start/end data are the same as the extent of the sub-hits on 383 * that strand. 384 * 385 * @return a <code>SeqSimilaritySearchHit</code>. 386 */ 387 private SeqSimilaritySearchHit makeHit() 388 { 389 double sc = Double.NaN; 390 double ev = Double.NaN; 391 double pv = Double.NaN; 392 393 subs = (SeqSimilaritySearchSubHit []) subHits 394 .toArray(new SeqSimilaritySearchSubHit [subHits.size() - 1]); 395 396 // Sort to get highest score 397 Arrays.sort(subs, SeqSimilaritySearchSubHit.byScore); 398 sc = subs[subs.length - 1].getScore(); 399 ev = subs[subs.length - 1].getEValue(); 400 pv = subs[subs.length - 1].getPValue(); 401 402 // Check for any mixed or null strands 403 boolean mixQueryStrand = false; 404 boolean mixSubjectStrand = false; 405 boolean nullQueryStrand = false; 406 boolean nullSubjectStrand = false; 407 408 // Start with index 0 value (arbitrarily) 409 Strand qStrand = subs[0].getQueryStrand(); 410 Strand sStrand = subs[0].getSubjectStrand(); 411 412 int qStart = subs[0].getQueryStart(); 413 int qEnd = subs[0].getQueryEnd(); 414 int sStart = subs[0].getSubjectStart(); 415 int sEnd = subs[0].getSubjectEnd(); 416 417 if (qStrand == null) 418 nullQueryStrand = true; 419 if (sStrand == null) 420 nullSubjectStrand = true; 421 422 // Compare all other values 423 for (int i = subs.length; --i > 0;) 424 { 425 Strand qS = subs[i].getQueryStrand(); 426 Strand sS = subs[i].getSubjectStrand(); 427 428 if (qS == null) 429 nullQueryStrand = true; 430 if (sS == null) 431 nullSubjectStrand = true; 432 433 if (qS != qStrand) 434 mixQueryStrand = true; 435 if (sS != sStrand) 436 mixSubjectStrand = true; 437 438 qStart = Math.min(qStart, subs[i].getQueryStart()); 439 qEnd = Math.max(qEnd, subs[i].getQueryEnd()); 440 441 sStart = Math.min(sStart, subs[i].getSubjectStart()); 442 sEnd = Math.max(sEnd, subs[i].getSubjectEnd()); 443 } 444 445 // Note any mixed strand hits as unknown strand 446 if (mixQueryStrand) 447 qStrand = StrandedFeature.UNKNOWN; 448 if (mixSubjectStrand) 449 sStrand = StrandedFeature.UNKNOWN; 450 451 // Any null strands from protein sequences 452 if (nullQueryStrand) 453 qStrand = null; 454 if (nullSubjectStrand) 455 sStrand = null; 456 457 String subjectID = (String) hitData.get("subjectId"); 458 459 return new SimpleSeqSimilaritySearchHit(sc, ev, pv, 460 qStart, qEnd, qStrand, 461 sStart, sEnd, sStrand, 462 subjectID, 463 AnnotationFactory.makeAnnotation(hitData), 464 subHits); 465 } 466 467 /** 468 * <code>makeSubHit</code> creates a new sub-hit. 469 * 470 * @return a <code>SeqSimilaritySearchSubHit</code>. 471 * 472 * @exception BioException if an error occurs. 473 */ 474 private SeqSimilaritySearchSubHit makeSubHit() throws BioException 475 { 476 // Try to get a valid TokenParser 477 if (tokenParser == null) 478 { 479 String identifier; 480 481 // Try explicit sequence type first 482 if (subHitData.containsKey("subjectSequenceType")) 483 identifier = (String) subHitData.get("subjectSequenceType"); 484 // Otherwise try to resolve from the program name (only 485 // works for Blast) 486 else if (resultPreAnnotation.containsKey("program")) 487 identifier = (String) resultPreAnnotation.get("program"); 488 else 489 throw new BioException("Failed to determine sequence type"); 490 491 FiniteAlphabet alpha = AlphabetResolver.resolveAlphabet(identifier); 492 tokenParser = alpha.getTokenization("token"); 493 } 494 495 // BLASTP output has the strands set null (protein sequences) 496 Strand qStrand = null; 497 Strand sStrand = null; 498 499 // Override where an explicit strand is given (FASTA DNA, 500 // BLASTN) 501 if (subHitData.containsKey("queryStrand")) 502 if (subHitData.get("queryStrand").equals("plus")) 503 qStrand = StrandedFeature.POSITIVE; 504 else 505 qStrand = StrandedFeature.NEGATIVE; 506 507 if (subHitData.containsKey("subjectStrand")) 508 if (subHitData.get("subjectStrand").equals("plus")) 509 sStrand = StrandedFeature.POSITIVE; 510 else 511 sStrand = StrandedFeature.NEGATIVE; 512 513 // Override where a frame is given as this contains strand 514 // information (BLASTX for query, TBLASTN for hit, TBLASTX for 515 // both) 516 if (subHitData.containsKey("queryFrame")) 517 if (((String) subHitData.get("queryFrame")).startsWith("plus")) 518 qStrand = StrandedFeature.POSITIVE; 519 else 520 qStrand = StrandedFeature.NEGATIVE; 521 522 if (subHitData.containsKey("subjectFrame")) 523 if (((String) subHitData.get("subjectFrame")).startsWith("plus")) 524 sStrand = StrandedFeature.POSITIVE; 525 else 526 sStrand = StrandedFeature.NEGATIVE; 527 528 // Get start/end 529 int qStart = Integer.parseInt((String) subHitData.get("querySequenceStart")); 530 int qEnd = Integer.parseInt((String) subHitData.get("querySequenceEnd")); 531 int sStart = Integer.parseInt((String) subHitData.get("subjectSequenceStart")); 532 int sEnd = Integer.parseInt((String) subHitData.get("subjectSequenceEnd")); 533 534 // The start/end coordinates from BioJava XML don't follow the 535 // BioJava paradigm of start < end, with orientation given by 536 // the strand property. Rather, they present start/end as 537 // displayed in BLAST output, with the coordinates being 538 // inverted on the reverse strand. We account for this here. 539 if (qStrand == StrandedFeature.NEGATIVE) 540 { 541 int swap = qStart; 542 qStart = qEnd; 543 qEnd = swap; 544 } 545 546 if (sStrand == StrandedFeature.NEGATIVE) 547 { 548 int swap = sStart; 549 sStart = sEnd; 550 sEnd = swap; 551 } 552 553 // Get scores 554 double sc = Double.NaN; 555 double ev = Double.NaN; 556 double pv = Double.NaN; 557 558 if (subHitData.containsKey("score")) 559 sc = Double.parseDouble((String) subHitData.get("score")); 560 561 if (subHitData.containsKey("expectValue")) 562 { 563 String val = (String) subHitData.get("expectValue"); 564 // Blast sometimes uses invalid formatting such as 'e-156' 565 // rather than '1e-156' 566 if (val.startsWith("e")) 567 ev = Double.parseDouble("1" + val); 568 else 569 ev = Double.parseDouble(val); 570 } 571 572 if (subHitData.containsKey("pValue")) 573 pv = Double.parseDouble((String) subHitData.get("pValue")); 574 575 Map labelMap = new SmallMap(); 576 577 // Note that the following is removing the raw sequences 578 StringBuffer tokenBuffer = new StringBuffer(1024); 579 tokenBuffer.append((String) subHitData.remove("querySequence")); 580 labelMap.put(SeqSimilaritySearchSubHit.QUERY_LABEL, 581 new SimpleSymbolList(tokenParser, tokenBuffer.substring(0))); 582 583 tokenBuffer = new StringBuffer(1024); 584 tokenBuffer.append((String) subHitData.remove("subjectSequence")); 585 labelMap.put(hitData.get("subjectId"), 586 new SimpleSymbolList(tokenParser, tokenBuffer.substring(0))); 587 588 return new SimpleSeqSimilaritySearchSubHit(sc, ev, pv, 589 qStart, qEnd, qStrand, 590 sStart, sEnd, sStrand, 591 new SimpleAlignment(labelMap), 592 AnnotationFactory.makeAnnotation(subHitData)); 593 } 594}