001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.program.ssbind;
023
024import java.util.ArrayList;
025import java.util.Arrays;
026import java.util.HashMap;
027import java.util.List;
028import java.util.Map;
029
030import org.biojava.bio.Annotation;
031import org.biojava.bio.BioException;
032import org.biojava.bio.alignment.SimpleAlignment;
033import org.biojava.bio.search.SearchBuilder;
034import org.biojava.bio.search.SeqSimilaritySearchHit;
035import org.biojava.bio.search.SeqSimilaritySearchResult;
036import org.biojava.bio.search.SeqSimilaritySearchSubHit;
037import org.biojava.bio.search.SimpleSeqSimilaritySearchHit;
038import org.biojava.bio.search.SimpleSeqSimilaritySearchResult;
039import org.biojava.bio.search.SimpleSeqSimilaritySearchSubHit;
040import org.biojava.bio.seq.Sequence;
041import org.biojava.bio.seq.StrandedFeature;
042import org.biojava.bio.seq.StrandedFeature.Strand;
043import org.biojava.bio.seq.db.SequenceDB;
044import org.biojava.bio.seq.db.SequenceDBInstallation;
045import org.biojava.bio.seq.io.SymbolTokenization;
046import org.biojava.bio.symbol.FiniteAlphabet;
047import org.biojava.bio.symbol.SimpleSymbolList;
048import org.biojava.utils.SmallMap;
049
050/**
051 * <p><code>BlastLikeSearchBuilder</code> will create
052 * <code>SeqSimilaritySearchResult</code>s from SAX events via a
053 * <code>SeqSimilarityAdapter</code>. The SAX events should describe
054 * elements conforming to the BioJava BlastLikeDataSetCollection
055 * DTD. Suitable sources are <code>BlastLikeSAXParser</code> or
056 * <code>FastaSearchSAXParser</code>. The result objects are placed in
057 * the <code>List</code> supplied to the constructor.</p>
058 *
059 * <p>The start/end/strand of <code>SeqSimilaritySearchHit</code>s are
060 * calculated from their constituent
061 * <code>SeqSimilaritySearchSubHit</code>s as follows:</p>
062 *
063 * <ul>
064 * <li>The query start is the lowest query start coordinate of its
065 *     sub-hits, regardless of strand</li>
066 * <li>The query end is the highest query end coordinate of its sub-hits,
067 *     regardless of strand</li>
068 * <li>The hit start is the lowest hit start coordinate of its sub-hits,
069 *     regardless of strand</li>
070 * <li>The hit end is the highest hit end coordinate of its sub-hits,
071 *     regardless of strand</li>
072 * <li>The query strand is null for protein sequences. Otherwise it is
073 *     equal to the query strand of its sub-hits if they are all on the
074 *     same strand, or <code>StrandedFeature.UNKNOWN</code> if the sub-hits
075 *     have mixed query strands</li>
076 * <li>The hit strand is null for protein sequences. Otherwise it is
077 *     equal to the hit strand of its sub-hits if they are all on the same
078 *     strand, or <code>StrandedFeature.UNKNOWN</code> if the sub-hits have
079 *     mixed hit strands</li>
080 * </ul>
081 *
082 * <p>
083 * This class has special meanings for particular keys: if you want to
084 * adapt this class for another parser, you will need to be aware of
085 * this. These originate from and are fully described in the
086 * BlastLikeDataSetCollection DTD.
087 * </p>
088 * <table>
089 * <tr>
090 *   <th>Key</th>
091 *   <th>Meaning</th>
092 * </tr>
093 * <tr>
094 *   <td>program</td>
095 *   <td>either this value or the subjectSequenceType value must be set. This can take values
096 *       acceptable to AlphabetResolver. These are BLASTN, BLASTP, BLASTX, TBLASTN,
097 *       TBLASTX, DNA and PROTEIN. </td>
098 * </tr>
099 *
100 * <tr>
101 *   <td>databaseId</td>
102 *   <td>Identifier of database searched (in SequenceDBInstallation).</td>
103 * </tr>
104 * <tr>
105 *   <td>subjectSequenceType</td>
106 *   <td>type of sequence that hit is. Can be DNA or PROTEIN.</td>
107 * </tr>
108 * <tr>
109 *   <td>subjectId</td>
110 *   <td>id of sequence that is hit</td>
111 * </tr>
112 * <tr>
113 *   <td>subjectDescription</td>
114 *   <td>description of sequence that is hit</td>
115 * </tr>
116 * <tr>
117 *   <td>queryStrand</td>
118 *   <td>Strandedness of query in alignment. Takes values of "plus" and "minus"</td>
119 * </tr>
120 * <tr>
121 *   <td>subjectStrand</td>
122 *   <td>Strandedness of query in alignment. Takes values of "plus" and "minus"</td>
123 * </tr>
124 * <tr>
125 *   <td>queryFrame</td>
126 *   <td>self-evident</td>
127 * </tr>
128 * <tr>
129 *   <td>subjectFrame</td>
130 *   <td>self-evident</td>
131 * </tr>
132 * <tr>
133 *   <td>querySequenceStart</td>
134 *   <td>self-evident</td>
135 * </tr>
136 * <tr>
137 *   <td>querySequenceEnd</td>
138 *   <td>self-evident</td>
139 * </tr>
140 * <tr>
141 *   <td>subjectSequenceStart</td>
142 *   <td>self-evident</td>
143 * </tr>
144 * <tr>
145 *   <td>subjectSequenceEnd</td>
146 *   <td>self-evident</td>
147 * </tr>
148 * <tr>
149 *   <td>score</td>
150 *   <td>self-evident</td>
151 * </tr>
152 * <tr>
153 *   <td>expectValue</td>
154 *   <td>self-evident</td>
155 * </tr>
156 * <tr>
157 *   <td>pValue</td>
158 *   <td>self-evident</td>
159 * </tr>
160 * </table>
161 *
162 * @author Keith James
163 * @author Greg Cox
164 * @since 1.2
165 */
166public class BlastLikeSearchBuilder implements SearchBuilder
167{
168    // Supplier of instances of searched databases
169    private SequenceDBInstallation subjectDBs;
170    // Holder for all query sequences
171    private SequenceDB querySeqHolder;
172
173    // The ID of the database searched
174    private String databaseID;
175    // The ID of the query sequence
176    private String queryID;
177
178    // Hit and Result annotation
179    private Annotation resultAnnotation;
180
181    // Data holders for search result properties
182    private Map resultPreAnnotation;
183    private Map searchParameters;
184    private Map hitData;
185    private Map subHitData;
186
187    private SymbolTokenization tokenParser;
188
189    private List hits;
190    private List subHits;
191
192    private SeqSimilaritySearchSubHit [] subs;
193
194    // Flag indicating whether there are more results in the stream
195    private boolean moreSearchesAvailable = false;
196
197    // List to accept all results in the stream
198    private List target;
199
200    /**
201     * Creates a new <code>BlastLikeSearchBuilder</code> which will
202     * instantiate results into the <code>List</code> target.
203     *
204     * @param target a <code>List</code>.
205     */
206    public BlastLikeSearchBuilder(List target)
207    {
208        this.target = target;
209
210        resultPreAnnotation = new HashMap();
211        searchParameters    = new HashMap();
212        hitData             = new HashMap();
213        subHitData          = new HashMap();
214    }
215
216    /**
217     * Creates a new <code>BlastLikeSearchBuilder</code> which will
218     * instantiate results into the <code>List</code> target.
219     *
220     * @param target a <code>List</code>.
221     * @param querySeqHolder a <code>SequenceDB</code> of query
222     * sequences.
223     * @param subjectDBs a <code>SequenceDBInstallation</code> of
224     * databases searched.
225     */
226    public BlastLikeSearchBuilder(List                   target,
227                                  SequenceDB             querySeqHolder,
228                                  SequenceDBInstallation subjectDBs)
229    {
230        this(target);
231        this.querySeqHolder = querySeqHolder;
232        this.subjectDBs = subjectDBs;
233    }
234
235    public SeqSimilaritySearchResult makeSearchResult()
236        throws BioException
237    {
238        if (querySeqHolder == null)
239            throw new BioException("Running BlastLikeSearchBuilder with null query SequenceDB");
240
241        if (subjectDBs == null)
242            throw new BioException("Running BlastLikeSearchBuilder with null subject SequenceDB installation");
243
244        Sequence query = querySeqHolder.getSequence(queryID);
245        if (query == null)
246            throw new BioException("Failed to retrieve query sequence from queryDB using ID '"
247                                   + queryID
248                                   + "' (sequence was null)");
249
250        SequenceDB subjectDB = (SequenceDB) subjectDBs.getSequenceDB(databaseID);
251        if (subjectDB == null)
252            throw new BioException("Failed to retrieve database from installation using ID '"
253                                   + databaseID
254                                   + "' (database was null)");
255
256        return new SimpleSeqSimilaritySearchResult(query,
257                                                   subjectDB,
258                                                   searchParameters,
259                                                   hits,
260                                                   resultAnnotation);
261    }
262
263    /**
264     * <code>setQuerySeqHolder</code> sets the query sequence holder
265     * to a specific database.
266     *
267     * @param querySeqHolder a <code>SequenceDB</code> containing the
268     * query sequence(s).
269     */
270    public void setQuerySeqHolder(SequenceDB querySeqHolder)
271    {
272        this.querySeqHolder = querySeqHolder;
273    }
274
275    /**
276     * <code>setSubjectDBInstallation</code> sets the subject database
277     * holder to a specific installation.
278     *
279     * @param subjectDBs a <code>SequenceDBInstallation</code>
280     * containing the subject database(s)
281     */
282    public void setSubjectDBInstallation(SequenceDBInstallation subjectDBs)
283    {
284        this.subjectDBs = subjectDBs;
285    }
286
287    public void setQueryID(String queryID)
288    {
289        this.queryID = queryID;
290        addSearchProperty("queryId", queryID);
291    }
292
293    public void setDatabaseID(String databaseID)
294    {
295        this.databaseID = databaseID;
296        addSearchProperty("databaseId", databaseID);
297    }
298
299    public boolean getMoreSearches()
300    {
301        return moreSearchesAvailable;
302    }
303
304    public void setMoreSearches(boolean value)
305    {
306        moreSearchesAvailable = value;
307    }
308
309    public void startSearch()
310    {
311        hits = new ArrayList();
312    }
313
314    public void endSearch()
315    {
316        try
317        {
318            resultAnnotation = AnnotationFactory.makeAnnotation(resultPreAnnotation);
319            target.add(makeSearchResult());
320        }
321        catch (BioException be)
322        {
323            System.err.println("Failed to build SeqSimilaritySearchResult:");
324            be.printStackTrace();
325        }
326    }
327
328    public void startHeader()
329    {
330        resultPreAnnotation.clear();
331        searchParameters.clear();
332    }
333
334    public void endHeader() { }
335
336    public void startHit()
337    {
338        hitData.clear();
339        subHits = new ArrayList();
340    }
341
342    public void endHit()
343    {
344        hits.add(makeHit());
345    }
346
347    public void startSubHit()
348    {
349        subHitData.clear();
350    }
351
352    public void endSubHit()
353    {
354        try
355        {
356            subHits.add(makeSubHit());
357        }
358        catch (BioException be)
359        {
360            be.printStackTrace();
361        }
362    }
363
364    public void addSearchProperty(Object key, Object value)
365    {
366        resultPreAnnotation.put(key, value);
367    }
368
369    public void addHitProperty(Object key, Object value)
370    {
371        hitData.put(key, value);
372    }
373
374    public void addSubHitProperty(Object key, Object value)
375    {
376        subHitData.put(key, value);
377    }
378
379    /**
380     * <code>makeHit</code> creates a new hit. The hit's strand data
381     * is the same as that of the highest-scoring sub-hit. The hit's
382     * start/end data are the same as the extent of the sub-hits on
383     * that strand.
384     *
385     * @return a <code>SeqSimilaritySearchHit</code>.
386     */
387    private SeqSimilaritySearchHit makeHit()
388    {
389        double sc = Double.NaN;
390        double ev = Double.NaN;
391        double pv = Double.NaN;
392
393        subs = (SeqSimilaritySearchSubHit []) subHits
394            .toArray(new SeqSimilaritySearchSubHit [subHits.size() - 1]);
395
396        // Sort to get highest score
397        Arrays.sort(subs, SeqSimilaritySearchSubHit.byScore);
398        sc = subs[subs.length - 1].getScore();
399        ev = subs[subs.length - 1].getEValue();
400        pv = subs[subs.length - 1].getPValue();
401
402        // Check for any mixed or null strands
403        boolean    mixQueryStrand = false;
404        boolean  mixSubjectStrand = false;
405        boolean   nullQueryStrand = false;
406        boolean nullSubjectStrand = false;
407
408        // Start with index 0 value (arbitrarily)
409        Strand qStrand = subs[0].getQueryStrand();
410        Strand sStrand = subs[0].getSubjectStrand();
411
412        int qStart = subs[0].getQueryStart();
413        int qEnd   = subs[0].getQueryEnd();
414        int sStart = subs[0].getSubjectStart();
415        int sEnd   = subs[0].getSubjectEnd();
416
417        if (qStrand == null)
418            nullQueryStrand = true;
419        if (sStrand == null)
420            nullSubjectStrand = true;
421
422        // Compare all other values
423        for (int i = subs.length; --i > 0;)
424        {
425            Strand qS = subs[i].getQueryStrand();
426            Strand sS = subs[i].getSubjectStrand();
427
428            if (qS == null)
429                nullQueryStrand = true;
430            if (sS == null)
431                nullSubjectStrand = true;
432
433            if (qS != qStrand)
434                mixQueryStrand = true;
435            if (sS != sStrand)
436                mixSubjectStrand = true;
437
438            qStart = Math.min(qStart, subs[i].getQueryStart());
439            qEnd   = Math.max(qEnd,   subs[i].getQueryEnd());
440
441            sStart = Math.min(sStart, subs[i].getSubjectStart());
442            sEnd   = Math.max(sEnd,   subs[i].getSubjectEnd());
443        }
444
445        // Note any mixed strand hits as unknown strand
446        if (mixQueryStrand)
447            qStrand = StrandedFeature.UNKNOWN;
448        if (mixSubjectStrand)
449            sStrand = StrandedFeature.UNKNOWN;
450
451        // Any null strands from protein sequences
452        if (nullQueryStrand)
453            qStrand = null;
454        if (nullSubjectStrand)
455            sStrand = null;
456
457        String subjectID = (String) hitData.get("subjectId");
458
459        return new SimpleSeqSimilaritySearchHit(sc, ev, pv,
460                                                qStart, qEnd, qStrand,
461                                                sStart, sEnd, sStrand,
462                                                subjectID,
463                                                AnnotationFactory.makeAnnotation(hitData),
464                                                subHits);
465    }
466
467    /**
468     * <code>makeSubHit</code> creates a new sub-hit.
469     *
470     * @return a <code>SeqSimilaritySearchSubHit</code>.
471     *
472     * @exception BioException if an error occurs.
473     */
474    private SeqSimilaritySearchSubHit makeSubHit() throws BioException
475    {
476        // Try to get a valid TokenParser
477        if (tokenParser == null)
478        {
479            String identifier;
480
481            // Try explicit sequence type first
482            if (subHitData.containsKey("subjectSequenceType"))
483                identifier = (String) subHitData.get("subjectSequenceType");
484            // Otherwise try to resolve from the program name (only
485            // works for Blast)
486            else if (resultPreAnnotation.containsKey("program"))
487                identifier = (String) resultPreAnnotation.get("program");
488            else
489                throw new BioException("Failed to determine sequence type");
490
491            FiniteAlphabet alpha = AlphabetResolver.resolveAlphabet(identifier);
492            tokenParser = alpha.getTokenization("token");
493        }
494
495        // BLASTP output has the strands set null (protein sequences)
496        Strand qStrand = null;
497        Strand sStrand = null;
498
499        // Override where an explicit strand is given (FASTA DNA,
500        // BLASTN)
501        if (subHitData.containsKey("queryStrand"))
502            if (subHitData.get("queryStrand").equals("plus"))
503                qStrand = StrandedFeature.POSITIVE;
504            else
505                qStrand = StrandedFeature.NEGATIVE;
506
507        if (subHitData.containsKey("subjectStrand"))
508            if (subHitData.get("subjectStrand").equals("plus"))
509                sStrand = StrandedFeature.POSITIVE;
510            else
511                sStrand = StrandedFeature.NEGATIVE;
512
513        // Override where a frame is given as this contains strand
514        // information (BLASTX for query, TBLASTN for hit, TBLASTX for
515        // both)
516        if (subHitData.containsKey("queryFrame"))
517            if (((String) subHitData.get("queryFrame")).startsWith("plus"))
518                qStrand = StrandedFeature.POSITIVE;
519            else
520                qStrand = StrandedFeature.NEGATIVE;
521
522        if (subHitData.containsKey("subjectFrame"))
523            if (((String) subHitData.get("subjectFrame")).startsWith("plus"))
524                sStrand = StrandedFeature.POSITIVE;
525            else
526                sStrand = StrandedFeature.NEGATIVE;
527
528        // Get start/end
529        int qStart = Integer.parseInt((String) subHitData.get("querySequenceStart"));
530        int   qEnd = Integer.parseInt((String) subHitData.get("querySequenceEnd"));
531        int sStart = Integer.parseInt((String) subHitData.get("subjectSequenceStart"));
532        int   sEnd = Integer.parseInt((String) subHitData.get("subjectSequenceEnd"));
533
534        // The start/end coordinates from BioJava XML don't follow the
535        // BioJava paradigm of start < end, with orientation given by
536        // the strand property. Rather, they present start/end as
537        // displayed in BLAST output, with the coordinates being
538        // inverted on the reverse strand. We account for this here.
539        if (qStrand == StrandedFeature.NEGATIVE)
540        {
541            int swap = qStart;
542            qStart = qEnd;
543            qEnd   = swap;
544        }
545
546        if (sStrand == StrandedFeature.NEGATIVE)
547        {
548            int swap = sStart;
549            sStart = sEnd;
550            sEnd   = swap;
551        }
552
553        // Get scores
554        double sc = Double.NaN;
555        double ev = Double.NaN;
556        double pv = Double.NaN;
557
558        if (subHitData.containsKey("score"))
559            sc = Double.parseDouble((String) subHitData.get("score"));
560
561        if (subHitData.containsKey("expectValue"))
562        {
563            String val = (String) subHitData.get("expectValue");
564            // Blast sometimes uses invalid formatting such as 'e-156'
565            // rather than '1e-156'
566            if (val.startsWith("e"))
567                ev = Double.parseDouble("1" + val);
568            else
569                ev = Double.parseDouble(val);
570        }
571
572        if (subHitData.containsKey("pValue"))
573            pv = Double.parseDouble((String) subHitData.get("pValue"));
574
575        Map labelMap = new SmallMap();
576
577        // Note that the following is removing the raw sequences
578        StringBuffer tokenBuffer = new StringBuffer(1024);
579        tokenBuffer.append((String) subHitData.remove("querySequence"));
580        labelMap.put(SeqSimilaritySearchSubHit.QUERY_LABEL,
581                     new SimpleSymbolList(tokenParser, tokenBuffer.substring(0)));
582
583        tokenBuffer = new StringBuffer(1024);
584        tokenBuffer.append((String) subHitData.remove("subjectSequence"));
585        labelMap.put(hitData.get("subjectId"),
586                     new SimpleSymbolList(tokenParser, tokenBuffer.substring(0)));
587
588        return new SimpleSeqSimilaritySearchSubHit(sc, ev, pv,
589                                                   qStart, qEnd, qStrand,
590                                                   sStart, sEnd, sStrand,
591                                                   new SimpleAlignment(labelMap),
592                                                   AnnotationFactory.makeAnnotation(subHitData));
593    }
594}