001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.program.ssbind;
023
024import java.util.HashMap;
025import java.util.Map;
026
027import org.biojava.bio.BioError;
028import org.biojava.bio.BioException;
029import org.biojava.bio.alignment.Alignment;
030import org.biojava.bio.alignment.SimpleAlignment;
031import org.biojava.bio.search.SearchContentHandler;
032import org.biojava.bio.seq.Sequence;
033import org.biojava.bio.seq.StrandedFeature;
034import org.biojava.bio.seq.StrandedFeature.Strand;
035import org.biojava.bio.seq.db.IllegalIDException;
036import org.biojava.bio.seq.homol.SimilarityPairFeature;
037import org.biojava.bio.seq.io.SymbolTokenization;
038import org.biojava.bio.symbol.FiniteAlphabet;
039import org.biojava.bio.symbol.RangeLocation;
040import org.biojava.bio.symbol.SimpleSymbolList;
041import org.biojava.utils.ChangeListener;
042import org.biojava.utils.ChangeType;
043import org.biojava.utils.ChangeVetoException;
044
045/**
046 * <p><code>SimilarityPairBuilder</code> annotates query and subject
047 * <code>Sequence</code> with <code>SimilarityPairFeature</code>s
048 * created from SAX events supplied via a
049 * <code>SeqSimilarityAdapter</code>. The objective is to describe a
050 * simple pairwise relationship between the two sequences. This
051 * differs slightly from using <code>HomologyFeature</code>s which are
052 * slightly heavier, have to contain a full alignment and don't have
053 * an explicit distinction between query and subject sequences in the
054 * alignment. The SAX events should describe elements conforming to
055 * the BioJava BlastLikeDataSetCollection DTD. Suitable sources are
056 * <code>BlastLikeSAXParser</code> or <code>FastaSAXParser</code>.</p>
057 *
058 * <p>Annotated <code>ViewSequence</code>s wrapping both query and
059 * subject sequences are created.</p>
060 *
061 * <p><strong>The current implementation should be used with care on
062 * streams containing more than one search output</strong>. This is
063 * because the builder will not stop after each report has been
064 * processed and as a result all the subject sequences get
065 * instantiated and a large object network could be created during
066 * processing.</p>
067 *
068 * @author Keith James
069 * @author Greg Cox
070 * @since 1.2
071 */
072public class SimilarityPairBuilder extends ViewSequenceFactory
073    implements SearchContentHandler
074{
075    /**
076     * Constant <code>SIMILARITY_PAIR_FEATURE_TYPE</code> the type
077     * String used by <code>SimilarityPairBuilder</code> when creating
078     * <code>SimilarityPairFeature</code>s. This is the String which
079     * is returned when a <code>SimilarityPairFeature</code>'s
080     * <code>getType()</code> method is called.
081     */
082    public static final String SIMILARITY_PAIR_FEATURE_TYPE = "similarity";
083
084    // Identifiers for query and database
085    private String queryID;
086
087    // Data holders for search result properties
088    private Map resultData;
089    private Map hitData;
090    private Map subHitData;
091
092    private SymbolTokenization tokenParser;
093    private StringBuffer       tokenBuffer;
094
095    // Flag indicating whether there are more results in the stream
096    private boolean moreSearchesAvailable = false;
097
098    public SimilarityPairBuilder()
099    {
100        resultData       = new HashMap();
101        hitData          = new HashMap();
102        subHitData       = new HashMap();
103        queryViewCache   = new HashMap();
104        subjectViewCache = new HashMap();
105        tokenBuffer      = new StringBuffer(1024);
106    }
107
108    public Sequence getAnnotatedQuerySeq(String queryID)
109        throws IllegalIDException
110    {
111        if (! queryViewCache.containsKey(queryID))
112            throw new IllegalIDException("Failed to retrieve annotated query sequence from cache using ID '"
113                                         + queryID
114                                         + "' (unknown ID");
115
116        return (Sequence) queryViewCache.get(queryID);
117    }
118
119    public Sequence getAnnotatedSubjectSeq(String subjectID)
120        throws IllegalIDException
121    {
122        if (! subjectViewCache.containsKey(subjectID))
123            throw new IllegalIDException("Failed to retrieve annotated subject sequence from cache using ID '"
124                                         + subjectID
125                                         + "' (unknown ID");
126
127        return (Sequence) subjectViewCache.get(subjectID);
128    }
129
130    public void setQueryID(String queryID)
131    {
132        this.queryID = queryID;
133    }
134
135    public boolean getMoreSearches()
136    {
137        return moreSearchesAvailable;
138    }
139
140    public void setMoreSearches(boolean value)
141    {
142        moreSearchesAvailable = value;
143    }
144
145    public void startSearch()
146    {
147        subjectViewCache.clear();
148    }
149
150    public void endSearch() { }
151
152    public void startHeader()
153    {
154        resultData.clear();
155    }
156
157    public void endHeader() { }
158
159    public void startHit()
160    {
161        hitData.clear();
162        subHitData.clear();
163    }
164
165    public void endHit() { }
166
167    public void startSubHit() { }
168
169    public void endSubHit()
170    {
171        try
172        {
173            makeSimilarity();
174        }
175        catch (BioException be)
176        {
177            System.err.println("Failed to build Similarity:");
178            be.printStackTrace();
179        }
180    }
181
182    public void addSearchProperty(Object key, Object value)
183    {
184        resultData.put(key, value);
185    }
186
187    public void addHitProperty(Object key, Object value)
188    {
189        hitData.put(key, value);
190    }
191
192    public void addSubHitProperty(Object key, Object value)
193    {
194        subHitData.put(key, value);
195    }
196
197    private void makeSimilarity() throws BioException
198    {
199        subHitData.putAll(resultData);
200        subHitData.putAll(hitData);
201
202        // Try to get a valid TokenParser
203        if (tokenParser == null)
204        {
205            String identifier;
206            // Try explicit sequence type first
207            if (subHitData.containsKey("hitSequenceType"))
208                identifier = (String) subHitData.get("hitSequenceType");
209            // Otherwise try to resolve from the program name (only
210            // works for Blast)
211            else if (subHitData.containsKey("program"))
212                identifier = (String) subHitData.get("program");
213            else
214                throw new BioException("Failed to determine sequence type");
215
216            FiniteAlphabet alpha = AlphabetResolver.resolveAlphabet(identifier);
217            tokenParser = alpha.getTokenization("token");
218        }
219
220        // Set strands of hit on query and subject
221        Strand qStrand = StrandedFeature.POSITIVE;
222        Strand sStrand = StrandedFeature.POSITIVE;
223
224        // In cases where an explicit strand is given (FASTA DNA, BLASTN)
225        if (subHitData.containsKey("queryStrand") &&
226            subHitData.get("queryStrand").equals("minus"))
227            qStrand = StrandedFeature.NEGATIVE;
228
229        if (subHitData.containsKey("subjectStrand") &&
230            subHitData.get("subjectStrand").equals("minus"))
231            sStrand = StrandedFeature.NEGATIVE;
232
233        // In cases where a frame is given as this contains strand
234        // information (TBLASTN for hit, TBLASTX for both query and
235        // hit)
236        if (subHitData.containsKey("queryFrame") &&
237            ((String) subHitData.get("queryFrame")).startsWith("minus"))
238            qStrand = StrandedFeature.NEGATIVE;
239
240        if (subHitData.containsKey("subjectFrame") &&
241            ((String) subHitData.get("subjectFrame")).startsWith("minus"))
242            sStrand = StrandedFeature.NEGATIVE;
243
244        // Get start/end
245        int qStart = Integer.parseInt((String) subHitData.get("querySequenceStart"));
246        int   qEnd = Integer.parseInt((String) subHitData.get("querySequenceEnd"));
247        int sStart = Integer.parseInt((String) subHitData.get("subjectSequenceStart"));
248        int   sEnd = Integer.parseInt((String) subHitData.get("subjectSequenceEnd"));
249
250        // The start/end coordinates from BioJava XML don't follow the
251        // BioJava paradigm of start < end, with orientation given by
252        // the strand property. Rather, they present start/end as
253        // displayed in BLAST output, with the coordinates being
254        // inverted on the reverse strand. We account for this here.
255        if (qStrand == StrandedFeature.NEGATIVE)
256        {
257            int swap = qStart;
258            qStart = qEnd;
259            qEnd   = swap;
260        }
261
262        if (sStrand == StrandedFeature.NEGATIVE)
263        {
264            int swap = sStart;
265            sStart = sEnd;
266            sEnd   = swap;
267        }
268
269        Sequence   queryView = makeQueryViewSequence(queryID);
270
271        // Map of Alignment sequences
272        Map labelMap = new HashMap();
273
274        try
275        {
276            // Set source to the program name
277            String source = "unknown";
278            if (subHitData.containsKey("program"))
279                source = (String) subHitData.get("program");
280
281            tokenBuffer.setLength(0);
282            tokenBuffer.append((String) subHitData.get("querySequence"));
283            labelMap.put(SimilarityPairFeature.QUERY_LABEL,
284                         new SimpleSymbolList(tokenParser, tokenBuffer.substring(0)));
285
286            tokenBuffer.setLength(0);
287            tokenBuffer.append((String) subHitData.get("subjectSequence"));
288            labelMap.put(SimilarityPairFeature.SUBJECT_LABEL,
289                         new SimpleSymbolList(tokenParser, tokenBuffer.substring(0)));
290
291            double score = 0.0;
292            if (subHitData.containsKey("score"))
293                score = Double.parseDouble((String) subHitData.get("score"));
294
295            // Query sequence feature
296            SimilarityPairFeature.Template qt =
297                new SimilarityPairFeature.Template();
298            qt.type       = SIMILARITY_PAIR_FEATURE_TYPE;
299            qt.source     = source;
300            qt.location   = new RangeLocation(qStart, qEnd);
301            qt.strand     = qStrand;
302            qt.score      = score;
303            qt.annotation = AnnotationFactory.makeAnnotation(subHitData);
304
305            // Subject sequence feature
306            SimilarityPairFeature.Template st =
307                new SimilarityPairFeature.Template();
308            st.type       = SIMILARITY_PAIR_FEATURE_TYPE;
309            st.source     = source;
310            st.location   = new RangeLocation(sStart, sEnd);
311            st.strand     = sStrand;
312            st.score      = score;
313            st.annotation = AnnotationFactory.makeAnnotation(subHitData);
314
315            Alignment a = new SimpleAlignment(labelMap);
316            qt.alignment = a;
317            st.alignment = a;
318
319            SimilarityPairFeature qf =
320                (SimilarityPairFeature) queryView.createFeature(qt);
321
322            SimilarityPairFeature sf =
323                (SimilarityPairFeature) queryView.createFeature(qt);
324
325            sf.setSibling(qf);
326            qf.setSibling(sf);
327
328            qf.addChangeListener(ChangeListener.ALWAYS_VETO,
329                                 ChangeType.UNKNOWN);
330            sf.addChangeListener(ChangeListener.ALWAYS_VETO,
331                                 ChangeType.UNKNOWN);
332        }
333        catch (ChangeVetoException cve)
334        {
335            throw new BioError("Assertion failure creating "
336                               + "SimilarityPairFeature. Template "
337                               + "modification vetoed",cve);
338        }
339    }
340}