/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
021
022package org.biojava.bio.program.ssbind;
023
024import java.util.ArrayList;
025import java.util.HashMap;
026import java.util.List;
027import java.util.Map;
028
029import org.biojava.bio.BioException;
030import org.biojava.bio.alignment.Alignment;
031import org.biojava.bio.alignment.SimpleAlignment;
032import org.biojava.bio.search.SearchContentHandler;
033import org.biojava.bio.seq.Sequence;
034import org.biojava.bio.seq.StrandedFeature;
035import org.biojava.bio.seq.StrandedFeature.Strand;
036import org.biojava.bio.seq.homol.Homology;
037import org.biojava.bio.seq.homol.HomologyFeature;
038import org.biojava.bio.seq.homol.SimpleHomology;
039import org.biojava.bio.seq.io.SymbolTokenization;
040import org.biojava.bio.symbol.FiniteAlphabet;
041import org.biojava.bio.symbol.RangeLocation;
042import org.biojava.bio.symbol.SimpleSymbolList;
043import org.biojava.utils.ChangeVetoException;
044
045/**
046 * <p><code>BlastLikeHomologyBuilder</code> populates a
047 * <code>List</code> with <code>Homology</code> instances created from
048 * SAX events supplied via a <code>SeqSimilarityAdapter</code>. The
049 * SAX events should describe elements conforming to the BioJava
050 * BlastLikeDataSetCollection DTD. Suitable sources are
051 * <code>BlastLikeSAXParser</code> or
052 * <code>FastaSAXParser</code>. Annotated <code>ViewSequence</code>s
053 * wrapping both query and subject sequences are created and populated
054 * with <code>HomologyFeature</code>s. See the documentation of
055 * <code>Homology</code> and <code>HomologyFeature</code>.</p>
056 *
057 * <p>As <code>SimpleHomologyFeature</code>s are created on views of
058 * the query and subject sequences, both query and subject should be
059 * nucleotide sequences (<code>SimpleHomologyFeature</code> extends
060 * <code>StrandedFeature</code>.). This limits the searches currently
061 * handled to BLASTN, TBLASTX and Fasta DNA.</p>
062 *
063 * @author Keith James
064 * @author Greg Cox
065 * @since 1.2
066 */
067public class BlastLikeHomologyBuilder extends ViewSequenceFactory
068    implements SearchContentHandler
069{
070    /**
071     * <code>HOMOLOGY_FEATURE_TYPE</code> is the type String used by
072     * <code>BlastLikeHomologyBuilder</code> when creating
073     * <code>HomologyFeature</code>s. This is the String which is
074     * returned when an <code>HomologyFeature</code>'s
075     * <code>getType()</code> method is called.
076     */
077    public static final String HOMOLOGY_FEATURE_TYPE = "homology";
078
079    // Identifiers for query and database
080    private String queryID;
081
082    // Data holders for search result properties
083    private Map resultData;
084    private Map hitData;
085    private Map subHitData;
086
087    private SymbolTokenization tokenParser;
088    private StringBuffer       tokenBuffer;
089
090    // List for holding homologies from current search. There may be
091    // more than one search result in a stream
092    private List homologies;
093    // Flag indicating whether there are more results in the stream
094    private boolean moreSearchesAvailable = false;
095    // List to accept homologies from all results in the stream
096    private List target;
097
098    /**
099     * Creates a new <code>BlastLikeHomologyBuilder</code> which will
100     * instantiate <code>Homology</code> objects into the
101     * <code>List</code> target.
102     *
103     * @param target a <code>List</code>.
104     */
105    public BlastLikeHomologyBuilder(List target)
106    {
107        this.target = target;
108
109        resultData       = new HashMap();
110        hitData          = new HashMap();
111        subHitData       = new HashMap();
112        queryViewCache   = new HashMap();
113        subjectViewCache = new HashMap();
114        tokenBuffer      = new StringBuffer(1024);
115    }
116
117    public void setQueryID(String queryID)
118    {
119        this.queryID = queryID;
120    }
121
122    public boolean getMoreSearches()
123    {
124        return moreSearchesAvailable;
125    }
126
127    public void setMoreSearches(boolean value)
128    {
129        moreSearchesAvailable = value;
130    }
131
132    public void startSearch()
133    {
134        subjectViewCache.clear();
135        homologies = new ArrayList();
136    }
137
138    public void endSearch()
139    {
140        target.addAll(homologies);
141    }
142
143    public void startHeader()
144    {
145        resultData.clear();
146    }
147
148    public void endHeader() { }
149
150    public void startHit()
151    {
152        hitData.clear();
153        subHitData.clear();
154    }
155
156    public void endHit() { }
157
158    public void startSubHit() { }
159
160    public void endSubHit()
161    {
162        try
163        {
164            homologies.add(makeHomology());
165        }
166        catch (BioException be)
167        {
168            System.err.println("Failed to build Homology:");
169            be.printStackTrace();
170        }
171    }
172
173    public void addSearchProperty(Object key, Object value)
174    {
175        resultData.put(key, value);
176    }
177
178    public void addHitProperty(Object key, Object value)
179    {
180        hitData.put(key, value);
181    }
182
183    public void addSubHitProperty(Object key, Object value)
184    {
185        subHitData.put(key, value);
186    }
187
188    /**
189     * <code>makeHomology</code> creates a new
190     * <code>SimpleHomology</code> describing the similarity between
191     * the query and subject sequences. The
192     * <code>HomologyFeatures</code> created are added to
193     * <code>ViewSequence</code>s wrapping the query and subject
194     * sequences.
195     *
196     * @return an <code>Homology</code>.
197     *
198     * @exception BioException if an error occurs.
199     */
200    private Homology makeHomology() throws BioException
201    {
202        subHitData.putAll(resultData);
203        subHitData.putAll(hitData);
204
205        if (tokenParser == null)
206        {
207            String identifier;
208            // Try explicit sequence type first
209            if (subHitData.containsKey("hitSequenceType"))
210                identifier = (String) subHitData.get("hitSequenceType");
211            // Otherwise try to resolve from the program name (only
212            // works for Blast)
213            else if (subHitData.containsKey("program"))
214                identifier = (String) subHitData.get("program");
215            else
216                throw new BioException("Failed to determine sequence type");
217
218            FiniteAlphabet alpha = AlphabetResolver.resolveAlphabet(identifier);
219            tokenParser = alpha.getTokenization("token");
220        }
221
222        Strand qStrand = StrandedFeature.POSITIVE;
223        Strand sStrand = StrandedFeature.POSITIVE;
224
225        // In cases where an explicit strand is given (FASTA DNA, BLASTN)
226        if (subHitData.containsKey("queryStrand") &&
227            subHitData.get("queryStrand").equals("minus"))
228            qStrand = StrandedFeature.NEGATIVE;
229
230        if (subHitData.containsKey("subjectStrand") &&
231            subHitData.get("subjectStrand").equals("minus"))
232            sStrand = StrandedFeature.NEGATIVE;
233
234        // In cases where a frame is given as this contains strand
235        // information (TBLASTN for hit, TBLASTX for both query and
236        // hit)
237        if (subHitData.containsKey("queryFrame") &&
238            ((String) subHitData.get("queryFrame")).startsWith("minus"))
239            qStrand = StrandedFeature.NEGATIVE;
240
241        if (subHitData.containsKey("subjectFrame") &&
242            ((String) subHitData.get("subjectFrame")).startsWith("minus"))
243            sStrand = StrandedFeature.NEGATIVE;
244
245        int qStart = Integer.parseInt((String) subHitData.get("querySequenceStart"));
246        int   qEnd = Integer.parseInt((String) subHitData.get("querySequenceEnd"));
247        int sStart = Integer.parseInt((String) subHitData.get("subjectSequenceStart"));
248        int   sEnd = Integer.parseInt((String) subHitData.get("subjectSequenceEnd"));
249
250        // The start/end coordinates from BioJava XML don't follow the
251        // BioJava paradigm of start < end, with orientation given by
252        // the strand property. Rather, they present start/end as
253        // displayed in BLAST output, with the coordinates being
254        // inverted on the reverse strand. We account for this here.
255        if (qStrand == StrandedFeature.NEGATIVE)
256        {
257            int swap = qStart;
258            qStart = qEnd;
259            qEnd   = swap;
260        }
261
262        if (sStrand == StrandedFeature.NEGATIVE)
263        {
264            int swap = sStart;
265            sStart = sEnd;
266            sEnd   = swap;
267        }
268
269        String subjectID = (String) hitData.get("subjectId");
270
271        Sequence   queryView = makeQueryViewSequence(queryID);
272        Sequence subjectView = makeSubjectViewSequence(subjectID);
273
274        SimpleHomology homology = new SimpleHomology();
275
276        // Map of HomologyFeatures to Alignment sequences
277        Map labelMap = new HashMap();
278
279        try
280        {
281            String source = "unknown";
282            if (subHitData.containsKey("program"))
283                source = (String) subHitData.get("program");
284
285            tokenBuffer.setLength(0);
286            tokenBuffer.append((String) subHitData.get("querySequence"));
287
288            // Query sequence feature
289            HomologyFeature.Template qt = new HomologyFeature.Template();
290            qt.type       = HOMOLOGY_FEATURE_TYPE;
291            qt.source     = source;
292            qt.location   = new RangeLocation(qStart, qEnd);
293            qt.strand     = qStrand;
294            qt.annotation = AnnotationFactory.makeAnnotation(subHitData);
295            qt.homology   = homology;
296
297            // Map the new feature to the alignment SymbolList
298            labelMap.put(queryView.createFeature(qt),
299                         new SimpleSymbolList(tokenParser, tokenBuffer.substring(0)));
300
301            tokenBuffer.setLength(0);
302            tokenBuffer.append((String) subHitData.get("subjectSequence"));
303
304            // Subject sequence feature
305            HomologyFeature.Template st = new HomologyFeature.Template();
306            st.type       = HOMOLOGY_FEATURE_TYPE;
307            st.source     = source;
308            st.location   = new RangeLocation(sStart, sEnd);
309            st.strand     = sStrand;
310            st.annotation = AnnotationFactory.makeAnnotation(subHitData);
311            st.homology   = homology;
312
313            // Map the new feature to the alignment SymbolList
314            labelMap.put(subjectView.createFeature(st),
315                         new SimpleSymbolList(tokenParser, tokenBuffer.substring(0)));
316
317            Alignment a = new SimpleAlignment(labelMap);
318            homology.setAlignment(a);
319
320            return homology;
321        }
322        catch (ChangeVetoException cve)
323        {
324            throw new BioException( "Failed to create HomologyFeature",cve);
325        }
326    }
327}