001/* -*- c-basic-offset: 2; indent-tabs-mode: nil -*- */
002/*
003 *                    BioJava development code
004 *
005 * This code may be freely distributed and modified under the
006 * terms of the GNU Lesser General Public Licence.  This should
007 * be distributed with the code.  If you do not have a copy,
008 * see:
009 *
010 *      http://www.gnu.org/copyleft/lesser.html
011 *
012 * Copyright for this code is held jointly by the individual
013 * authors.  These should be listed in @author doc comments.
014 *
015 * For more information on the BioJava project and its aims,
016 * or to join the biojava-l mailing list, visit the home page
017 * at:
018 *
019 *      http://www.biojava.org/
020 *
021 */
022
023package org.biojava.bio.program.gff;
024
025import java.util.ArrayList;
026import java.util.Collection;
027import java.util.HashMap;
028import java.util.Iterator;
029import java.util.List;
030import java.util.Map;
031
032import org.biojava.bio.BioException;
033import org.biojava.bio.seq.Feature;
034import org.biojava.bio.seq.FeatureFilter;
035import org.biojava.bio.seq.Sequence;
036import org.biojava.bio.seq.StrandedFeature;
037import org.biojava.bio.seq.db.SequenceDB;
038import org.biojava.bio.symbol.Location;
039
040/**
041 * Turns a sequence database into a GFF event stream.
042 *
043 * @author Matthew Pocock
044 * @author Thomas Down
045 * @author Len Trigg
046 */
047public class SequencesAsGFF {
048  /**
049   * The <span class="type">FeatureFilter</span> for selecting features to
050   * report as <span class="type">GFFRecord</span>s.
051   */
052  private FeatureFilter filter = FeatureFilter.all;
053  
054  /**
055   * Whether or not to recurse through the features during searching.
056   */
057  private boolean recurse = false;
058  
059  /**
060   * Whether or not non-contiguous features should be broken into blocks
061   * 
062   * @since 1.4
063   */
064  
065  private boolean shatter = false;
066  
067  private boolean generateSequenceHeader = true;
068  
069  /**
070   * Specify whether features with non-contiguous locations should be broken
071   * up such that a GFF feature line is emitted for each contiguous block.
072   * 
073   * @param b
074   * @since 1.4
075   */
076  
077  public void setShatter(boolean b) {
078      this.shatter = b;
079  }
080  
081  /**
082   * Determine if features with non-contiguous locations will be broken into
083   * multiple GFF records.
084   * 
085   * @since 1.4
086   */
087  
088  public boolean getShatter() {
089      return shatter;
090  }
091  
092  /**
093   * Specify whether a per-sequence header line, giving the length of the
094   * sequence, should be generated.
095   *
096   * @since 1.4
097   */
098   
099   public void setGenerateSequenceHeader(boolean b) {
100       this.generateSequenceHeader = b;
101   }
102   
103   /**
104    * Discover if per-sequence header lines will be generated.
105    *
106    * @since 1.4
107    */
108   
109   public boolean getGenerateSequenceHeader() {
110       return generateSequenceHeader;
111   }
112  
113  /**
114   * Return the current <span class="type">FeatureFilter</span>.
115   * <p>
116   * This is the object that will accept or reject individual features.
117   *
118   * @return the current <span class="type">FeatureFilter</span>
119   */
120  public FeatureFilter getFeatureFilter() {
121    return filter;
122  }
123  
124  /**
125   * Replace the current <span class="type">FeatureFilter</span> with
126   * <span class="arg">filter</span>.
127   *
128   * @param filter  the new <span class="type">FeatureFilter</span>
129   */
130  public void setFeatureFilter(FeatureFilter filter) {
131    this.filter = filter;
132  }
133  
134  /**
135   * Return whether features will be filtered recursively or not.
136   *
137   * @return whether or not to recurse
138   */
139  public boolean getRecurse() {
140    return recurse;
141  }
142  
143  /**
144   * Set whether features will be filtered recursively to
145   * <span class="arg">recurse</span>.
146   *
147   * @param recurse  <span class="kw">true</span> if you want to recurse,
148   *                 <span class="kw">false</span> otherwise
149   */
150  public void setRecurse(boolean recurse) {
151    this.recurse = recurse;
152  }
153
154  /**
155   * Emit any per-sequence header information.
156   * The default implementation emits sequence-region comment lines.
157   *
158   * @since 1.4
159   */
160  
161  protected void doPreProcessSequence(
162    Sequence seq,
163    GFFDocumentHandler handler,
164    String id
165  )
166    throws BioException
167  {
168      if (generateSequenceHeader) {
169          handler.commentLine("#sequence-region " + id + " 1 " + seq.length());
170      }
171  }
172  
173  /**
174   * Internal method to process an individual <span class="type">Sequence</span>.
175   *
176   * @param seq  the <span class="type">Sequence</span> to GFFify
177   * @param handler the <span class="type">GFFDocumentHandler</span> that will
178   *                receive the GFF for all suitable features within
179   *                <span class="arg">seq</span>
180   * @param id the value of the <span class="method">seqName</span> field in any
181   *           <span class="type">GFFRecord</span>s produced
182   */
183  protected void doProcessSequence(Sequence seq,
184                                   GFFDocumentHandler handler,
185                                   String id) 
186    throws BioException 
187  {
188    Iterator fi = seq.filter(getFeatureFilter(), getRecurse()).features();
189      
190    while (fi.hasNext()) {
191      doProcessFeature((Feature) fi.next(), handler, id);
192    }
193  }
194
195
196  /**
197   * Internal method to process an individual <span class="type">Feature</span>.
198   *
199   * @param feature  the <span class="type">Feature</span> to GFFify
200   * @param handler the <span class="type">GFFDocumentHandler</span> that will
201   *                receive the GFF for this feature
202   * @param id the value of the <span class="method">seqName</span> field in any
203   *           <span class="type">GFFRecord</span>s produced
204   */
205  protected void doProcessFeature(Feature feature,
206                                  GFFDocumentHandler handler,
207                                  String id) 
208    throws BioException 
209  {
210    SimpleGFFRecord record = createGFFRecord(feature, id);
211    if (shatter && !feature.getLocation().isContiguous()) {
212        for (Iterator si = feature.getLocation().blockIterator(); si.hasNext(); ) {
213            Location shatterBloc = (Location) si.next();
214            record.setStart(shatterBloc.getMin());
215            record.setEnd(shatterBloc.getMax());
216            handler.recordLine(record);
217        }
218    } else {
219        handler.recordLine(record);
220    }
221  }
222
223
224  /**
225   * Internal method to create a <span class="type">GFFRecord</span>
226   * from an individual <span class="type">Feature</span>.
227   *
228   * @param feature  the <span class="type">Feature</span> to GFFify
229   * @param id the value of the <span class="method">seqName</span> field in any
230   *           <span class="type">GFFRecord</span>s produced
231   */
232  protected SimpleGFFRecord createGFFRecord(Feature feature,
233                                            String id) 
234    throws BioException {
235    
236    SimpleGFFRecord record = new SimpleGFFRecord();
237    record.setSeqName(id);
238    record.setSource(feature.getSource());
239    record.setFeature(feature.getType());
240    Location loc = feature.getLocation();
241    record.setStart(loc.getMin());
242    record.setEnd(loc.getMax());
243    record.setScore(GFFTools.NO_SCORE);
244    record.setStrand(StrandedFeature.UNKNOWN);
245    if (feature instanceof StrandedFeature) {
246      StrandedFeature sf = (StrandedFeature) feature;
247      if (sf.getStrand() == StrandedFeature.POSITIVE) {
248        record.setStrand(StrandedFeature.POSITIVE);
249      } else if (sf.getStrand() == StrandedFeature.NEGATIVE) {
250        record.setStrand(StrandedFeature.NEGATIVE);
251      }
252    }
253    record.setFrame(GFFTools.NO_FRAME);
254    Map fMap = feature.getAnnotation().asMap();
255    Map fMap2 = new HashMap();
256    for (Iterator ki = fMap.keySet().iterator(); ki.hasNext(); ) {
257      Object key = ki.next();
258      Object value = fMap.get(key);
259      String keyS = key.toString();
260      List valueList;
261      if (value instanceof Collection) {
262        valueList = new ArrayList((Collection) value);
263      } else {
264        //valueList = Collections.singletonList(value); 1.3?
265        valueList = new ArrayList();
266        valueList.add(value);
267      }
268      for (int i = 0; i < valueList.size(); i++) {
269        Object o = valueList.get(i);
270        valueList.set(i, o.toString());
271      }
272      fMap2.put(keyS, valueList);
273    }
274    record.setGroupAttributes(fMap2);
275    record.setComment(null);        
276
277    return record;
278  }
279
280
281  /**
282   * Process an individual <span class="type">Sequence</span>, informing
283   * <span class="arg">handler</span> of any suitable features.
284   *
285   * @param seq  the <span class="type">Sequence</span> to GFFify
286   * @param handler the <span class="type">GFFDocumentHandler</span> that will
287   *                receive the GFF for all suitable features within
288   *                <span class="arg">seq</span>
289   */
290  public void processSequence(Sequence seq, GFFDocumentHandler handler) 
291  throws BioException {
292    handler.startDocument(seq.getURN());
293    doPreProcessSequence(seq, handler, seq.getName());
294    doProcessSequence(seq, handler, seq.getName());
295    handler.endDocument();
296  }
297
298  /**
299   * Process all <span class="type">Sequence</span>s within a
300   * <span class="type">SequenceDB</span>, informing
301   * <span class="arg">handler</span> of any suitable features.
302   *
303   * @param seqDB  the <span class="type">SequenceDB</span> to GFFify
304   * @param handler the <span class="type">GFFDocumentHandler</span> that will
305   *                receive the GFF for all suitable features within
306   *                <span class="arg">seqDB</span>
307   */
308  public void processDB(SequenceDB seqDB, GFFDocumentHandler handler)
309  throws BioException {
310    handler.startDocument("unknown:SequenceDB:" + seqDB.getName());
311    for(Iterator i = seqDB.ids().iterator(); i.hasNext(); ) {
312      String id = (String) i.next();
313      Sequence seq = seqDB.getSequence(id);
314      doPreProcessSequence(seq, handler, id);
315    }
316    for(Iterator i = seqDB.ids().iterator(); i.hasNext(); ) {
317      String id = (String) i.next();
318      Sequence seq = seqDB.getSequence(id);
319      doProcessSequence(seq, handler, id);
320    }
321    handler.endDocument();
322  }
323}