001 /*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.program.gff;
023
024import java.io.BufferedReader;
025import java.io.File;
026import java.io.FileNotFoundException;
027import java.io.FileReader;
028import java.io.FileWriter;
029import java.io.IOException;
030import java.io.PrintWriter;
031import java.util.HashSet;
032import java.util.Iterator;
033import java.util.Set;
034
035import org.biojava.bio.BioError;
036import org.biojava.bio.BioException;
037import org.biojava.bio.seq.Sequence;
038import org.biojava.bio.seq.SequenceIterator;
039import org.biojava.bio.seq.db.IllegalIDException;
040import org.biojava.bio.seq.db.SequenceDB;
041import org.biojava.utils.ChangeVetoException;
042import org.biojava.utils.ParserException;
043
044/**
045 * @author Mark Schreiber
046 * @author Matthew Pocock
047 * @since 1.2
048 */
049
050public class GFFTools {
051
052  /**
053   * Flag to indicate that there is no score info.
054   */
055  public static double NO_SCORE = Double.NEGATIVE_INFINITY;
056
057  /**
058   * Flag to indicate that there is no frame info.
059   */
060  public static int NO_FRAME = -1;
061
062  /**
063   * Reads a <code>GFFEntrySet</code> from a file with no filtering.
064   *
065   * @param fileName the file containing the GFF
066   * @throws FileNotFoundException if file is not found
067   * @throws ParserException if format is wrong
068   * @throws BioException if format is wrong
069   * @throws IOException if file reading error occurs
070   * @return a <code>GFFEntrySet</code> encapsulating the records read from the file
071   * @deprecated use: readGff(File)
072   */
073  public static GFFEntrySet readGFF(String fileName)
074    throws FileNotFoundException, ParserException, BioException, IOException
075  {
076    return readGFF(fileName, GFFRecordFilter.ACCEPT_ALL);
077  }
078
079  /**
080   * Reads a GFFEntrySet from a file with the specified filter.
081   *
082   * @param fileName the file containing the GFF
083   * @param recFilt the filter to use
084   * @throws FileNotFoundException if file is not found
085   * @throws ParserException if format is wrong
086   * @throws BioException if format is wrong
087   * @throws IOException if file reading error occurs
088   * @return a <code>GFFEntrySet</code> encapsulating the records read from the file
089   * @deprecated use: readGff(File,GFFRecordFilter)
090   */
091  public static GFFEntrySet readGFF(String fileName, GFFRecordFilter recFilt)
092    throws FileNotFoundException, ParserException, BioException, IOException
093  {
094    GFFEntrySet gffEntries = new GFFEntrySet();
095    GFFFilterer filterer = new GFFFilterer(gffEntries.getAddHandler(),recFilt);
096    GFFParser parser = new GFFParser();
097    parser.parse(new BufferedReader(new FileReader(fileName)),filterer);
098    return gffEntries;
099  }
100  
101 /**
102  * Reads a <code>GFFEntrySet</code> from a file with no filtering.
103  *
104  * @param inFile the File containing the GFF
105  * @throws FileNotFoundException if file is not found
106  * @throws ParserException if format is wrong
107  * @throws BioException if format is wrong
108  * @throws IOException if file reading error occurs
109  * @return a <code>GFFEntrySet</code> encapsulating the records read from the file
110  */
111  public static GFFEntrySet readGFF(File inFile)
112    throws FileNotFoundException, ParserException, BioException, IOException
113  {
114    return readGFF(inFile, GFFRecordFilter.ACCEPT_ALL);
115  }
116
117  /**
118   * Reads a GFFEntrySet from a file with the specified filter.
119   *
120   * @param inFile the File containing the GFF
121   * @param recFilt the filter to use
122   * @throws FileNotFoundException if file is not found
123   * @throws ParserException if format is wrong
124   * @throws BioException if format is wrong
125   * @throws IOException if file reading error occurs
126   * @return a <code>GFFEntrySet</code> encapsulating the records read from the file
127   */
128  public static GFFEntrySet readGFF(File inFile, GFFRecordFilter recFilt)
129    throws FileNotFoundException, ParserException, BioException, IOException
130  {
131    GFFEntrySet gffEntries = new GFFEntrySet();
132    GFFFilterer filterer = new GFFFilterer(gffEntries.getAddHandler(),recFilt);
133    GFFParser parser = new GFFParser();
134    parser.parse(new BufferedReader(new FileReader(inFile)),filterer);
135    return gffEntries;
136  }
137
138  /**
139   * Read all GFF entries from a buffered reader.
140   *
141   * This will read up untill the end of the reader.
142   *
143   * @param gffIn  the BufferedReader to read text from
144   * @return a GFFEntrySet containing all of the GFF that could be read
145   * @throws parserException  if the text could not be parsed as GFF
146   * @throws BioException if there was some error reading the GFF
147   * @throws IOException if there was an error with the reader
148   */
149  public static GFFEntrySet readGFF(BufferedReader gffIn)
150    throws ParserException, BioException, IOException
151  {
152    return readGFF(gffIn, GFFRecordFilter.ACCEPT_ALL);
153  }
154
155  /**
156   * Read all GFF entries matching a filter from a buffered reader.
157   *
158   * This will read up untill the end of the reader.
159   *
160   * @param gffIn  the BufferedReader to read text from
161   * @return a GFFEntrySet containing all of the GFF that could be read
162   * @throws parserException  if the text could not be parsed as GFF
163   * @throws BioException if there was some error reading the GFF
164   * @throws IOException if there was an error with the reader
165   */
166  public static GFFEntrySet readGFF(BufferedReader gffIn, GFFRecordFilter recFilt)
167    throws ParserException, BioException, IOException
168  {
169    GFFEntrySet gffEntries = new GFFEntrySet();
170    GFFFilterer filterer = new GFFFilterer(gffEntries.getAddHandler(),recFilt);
171    GFFParser parser = new GFFParser();
172    parser.parse(gffIn, filterer);
173    return gffEntries;
174  }
175
176  /**
177   * Writes a GFFEntrySet to a file.
178   *
179   * @param fileName the file to write to
180   * @param ents the entries to write
181   * @throws IOException if file writing fails
182   */
183  public static void writeGFF(String fileName, GFFEntrySet ents)
184    throws IOException
185  {
186    PrintWriter pw = new PrintWriter(new FileWriter(fileName));
187    writeGFF(pw, ents);
188    pw.close();
189  }
190  
191  /**
192   * Writes a GFFEntrySet to a file.
193   *
194   * @param outFile  the file to write to
195   * @param ents  the entry set to write
196   * @throws IOException if writing to the file fails
197   */
198  public static void writeGFF(File outFile, GFFEntrySet ents)
199    throws IOException
200  {
201    PrintWriter pw = new PrintWriter(new FileWriter(outFile));
202    writeGFF(pw, ents);
203    pw.close();
204  }
205
206  /**
207   * Writes a GFFEntrySet to a PrintWriter.
208   *
209   * @param pw  the PrintWriter to write to
210   * @param ents the entries to write
211   * @throws IOException if file writing fails
212   */
213  public static void writeGFF(PrintWriter pw, GFFEntrySet ents)
214    throws IOException
215  {
216    GFFWriter writer = new GFFWriter(pw);
217    ents.streamRecords(writer);
218  }
219
220  /**
221   * Annotates a sequence with the features from a GFF entry set with sequence
222   * name matching this sequence.
223   *
224   * @param seq the <code>Sequence</code> to annotate.
225   * @param ents the the GFF features to annotate it with.
226   * @return a reference to a newly annotated sequence.
227   */
228  public static Sequence annotateSequence(Sequence seq, GFFEntrySet ents){
229    Sequence annotated;
230    try {
231      annotated = ents.getAnnotator().annotate(seq);
232    }
233    catch (ChangeVetoException ex) {
234      throw new BioError("Assertion Error: Unable to annotate sequence",ex);
235    }catch (BioException ex) {
236      throw new BioError("Assertion Error: Unable to annotate sequence",ex);
237    }
238    return annotated;
239  }
240
241  /**
242   * Annotates a sequence with the features from a GFF entry set.
243   *
244   * @param seq the <code>Sequence</code> to annotate.
245   * @param ents the the GFF features to annotate it with.
246   * @param checkSeqName  boolean flat, if true only annotate sequence with
247   *        features that have matching sequence names, otherwise annotate
248   *        all features
249   * @return a reference to a newly annotated sequence.
250   */
251  public static Sequence annotateSequence(
252    Sequence seq,
253    GFFEntrySet ents,
254    boolean checkSeqName
255  ) {
256    Sequence annotated;
257    try {
258      annotated = ents.getAnnotator(checkSeqName).annotate(seq);
259    }
260    catch (ChangeVetoException ex) {
261      throw new BioError("Assertion Error: Unable to annotate sequence",ex);
262    }catch (BioException ex) {
263      throw new BioError("Assertion Error: Unable to annotate sequence",ex);
264    }
265    return annotated;
266  }
267
268  /**
269   * Annotates all sequences in a sequence DB with features from a GFF entry set.
270   *
271   * @param seqs  the SequenceDB to annotate
272   * @param ents  the GFFEntrySet to annote with
273   * @return a SequenceDB with all the annotations on
274   */
275  public static SequenceDB annotateSequences(SequenceDB seqs, GFFEntrySet ents)
276    throws IllegalIDException, BioException{
277    Set names = new HashSet();
278
279    //get the list of names for each sequence
280    for (Iterator i = ents.lineIterator(); i.hasNext(); ) {
281      Object o = i.next();
282      if(o instanceof GFFRecord){//only process GFFRecords not comments
283        GFFRecord record = (GFFRecord)o;
284        if(! names.contains(record.getSeqName())){
285          names.add(record.getSeqName());
286        }
287      }
288    }
289
290    //filter entry set into subsets with same names, use that subset to annotate
291    //the correct sequence.
292    for (Iterator i = names.iterator(); i.hasNext(); ) {
293      final String name = (String)i.next();
294      GFFRecordFilter filt = new GFFRecordFilter(){
295        public boolean accept(GFFRecord rec){
296          return rec.getSeqName().equals(name);
297        }
298      };
299
300      GFFEntrySet filtered = ents.filter(filt);
301      Sequence seq = seqs.getSequence(name);
302      seq = GFFTools.annotateSequence(seq, filtered);
303    }
304
305    return seqs;
306  }
307
308  /**
309   * Creates a GFFEntrySet containing one entry for each feature on a sequence.
310   *
311   * @param seq  the Sequence to create features for
312   * @return a new GFFEntrySet with gff records for each featre on the sequence
313   * @throws BioException if something went wrong GFF-ifying the sequences
314   *         features
315   */
316  public static GFFEntrySet gffFromSequence(Sequence seq)
317  throws BioException {
318    SequencesAsGFF sagff = new SequencesAsGFF();
319    GFFEntrySet gffES = new GFFEntrySet();
320    sagff.processSequence(seq, gffES.getAddHandler());
321    return gffES;
322  }
323  
324  /**
325   * Creates a GFFEntrySet containing one entry for each feature on each
326   * sequence of a SequenceDB.
327   *
328   * <p><em>Note:</em> This converts all features in the whole database to
329   * in-memorey GFFRecord instances. This will take up considerable memory for
330   * large databases.</p>
331   *
332   * @param seqDB  the SequenceDB to create features for
333   * @return  a new GFFEntrySet with gff records for each feature on the database
334   * @throws BioException if something went wrong GFF-ifying the sequences
335   *         features
336   */
337public static GFFEntrySet gffFromSeqDB(SequenceDB seqDB)
338  throws BioException {
339    GFFEntrySet gffES = new GFFEntrySet();
340    for(SequenceIterator si = seqDB.sequenceIterator(); si.hasNext(); ) {
341      Sequence seq = si.nextSequence();
342      SequencesAsGFF sagff = new SequencesAsGFF();
343      sagff.processSequence(seq, gffES.getAddHandler());
344    }
345    return gffES;
346  }
347}