/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */

package org.biojava.bio.program.gff;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import org.biojava.bio.BioError;
import org.biojava.bio.BioException;
import org.biojava.bio.seq.Sequence;
import org.biojava.bio.seq.SequenceIterator;
import org.biojava.bio.seq.db.IllegalIDException;
import org.biojava.bio.seq.db.SequenceDB;
import org.biojava.utils.ChangeVetoException;
import org.biojava.utils.ParserException;

/**
 * Static convenience methods for reading GFF into a {@link GFFEntrySet},
 * writing a {@link GFFEntrySet} back out, annotating sequences from GFF and
 * deriving GFF from sequences.
 *
 * @author Mark Schreiber
 * @author Matthew Pocock
 * @since 1.2
 */

public class GFFTools {

  /**
   * Flag to indicate that there is no score info.
   */
  public static double NO_SCORE = Double.NEGATIVE_INFINITY;

  /**
   * Flag to indicate that there is no frame info.
   */
  public static int NO_FRAME = -1;

  /**
   * Reads a <code>GFFEntrySet</code> from a file with no filtering.
   *
   * @param fileName the file containing the GFF
   * @throws FileNotFoundException if file is not found
   * @throws ParserException if format is wrong
   * @throws BioException if format is wrong
   * @throws IOException if file reading error occurs
   * @return a <code>GFFEntrySet</code> encapsulating the records read from the file
   * @deprecated use: {@link #readGFF(File)}
   */
  public static GFFEntrySet readGFF(String fileName)
    throws FileNotFoundException, ParserException, BioException, IOException
  {
    return readGFF(fileName, GFFRecordFilter.ACCEPT_ALL);
  }

  /**
   * Reads a GFFEntrySet from a file with the specified filter.
   *
   * @param fileName the file containing the GFF
   * @param recFilt the filter to use
   * @throws FileNotFoundException if file is not found
   * @throws ParserException if format is wrong
   * @throws BioException if format is wrong
   * @throws IOException if file reading error occurs
   * @return a <code>GFFEntrySet</code> encapsulating the records read from the file
   * @deprecated use: {@link #readGFF(File, GFFRecordFilter)}
   */
  public static GFFEntrySet readGFF(String fileName, GFFRecordFilter recFilt)
    throws FileNotFoundException, ParserException, BioException, IOException
  {
    // Delegate so that opening and closing the file lives in exactly one place.
    return readGFF(new File(fileName), recFilt);
  }

  /**
   * Reads a <code>GFFEntrySet</code> from a file with no filtering.
   *
   * @param inFile the File containing the GFF
   * @throws FileNotFoundException if file is not found
   * @throws ParserException if format is wrong
   * @throws BioException if format is wrong
   * @throws IOException if file reading error occurs
   * @return a <code>GFFEntrySet</code> encapsulating the records read from the file
   */
  public static GFFEntrySet readGFF(File inFile)
    throws FileNotFoundException, ParserException, BioException, IOException
  {
    return readGFF(inFile, GFFRecordFilter.ACCEPT_ALL);
  }

  /**
   * Reads a GFFEntrySet from a file with the specified filter.
   *
   * The file is opened by this method and is always closed again before it
   * returns, whether parsing succeeds or fails.
   *
   * @param inFile the File containing the GFF
   * @param recFilt the filter to use
   * @throws FileNotFoundException if file is not found
   * @throws ParserException if format is wrong
   * @throws BioException if format is wrong
   * @throws IOException if file reading error occurs
   * @return a <code>GFFEntrySet</code> encapsulating the records read from the file
   */
  public static GFFEntrySet readGFF(File inFile, GFFRecordFilter recFilt)
    throws FileNotFoundException, ParserException, BioException, IOException
  {
    BufferedReader gffIn = new BufferedReader(new FileReader(inFile));
    try {
      return readGFF(gffIn, recFilt);
    } finally {
      // We opened the reader, so we must release the file handle ourselves.
      gffIn.close();
    }
  }

  /**
   * Read all GFF entries from a buffered reader.
   *
   * This will read up until the end of the reader. The reader is not closed
   * by this method.
   *
   * @param gffIn the BufferedReader to read text from
   * @return a GFFEntrySet containing all of the GFF that could be read
   * @throws ParserException if the text could not be parsed as GFF
   * @throws BioException if there was some error reading the GFF
   * @throws IOException if there was an error with the reader
   */
  public static GFFEntrySet readGFF(BufferedReader gffIn)
    throws ParserException, BioException, IOException
  {
    return readGFF(gffIn, GFFRecordFilter.ACCEPT_ALL);
  }

  /**
   * Read all GFF entries matching a filter from a buffered reader.
   *
   * This will read up until the end of the reader. The reader is not closed
   * by this method.
   *
   * @param gffIn the BufferedReader to read text from
   * @param recFilt the filter that records must pass to be included
   * @return a GFFEntrySet containing all of the GFF that could be read
   * @throws ParserException if the text could not be parsed as GFF
   * @throws BioException if there was some error reading the GFF
   * @throws IOException if there was an error with the reader
   */
  public static GFFEntrySet readGFF(BufferedReader gffIn, GFFRecordFilter recFilt)
    throws ParserException, BioException, IOException
  {
    GFFEntrySet gffEntries = new GFFEntrySet();
    // Parsed records pass through the filter before reaching the entry set.
    GFFFilterer filterer = new GFFFilterer(gffEntries.getAddHandler(), recFilt);
    GFFParser parser = new GFFParser();
    parser.parse(gffIn, filterer);
    return gffEntries;
  }

  /**
   * Writes a GFFEntrySet to a file.
   *
   * @param fileName the file to write to
   * @param ents the entries to write
   * @throws IOException if file writing fails
   */
  public static void writeGFF(String fileName, GFFEntrySet ents)
    throws IOException
  {
    // Delegate so that opening, error checking and closing live in one place.
    writeGFF(new File(fileName), ents);
  }

  /**
   * Writes a GFFEntrySet to a file.
   *
   * The file is opened by this method and is always closed again before it
   * returns, whether writing succeeds or fails.
   *
   * @param outFile the file to write to
   * @param ents the entry set to write
   * @throws IOException if writing to the file fails
   */
  public static void writeGFF(File outFile, GFFEntrySet ents)
    throws IOException
  {
    PrintWriter pw = new PrintWriter(new FileWriter(outFile));
    try {
      writeGFF(pw, ents);
      // PrintWriter never throws on write errors; it only records them.
      // checkError() flushes and reports whether any write has failed.
      if (pw.checkError()) {
        throw new IOException("Error writing GFF to " + outFile);
      }
    } finally {
      pw.close();
    }
  }

  /**
   * Writes a GFFEntrySet to a PrintWriter.
   *
   * The PrintWriter is neither flushed nor closed by this method.
   *
   * @param pw  the PrintWriter to write to
   * @param ents the entries to write
   * @throws IOException if file writing fails
   */
  public static void writeGFF(PrintWriter pw, GFFEntrySet ents)
    throws IOException
  {
    GFFWriter writer = new GFFWriter(pw);
    ents.streamRecords(writer);
  }

  /**
   * Annotates a sequence with the features from a GFF entry set, using the
   * entry set's default annotator.
   *
   * @param seq the <code>Sequence</code> to annotate.
   * @param ents the GFF features to annotate it with.
   * @return the sequence returned by the annotator.
   * @throws BioError if the annotation is vetoed or otherwise fails
   */
  public static Sequence annotateSequence(Sequence seq, GFFEntrySet ents){
    try {
      return ents.getAnnotator().annotate(seq);
    } catch (ChangeVetoException ex) {
      throw new BioError("Assertion Error: Unable to annotate sequence",ex);
    } catch (BioException ex) {
      throw new BioError("Assertion Error: Unable to annotate sequence",ex);
    }
  }

  /**
   * Annotates a sequence with the features from a GFF entry set.
   *
   * @param seq the <code>Sequence</code> to annotate.
   * @param ents the GFF features to annotate it with.
   * @param checkSeqName  boolean flag, if true only annotate sequence with
   *        features that have matching sequence names, otherwise annotate
   *        all features
   * @return the sequence returned by the annotator.
   * @throws BioError if the annotation is vetoed or otherwise fails
   */
  public static Sequence annotateSequence(
    Sequence seq,
    GFFEntrySet ents,
    boolean checkSeqName
  ) {
    try {
      return ents.getAnnotator(checkSeqName).annotate(seq);
    } catch (ChangeVetoException ex) {
      throw new BioError("Assertion Error: Unable to annotate sequence",ex);
    } catch (BioException ex) {
      throw new BioError("Assertion Error: Unable to annotate sequence",ex);
    }
  }

  /**
   * Annotates all sequences in a sequence DB with features from a GFF entry set.
   *
   * Every sequence name that appears in a record of the entry set is looked
   * up in the database, and that sequence is annotated with the records
   * carrying its name.
   *
   * @param seqs the SequenceDB to annotate
   * @param ents the GFFEntrySet to annotate with
   * @return the same SequenceDB instance that was passed in
   * @throws IllegalIDException propagated from the database lookup of a
   *         sequence name found in the entry set
   * @throws BioException propagated from the database lookup
   */
  public static SequenceDB annotateSequences(SequenceDB seqs, GFFEntrySet ents)
    throws IllegalIDException, BioException{
    Set names = new HashSet();

    // Collect the distinct sequence names referenced by the records.
    for (Iterator i = ents.lineIterator(); i.hasNext(); ) {
      Object o = i.next();
      if (o instanceof GFFRecord) { // only process GFFRecords, not comments
        names.add(((GFFRecord) o).getSeqName());
      }
    }

    // Filter the entry set into one subset per name and use that subset to
    // annotate the matching sequence.
    for (Iterator i = names.iterator(); i.hasNext(); ) {
      final String name = (String) i.next();
      GFFRecordFilter filt = new GFFRecordFilter() {
        public boolean accept(GFFRecord rec) {
          return rec.getSeqName().equals(name);
        }
      };

      GFFEntrySet filtered = ents.filter(filt);
      // The annotated sequence is deliberately not kept; the database
      // itself is what gets returned.
      GFFTools.annotateSequence(seqs.getSequence(name), filtered);
    }

    return seqs;
  }

  /**
   * Creates a GFFEntrySet containing one entry for each feature on a sequence.
   *
   * @param seq the Sequence to create features for
   * @return a new GFFEntrySet with gff records for each feature on the sequence
   * @throws BioException if something went wrong GFF-ifying the sequence's
   *         features
   */
  public static GFFEntrySet gffFromSequence(Sequence seq)
    throws BioException {
    SequencesAsGFF sagff = new SequencesAsGFF();
    GFFEntrySet gffES = new GFFEntrySet();
    sagff.processSequence(seq, gffES.getAddHandler());
    return gffES;
  }

  /**
   * Creates a GFFEntrySet containing one entry for each feature on each
   * sequence of a SequenceDB.
   *
   * <p><em>Note:</em> This converts all features in the whole database to
   * in-memory GFFRecord instances. This will take up considerable memory for
   * large databases.</p>
   *
   * @param seqDB the SequenceDB to create features for
   * @return a new GFFEntrySet with gff records for each feature on the database
   * @throws BioException if something went wrong GFF-ifying the sequences'
   *         features
   */
  public static GFFEntrySet gffFromSeqDB(SequenceDB seqDB)
    throws BioException {
    GFFEntrySet gffES = new GFFEntrySet();
    for (SequenceIterator si = seqDB.sequenceIterator(); si.hasNext(); ) {
      Sequence seq = si.nextSequence();
      SequencesAsGFF sagff = new SequencesAsGFF();
      sagff.processSequence(seq, gffES.getAddHandler());
    }
    return gffES;
  }
}