001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.program.gff3;
023
024import java.io.BufferedReader;
025import java.io.IOException;
026import java.util.ArrayList;
027import java.util.List;
028import java.util.NoSuchElementException;
029import java.util.StringTokenizer;
030
031import org.biojava.bio.Annotation;
032import org.biojava.bio.BioError;
033import org.biojava.bio.BioException;
034import org.biojava.bio.program.gff.GFFErrorHandler;
035import org.biojava.bio.program.gff.GFFTools;
036import org.biojava.bio.program.gff.IgnoreRecordException;
037import org.biojava.bio.seq.StrandedFeature;
038import org.biojava.ontology.AlreadyExistsException;
039import org.biojava.ontology.OntoTools;
040import org.biojava.ontology.Ontology;
041import org.biojava.ontology.OntologyException;
042import org.biojava.ontology.Term;
043import org.biojava.utils.ChangeVetoException;
044import org.biojava.utils.ParserException;
045
046/**
047 * Parse a stream of GFF text into a stream of records and comments.
048 *
049 * <p>
050 * Developed from {@link org.biojava.bio.program.gff.GFFParser GFFParser}.
051 * </p>
052 *
053 * @author Matthew Pocock
054 */
055public class GFF3Parser {
056  private GFFErrorHandler errors = GFFErrorHandler.ABORT_PARSING;
057
058  /**
059  * Set the error handler used by this parser.
060  */
061
062  public void setErrorHandler(GFFErrorHandler errors) {
063    this.errors = errors;
064  }
065
066  /**
067  * Find the error handler used by this parser.
068  */
069
070  public GFFErrorHandler getErrorHandler() {
071    return errors;
072  }
073
074  /**
075  * Informs <span class="arg">handler</span> of each line of
076  * gff read from <span class="arg">bReader</span>.  This form
077  * of the method should only be used if no locator string is
078  * available for the resource being parsed.
079  *
080  * @param bReader the <span class="type">BufferedReader</span> to parse
081  * @param handler the <span class="type">GFF3DocumentHandler</span> that will
082  *                listen for 'stuff'
083  * @param ontology  an Ontology that all terms should come from
084  *
085  * @throws <span class="type">IOException</span> if for any reason
086  *         <span class="arg">bReader</span> throws one
087  * @throws <span class="type">BioException</span> if
088  *         <span class="arg">handler</span> can not correct a parse error
089  */
090
091  public void parse(
092    BufferedReader bReader,
093    GFF3DocumentHandler handler,
094    Ontology ontology
095  )
096  throws IOException, BioException, ParserException
097  {
098    parse(bReader, handler, ontology, "unknown:");
099  }
100
101  /**
102  * Informs <span class="arg">handler</span> of each line of
103  * GFF read from <span class="arg">bReader</span>
104  *
105  * @param bReader the <span class="type">BufferedReader</span> to parse
106  * @param handler the <span class="type">GFF3DocumentHandler</span> that will
107  *                listen for 'stuff'
108  * @param ontology  an Ontology that all terms should come from
109  *
110  * @throws <span class="type">IOException</span> if for any reason
111  *         <span class="arg">bReader</span> throws one
112  * @throws <span class="type">BioException</span> if
113  *         <span class="arg">handler</span> can not correct a parse error
114  */
115
116  public void parse(
117    BufferedReader bReader,
118    GFF3DocumentHandler handler,
119    Ontology ontology,
120    String locator
121  )
122  throws IOException, BioException, ParserException
123  {
124    Ontology fallBack;
125    try {
126        fallBack = OntoTools.getDefaultFactory().createOntology(
127            "Unknown in " + locator,
128            ""
129        );
130    } catch (OntologyException ex) {
131        throw new ParserException("Couldn't create fallback ontology", ex);
132    }
133
134    handler.startDocument(locator);
135    ArrayList aList = new ArrayList();
136    int lineNum = 0;
137    for(String line = bReader.readLine(); line != null; line = bReader.readLine()) {
138      ++lineNum;
139
140      try {
141        aList.clear();
142        if(line.startsWith("#")) {
143          handler.commentLine(line.substring(1));
144        } else if (line.length() == 0) {
145        } else {
146          StringTokenizer st = new StringTokenizer(line, "\t", false);
147          while(st.hasMoreTokens() && aList.size() < 8) {
148            String token = st.nextToken();
149            aList.add(token);
150          }
151          String rest = null;
152          String comment = null;
153          if(st.hasMoreTokens()) {
154            try {
155              rest = st.nextToken(((char) 0) + "");
156            } catch (NoSuchElementException nsee) {
157            }
158          }
159          if(rest != null) {
160            int ci = rest.indexOf("#");
161            if (ci != -1) {
162              comment = rest.substring(ci);
163              rest = rest.substring(0, ci);
164            }
165          }
166          GFF3Record record = createRecord(handler, aList, rest, comment, ontology, fallBack);
167          handler.recordLine(record);
168        }
169      } catch (ParserException ex) {
170        throw new ParserException(ex, "",
171        locator,
172        lineNum,
173        line);
174      } catch (IgnoreRecordException ex) {
175        // Silently skip any more work on this record
176      }
177    }
178    handler.endDocument();
179  }
180
181  /**
182  * Actually turns a list of tokens, some value string and a comment into a
183  * <span class="type">GFF3Record</span> and informs
184  * <span class="arg">handler</span>.
185  *
186  * @param handler a <span class="type">GFF3DocumentHandler</span> to inform of
187  *                any parse errors, and the completed <span class="type">GFF3Record</span>
188  * @param aList   a <span class="type">List</span> containing the 8 mandatory GFF columns
189  * @param rest    a <span class="type">String</span> representing the unparsed
190  *                attribute-value text, or <span class="kw">null</span> if there is none
191  * @param comment a <span class="type">String</span> containing the comment (without the
192  *                leading '<code>#</code>' character.
193  * @param ontology  the Ontology to resolve Terms in
194  * @throws <span class="type">BioException</span> if <span class="arg">handler</span>
195  *         could not correct a parse error
196  */
197  protected GFF3Record createRecord(
198    GFF3DocumentHandler handler,
199    List aList,
200    String rest,
201    String comment,
202    Ontology ontology,
203    Ontology fallBack
204  )
205  throws BioException, ParserException, IgnoreRecordException
206  {
207    GFF3Record.Impl record = new GFF3Record.Impl();
208
209    record.setSequenceID((String) aList.get(0));
210
211    {
212      Term st;
213      String stn = (String) aList.get(1);
214      if(ontology.containsTerm(stn)) {
215        st = ontology.getTerm(stn);
216      } else if(fallBack.containsTerm(stn)) {
217        st = fallBack.getTerm(stn);
218      } else {
219        try {
220          st = fallBack.createTerm(stn, "");
221        } catch (AlreadyExistsException te) {
222          throw new BioError("Assertion Failure: Term should not yet exist", te);
223        } catch (ChangeVetoException cve) {
224          throw new BioError("Assertion Failure: Unable to create term", cve);
225        }
226      }
227      record.setSource(st);
228    }
229
230    {
231      Term tt;
232      String ttn = (String) aList.get(2);
233      if(ontology.containsTerm(ttn)) {
234        tt = ontology.getTerm(ttn);
235      } else if(fallBack.containsTerm(ttn)) {
236        tt = fallBack.getTerm(ttn);
237      } else {
238        try {
239          tt = fallBack.createTerm(ttn, "");
240        } catch (AlreadyExistsException te) {
241          throw new BioError("Assertion Failure: Term should not yet exist", te);
242        } catch (ChangeVetoException cve) {
243          throw new BioError("Assertion Failure: Unable to create term", cve);
244        }
245      }
246      record.setType(tt);
247    }
248
249    int start = -1;
250    try {
251      start = Integer.parseInt( (String) aList.get(3));
252    } catch (NumberFormatException nfe) {
253      start = errors.invalidStart((String) aList.get(3));
254    }
255    record.setStart(start);
256
257    int end = -1;
258    try {
259      end = Integer.parseInt( (String) aList.get(4));
260    } catch (NumberFormatException nfe) {
261      end = errors.invalidEnd((String) aList.get(3));
262    }
263    record.setEnd(end);
264
265    String score = (String) aList.get(5);
266    if(
267      score == null     ||
268      score.equals("")  ||
269    score.equals(".") ||
270    score.equals("0")
271    )
272    {
273      record.setScore(GFFTools.NO_SCORE);
274    } else {
275      double sc = 0.0;
276      try {
277        sc = Double.parseDouble(score);
278      } catch (NumberFormatException nfe) {
279        sc = errors.invalidScore(score);
280      }
281      record.setScore(sc);
282    }
283
284    String strand = (String) aList.get(6);
285    if(strand == null || strand.equals("") || strand.equals(".")) {
286      record.setStrand(StrandedFeature.UNKNOWN);
287    } else {
288      if(strand.equals("+")) {
289        record.setStrand(StrandedFeature.POSITIVE);
290      } else if(strand.equals("-")) {
291        record.setStrand(StrandedFeature.NEGATIVE);
292      } else {
293        record.setStrand(errors.invalidStrand(strand));
294      }
295    }
296
297    String frame = (String) aList.get(7);
298    if(frame.equals(".")) {
299      record.setPhase(GFFTools.NO_FRAME);
300    } else {
301      int fr = 0;
302      try {
303        fr = Integer.parseInt(frame);
304      } catch (NumberFormatException nfe) {
305        fr = errors.invalidFrame(frame);
306      }
307      record.setPhase(fr);
308    }
309
310    if (rest != null) {
311      try {
312        parseAttribute(rest, record.getAnnotation(), ontology, fallBack);
313      } catch (ChangeVetoException cve) {
314        throw new BioException("Unable to populate annotations", cve);
315      }
316    }
317
318    return record;
319  }
320
321  /**
322  * Parse <span class="arg">attValList</span> into a
323  * <span class="type">Map</span> of attributes and value lists.
324  * <p>
325  * Populates an Annotation instance with Ontology Term keys and string/list
326  * values.
327  * </p>
328  *
329  * @param attValList  the <span class="type">String</span> to parse
330  */
331
332  protected void parseAttribute(String attValList, Annotation anno, Ontology onto, Ontology fallBack)
333  throws ChangeVetoException {
334    StringTokenizer sTok = new StringTokenizer(attValList, ";", false);
335    while(sTok.hasMoreTokens()) {
336      String attVal = sTok.nextToken().trim();
337      String attName;
338      List valList = new ArrayList();
339      int spaceIndx = attVal.indexOf("=");
340      if(spaceIndx == -1) {
341        attName = attVal;
342      } else {
343        attName = attVal.substring(0, spaceIndx);
344        attValList = attVal.substring(spaceIndx+1).trim();
345        while(attValList.length() > 0) {
346          if(attValList.startsWith("\"")) {
347            // System.out.println("Quoted");
348            int quoteIndx = 0;
349            do {
350              quoteIndx++;
351              quoteIndx = attValList.indexOf("\"", quoteIndx);
352            } while(quoteIndx != -1 && attValList.charAt(quoteIndx-1) == '\\');
353            if(quoteIndx > 0){
354              valList.add(attValList.substring(1, quoteIndx));
355              attValList = attValList.substring(quoteIndx+1).trim();
356            }else{
357              valList.add(attValList);
358              attValList = "";
359            }
360          } else {
361            int commaIndx = attValList.indexOf(",");
362            if(commaIndx == -1) {
363              valList.add(attValList);
364              attValList = "";
365            } else {
366              valList.add(attValList.substring(0, commaIndx));
367              attValList = attValList.substring(commaIndx+1).trim();
368            }
369          }
370        }
371      }
372
373      Term key;
374      if(onto.containsTerm(attName)) {
375        key = onto.getTerm(attName);
376      } else if(fallBack.containsTerm(attName)) {
377        key = fallBack.getTerm(attName);
378      } else {
379        try {
380          key = fallBack.createTerm(attName, "");
381        } catch (AlreadyExistsException te) {
382          throw new BioError("Assertion Failure: Term should not be there yet",te);
383        } catch (ChangeVetoException cve) {
384          throw new BioError("Assertion Failure: Unable to create term", cve);
385        }
386      }
387      anno.setProperty(key, valList);
388    }
389  }
390}