001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.program.gff;
023
024import java.io.BufferedReader;
025import java.io.IOException;
026import java.util.ArrayList;
027import java.util.List;
028import java.util.Map;
029import java.util.NoSuchElementException;
030import java.util.StringTokenizer;
031
032import org.biojava.bio.BioException;
033import org.biojava.bio.seq.StrandedFeature;
034import org.biojava.utils.ParserException;
035import org.biojava.utils.SmallMap;
036
037/**
038 * Parse a stream of GFF text into a stream of records and comments.
039 *
040 * @author Matthew Pocock
041 * @author Thomas Down
042 * @author Keith James (docs)
043 */
044public class GFFParser {
045    private GFFErrorHandler errors = GFFErrorHandler.ABORT_PARSING;
046
047    /**
048     * Set the error handler used by this parser.
049     */
050
051    public void setErrorHandler(GFFErrorHandler errors) {
052        this.errors = errors;
053    }
054
055    /**
056     * Find the error handler used by this parser.
057     */
058
059    public GFFErrorHandler getErrorHandler() {
060        return errors;
061    }
062
063    /**
064     * Informs <span class="arg">handler</span> of each line of
065     * gff read from <span class="arg">bReader</span>.  This form
066     * of the method should only be used if no locator string is
067     * available for the resource being parsed.
068     *
069     * @param bReader the <span class="type">BufferedReader</span> to parse
070     * @param handler the <span class="type">GFFDocumentHandler</span> that will
071     *                listen for 'stuff'
072     *
073     * @throws <span class="type">IOException</span> if for any reason
074     *         <span class="arg">bReader</span> throws one
075     * @throws <span class="type">BioException</span> if
076     *         <span class="arg">handler</span> can not correct a parse error
077     */
078
079    public void parse(BufferedReader bReader, GFFDocumentHandler handler)
080        throws IOException, BioException, ParserException
081    {
082        parse(bReader, handler, "unknown:");
083    }
084
085    /**
086     * Informs <span class="arg">handler</span> of each line of
087     * GFF read from <span class="arg">bReader</span>
088     *
089     * @param bReader the <span class="type">BufferedReader</span> to parse
090     * @param handler the <span class="type">GFFDocumentHandler</span> that will
091     *                listen for 'stuff'
092     *
093     * @throws <span class="type">IOException</span> if for any reason
094     *         <span class="arg">bReader</span> throws one
095     * @throws <span class="type">BioException</span> if
096     *         <span class="arg">handler</span> can not correct a parse error
097     */
098
099    public void parse(BufferedReader bReader, GFFDocumentHandler handler, String locator)
100        throws IOException, BioException, ParserException
101    {
102        handler.startDocument(locator);
103        ArrayList aList = new ArrayList();
104        int lineNum = 0;
105        for(String line = bReader.readLine(); line != null; line = bReader.readLine()) {
106            ++lineNum;
107
108            try {
109                aList.clear();
110                if(line.startsWith("#")) {
111                    handler.commentLine(line.substring(1));
112                } else if (line.length() == 0) {
113                } else {
114                    StringTokenizer st = new StringTokenizer(line, "\t", false);
115                    while(st.hasMoreTokens() && aList.size() < 8) {
116                        String token = st.nextToken();
117                        aList.add(token);
118                    }
119
120                    if(aList.size() < 7) {
121                      throw new ParserException(
122                        "Line doesn't look like GFF",
123                        locator,
124                        lineNum,
125                        line );
126                    }
127
128                    String rest = null;
129                    String comment = null;
130                    if(st.hasMoreTokens()) {
131                        try {
132                            rest = st.nextToken(((char) 0) + "");
133                        } catch (NoSuchElementException nsee) {
134                        }
135                    }
136                    if(rest != null) {
137                        int ci = rest.indexOf("#");
138                        if (ci != -1) {
139                            comment = rest.substring(ci);
140                            rest = rest.substring(0, ci);
141                        }
142                    }
143
144                    GFFRecord record = createRecord(handler, aList, rest, comment);
145                    handler.recordLine(record);
146                }
147            } catch (ParserException ex) {
148                throw new ParserException(ex.getMessage(),
149                                          locator,
150                                          lineNum,
151                                          line);
152            } catch (IgnoreRecordException ex) {
153                // Silently skip any more work on this record
154            }
155        }
156        handler.endDocument();
157    }
158
159  /**
160   * Actually turns a list of tokens, some value string and a comment into a
161   * <span class="type">GFFRecord</span> and informs
162   * <span class="arg">handler</span>.
163   *
164   * @param handler a <span class="type">GFFDocumentHandler</span> to inform of
165   *                any parse errors, and the completed <span class="type">GFFRecord</span>
166   * @param aList   a <span class="type">List</span> containing the 8 mandatory GFF columns
167   * @param rest    a <span class="type">String</span> representing the unparsed
168   *                attribute-value text, or <span class="kw">null</span> if there is none
169   * @param comment a <span class="type">String</span> containing the comment (without the
170   *                leading '<code>#</code>' character.
171   * @throws <span class="type">BioException</span> if <span class="arg">handler</span>
172   *         could not correct a parse error
173   */
174    protected GFFRecord createRecord(GFFDocumentHandler handler,
175                                     List aList,
176                                     String rest,
177                                     String comment)
178        throws BioException, ParserException, IgnoreRecordException
179    {
180        SimpleGFFRecord record = new SimpleGFFRecord();
181
182        record.setSeqName((String) aList.get(0));
183        record.setSource((String) aList.get(1));
184        record.setFeature((String) aList.get(2));
185
186        int start = -1;
187        try {
188            start = Integer.parseInt( (String) aList.get(3));
189        } catch (NumberFormatException nfe) {
190            start = errors.invalidStart((String) aList.get(3));
191        }
192        record.setStart(start);
193
194        int end = -1;
195        try {
196            end = Integer.parseInt( (String) aList.get(4));
197        } catch (NumberFormatException nfe) {
198            end = errors.invalidEnd((String) aList.get(3));
199        }
200        record.setEnd(end);
201
202        String score = (String) aList.get(5);
203        if(score == null     ||
204           score.equals("")  ||
205           score.equals(".") ||
206           score.equals("0")
207           ) 
208        {
209            record.setScore(GFFTools.NO_SCORE);
210        } else {
211            double sc = 0.0;
212            try {
213                sc = Double.parseDouble(score);
214            } catch (NumberFormatException nfe) {
215                sc = errors.invalidScore(score);
216            }
217            record.setScore(sc);
218        }
219
220        String strand = (String) aList.get(6);
221        if(strand == null || strand.equals("") || strand.equals(".")) {
222            record.setStrand(StrandedFeature.UNKNOWN);
223        } else {
224            if(strand.equals("+")) {
225                record.setStrand(StrandedFeature.POSITIVE);
226            } else if(strand.equals("-")) {
227                record.setStrand(StrandedFeature.NEGATIVE);
228            } else {
229                record.setStrand(errors.invalidStrand(strand));
230            }
231        }
232
233        String frame = (String) aList.get(7);
234        if(frame.equals(".")) {
235            record.setFrame(GFFTools.NO_FRAME);
236        } else {
237            int fr = 0;
238            try {
239                fr = Integer.parseInt(frame);
240            } catch (NumberFormatException nfe) {
241                fr = errors.invalidFrame(frame);
242            }
243            record.setFrame(fr);
244        }
245
246        if (rest != null)
247            record.setGroupAttributes(parseAttribute(rest));
248        else
249            record.setGroupAttributes(new SmallMap());
250        record.setComment(comment);
251
252        return record;
253    }
254
255    /**
256     * Parse <span class="arg">attValList</span> into a
257     * <span class="type">Map</span> of attributes and value lists.
258     * <p>
259     * The resulting <span class="type">Map</span> will have
260     * <span class="type">String</span> keys, with
261     * <span class="type">List</span> values. If there are no values
262     * associated with a key, then it will have an empty
263     * <span class="type">List</span>, not <span class="kw">null</span> as
264     * its value.
265     *
266     * @param attValList  the <span class="type">String</span> to parse
267     * @return a <span class="type">Map</span> of parsed attributes and value lists
268     */
269
270    protected Map parseAttribute(String attValList) {
271        Map attMap = new SmallMap();
272
273        StringTokenizer sTok = new StringTokenizer(attValList, ";", false);
274        while(sTok.hasMoreTokens()) {
275            String attVal = sTok.nextToken().trim();
276            String attName;
277            List valList = new ArrayList();
278            int spaceIndx = attVal.indexOf(" ");
279            if(spaceIndx == -1) {
280                attName = attVal;
281            } else {
282                attName = attVal.substring(0, spaceIndx);
283                attValList = attVal.substring(spaceIndx).trim();
284                while(attValList.length() > 0) {
285                    if(attValList.startsWith("\"")) {
286                        // System.out.println("Quoted");
287                        int quoteIndx = 0;
288                        do {
289                            quoteIndx++;
290                            quoteIndx = attValList.indexOf("\"", quoteIndx);
291                        } while(quoteIndx != -1 && attValList.charAt(quoteIndx-1) == '\\');
292                        if(quoteIndx > 0){
293                          valList.add(attValList.substring(1, quoteIndx));
294                          attValList = attValList.substring(quoteIndx+1).trim();
295                        }else{
296                          valList.add(attValList);
297                          attValList = "";
298                        }
299                    } else {
300                        spaceIndx = attValList.indexOf(" ");
301                        if(spaceIndx == -1) {
302                            valList.add(attValList);
303                            attValList = "";
304                        } else {
305                            valList.add(attValList.substring(0, spaceIndx));
306                            attValList = attValList.substring(spaceIndx).trim();
307                        }
308                    }
309                }
310            }
311            attMap.put(attName, valList);
312        }
313
314        return attMap;
315    }
316}