001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.genome.parsers.gff;
022
023import org.slf4j.Logger;
024import org.slf4j.LoggerFactory;
025
026import java.io.BufferedReader;
027import java.io.FileReader;
028import java.io.IOException;
029import java.util.ArrayList;
030import java.util.List;
031import java.util.regex.Pattern;
032
033
034/**
035 * http://www.bioperl.org/wiki/GTF
036 * Read and write FeatureLists as GFF/GTF formatted files.
037 *<br><br>
038 * The GFF moniker is applied to a variety of tab-delimited formats
039 * that mock the notion of a standard. This class should parse most files
040 * bearing at least a passing resemblance to any of the formats. You will, however, need
041 * to research the semantics of the files you encounter. Generally,
042 * the format consists of 9 tab-delimited fields:
043 * <br>
044 * <pre>
045 * seqname   source   featureType   start   end   score   strand   frame   attributes
046 * </pre>
047 * The 9th field consists of key-value pairs separated by semicolons, the first of which JavaGene interprets
048 * as the group id (as used in GFF1). It is the precise meaning of this 9th field that
049 * varies from week to week. The Feature and FeatureList objects provide various utility methods to
050 * ease the task of accessing and using the attributes. The proper interpretation of any
051 * particular attribute, however, is left to you.
052 *
053 * @author Hanno Hinsch
054 */
055public class GFF3Reader {
056
057        private static final Logger logger = LoggerFactory.getLogger(GFF3Reader.class);
058
059        private static final  Pattern p = Pattern.compile("\t");
060
061        /**
062         * Read a file into a FeatureList. Each line of the file becomes one Feature object.
063         *
064         * @param filename The path to the GFF file.
065         * @return A FeatureList.
066         * @throws IOException Something went wrong -- check exception detail message.
067         */
068
069        public static FeatureList read(String filename, List<String> indexes) throws IOException {
070                logger.info("Reading: {}", filename);
071
072                FeatureList features = new FeatureList();
073                features.addIndexes(indexes);
074                BufferedReader br = new BufferedReader(new FileReader(filename));
075
076                String s;
077                for (s = br.readLine(); null != s; s = br.readLine()) {
078                        s = s.trim();
079
080                        if (s.length() > 0) {
081                                if (s.charAt(0) == '#') {
082                                        //ignore comment lines
083                                        if(s.startsWith("##fasta"))
084                                                break;
085                                } else {
086
087                                        FeatureI f = parseLine(s);
088                                        if (f != null) {
089                                                features.add(f);
090
091                                        }
092                                }
093                        }
094
095                }
096
097                br.close();
098                return features;
099        }
100
101
102        public static FeatureList read(String filename) throws IOException {
103           return read(filename,new ArrayList<String>(0));
104        }
105
106
107        /**
108         * create Feature from line of GFF file
109         */
110        private static Feature parseLine(String s) {
111                //FIXME update to use regex split on tabs
112                //FIXME better errors on parse failures
113                String[] line = p.split(s);
114                String seqname =line[0].trim();
115
116                String source =line[1].trim();
117
118                String type =line[2].trim();
119
120
121                String locStart =line[3].trim();
122
123                String locEnd =line[4].trim();
124
125                Double score;
126
127                try {
128                        score = Double.parseDouble(line[5].trim());
129                } catch (Exception e) {
130                        score = 0.0;
131                }
132
133
134                char strand = line[6].trim().charAt(0);
135                //added by scooter willis to deal with glimmer predictions that
136                //have the start after the end but is a negative strand
137                int locationStart = Integer.parseInt(locStart);
138                int locationEnd = Integer.parseInt(locEnd);
139                if(locationStart > locationEnd){
140                        int temp = locationStart;
141                        locationStart = locationEnd;
142                        locationEnd = temp;
143
144                }
145                Location location = Location.fromBio(locationStart, locationEnd, strand);
146
147                assert (strand == '-') == location.isNegative();
148
149                int frame;
150                try {
151                        frame = Integer.parseInt(line[7].trim());
152                } catch (Exception e) {
153                        frame = -1;
154                }
155                String attributes=line[8];
156        /*    //grab everything until end of line (or # comment)
157                start = end + 1;
158                end = s.indexOf('#', start);
159                String attributes = null;
160                if (end < 0) {
161                        attributes = new String(s.substring(start));
162                } else {
163                        attributes = new String(s.substring(start, end));
164                }
165 */
166                return new Feature(seqname, source, type, location, score, frame, attributes.split("#")[0]);
167
168        }
169
170
171
172
173        public static void main(String[] args) throws Exception {
174                long start = System.currentTimeMillis();
175                @SuppressWarnings("unused")
176                FeatureList listGenes = GFF3Reader.read("/home/melo/workspace/release/stdout.combined.checked2.gtf");
177                long stop = System.currentTimeMillis();
178                logger.info("Loading = {}", stop-start);
179//        logger.info(listGenes);
180                //      GeneMarkGTF.write( list, args[1] );
181        }
182}