001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.genome.parsers.gff;
022
023import java.nio.file.Files;
024import java.nio.file.Path;
025import java.nio.file.Paths;
026import org.slf4j.Logger;
027import org.slf4j.LoggerFactory;
028
029import java.io.BufferedReader;
030import java.io.IOException;
031import java.util.ArrayList;
032import java.util.List;
033import java.util.regex.Pattern;
034
035
036/**
037 * http://www.bioperl.org/wiki/GTF
038 * Read and write FeatureLists as GFF/GTF formatted files.
039 *<br><br>
040 * The GFF moniker is applied to a variety of tab-delimited formats
041 * that mock the notion of a standard. This class should parse most files
042 * bearing at least a passing resemblance to any of the formats. You will, however, need
043 * to research the semantics of the files you encounter. Generally,
044 * the format consists of 9 tab-delimited fields:
045 * <br>
046 * <pre>
047 * seqname   source   featureType   start   end   score   strand   frame   attributes
048 * </pre>
049 * The 9th field consists of key-value pairs separated by semicolons, the first of which JavaGene interprets
050 * as the group id (as used in GFF1). It is the precise meaning of this 9th field that
051 * varies from week to week. The Feature and FeatureList objects provide various utility methods to
052 * ease the task of accessing and using the attributes. The proper interpretation of any
053 * particular attribute, however, is left to you.
054 *
055 * @author Hanno Hinsch
056 */
057public class GFF3Reader {
058
059        private static final Logger logger = LoggerFactory.getLogger(GFF3Reader.class);
060
061        private static final  Pattern p = Pattern.compile("\t");
062
063        /**
064         * Read a file into a FeatureList. Each line of the file becomes one Feature object.
065         *
066         * @param filename The path to the GFF file.
067         * @return A FeatureList.
068         * @throws IOException Something went wrong -- check exception detail message.
069         */
070        public static FeatureList read(String filename, List<String> indexes) throws IOException {
071                return read(Paths.get(filename), indexes);
072        }
073
074        /**
075         * Read a file into a FeatureList. Each line of the file becomes one Feature object.
076         *
077         * @param path The path to the GFF file.
078         * @return A FeatureList.
079         * @throws IOException Something went wrong -- check exception detail message.
080         */
081        public static FeatureList read(Path path, List<String> indexes) throws IOException {
082                logger.info("Reading: {}", path.toString());
083
084                FeatureList features = new FeatureList();
085                features.addIndexes(indexes);
086                try (BufferedReader br = Files.newBufferedReader(path)) {
087
088                        String s;
089                        for (s = br.readLine(); null != s; s = br.readLine()) {
090                                s = s.trim();
091
092                                if (s.length() > 0) {
093                                        if (s.charAt(0) == '#') {
094                                                //ignore comment lines
095                                                if (s.startsWith("##fasta"))
096                                                        break;
097                                        } else {
098
099                                                FeatureI f = parseLine(s);
100                                                if (f != null) {
101                                                        features.add(f);
102
103                                                }
104                                        }
105                                }
106
107                        }
108
109                }
110                return features;
111        }
112
113
114        public static FeatureList read(String filename) throws IOException {
115           return read(filename,new ArrayList<String>(0));
116        }
117
118        public static FeatureList read(Path path) throws IOException {
119                return read(path,new ArrayList<String>(0));
120        }
121
122
123        /**
124         * create Feature from line of GFF file
125         */
126        private static Feature parseLine(String s) {
127                //FIXME update to use regex split on tabs
128                //FIXME better errors on parse failures
129                String[] line = p.split(s);
130                String seqname =line[0].trim();
131
132                String source =line[1].trim();
133
134                String type =line[2].trim();
135
136
137                String locStart =line[3].trim();
138
139                String locEnd =line[4].trim();
140
141                Double score;
142
143                try {
144                        score = Double.parseDouble(line[5].trim());
145                } catch (Exception e) {
146                        score = 0.0;
147                }
148
149
150                char strand = line[6].trim().charAt(0);
151                //added by scooter willis to deal with glimmer predictions that
152                //have the start after the end but is a negative strand
153                int locationStart = Integer.parseInt(locStart);
154                int locationEnd = Integer.parseInt(locEnd);
155                if(locationStart > locationEnd){
156                        int temp = locationStart;
157                        locationStart = locationEnd;
158                        locationEnd = temp;
159
160                }
161                Location location = Location.fromBio(locationStart, locationEnd, strand);
162
163                assert (strand == '-') == location.isNegative();
164
165                int frame;
166                try {
167                        frame = Integer.parseInt(line[7].trim());
168                } catch (Exception e) {
169                        frame = -1;
170                }
171                String attributes=line[8];
172        /*    //grab everything until end of line (or # comment)
173                start = end + 1;
174                end = s.indexOf('#', start);
175                String attributes = null;
176                if (end < 0) {
177                        attributes = new String(s.substring(start));
178                } else {
179                        attributes = new String(s.substring(start, end));
180                }
181 */
182                return new Feature(seqname, source, type, location, score, frame, attributes.split("#")[0]);
183
184        }
185
186
187
188
189        public static void main(String[] args) throws Exception {
190                long start = System.currentTimeMillis();
191                @SuppressWarnings("unused")
192                FeatureList listGenes = GFF3Reader.read("/home/melo/workspace/release/stdout.combined.checked2.gtf");
193                long stop = System.currentTimeMillis();
194                logger.info("Loading = {}", stop-start);
195//        logger.info(listGenes);
196                //      GeneMarkGTF.write( list, args[1] );
197        }
198}