001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.genome.parsers.gff;
022
023import org.slf4j.Logger;
024import org.slf4j.LoggerFactory;
025
026import java.io.BufferedReader;
027import java.io.FileReader;
028import java.io.IOException;
029
030/**
031 * http://www.bioperl.org/wiki/GTF
 * Reads FeatureLists from GFF/GTF formatted files (the corresponding writer is currently commented out).
033 *<br><br>
034 * The GFF moniker is applied to a variety of tab-delimited formats
035 * that mock the notion of a standard. This class should parse most files
036 * bearing at least a passing resemblance to any of the formats. You will, however, need
037 * to research the semantics of the files you encounter. Generally,
038 * the format consists of 9 tab-delimited fields:
039 * <br>
040 * <pre>
041 * seqname   source   featureType   start   end   score   strand   frame   attributes
042 * </pre>
043 * The 9th field consists of key-value pairs separated by semicolons, the first of which JavaGene interprets
044 * as the group id (as used in GFF1). It is the precise meaning of this 9th field that
045 * varies from week to week. The Feature and FeatureList objects provide various utility methods to
046 * ease the task of accessing and using the attributes. The proper interpretation of any
047 * particular attribute, however, is left to you.
048 *
049 * @author Hanno Hinsch
050 */
051public class GeneMarkGTFReader {
052
053        private static final Logger logger = LoggerFactory.getLogger(GeneMarkGTFReader.class);
054
055        /**
056         * Read a file into a FeatureList. Each line of the file becomes one Feature object.
057         *
058         * @param filename The path to the GFF file.
059         * @return A FeatureList.
060         * @throws IOException Something went wrong -- check exception detail message.
061         */
062        public static FeatureList read(String filename) throws IOException {
063                logger.info("Reading: {}", filename);
064
065                FeatureList features = new FeatureList();
066                BufferedReader br = new BufferedReader(new FileReader(filename));
067
068                String s;
069                for (s = br.readLine(); null != s; s = br.readLine()) {
070                        s = s.trim();
071
072                        if (s.length() > 0) {
073                                if (s.charAt(0) == '#') {
074                                        //ignore comment lines
075                                } else {
076
077                                        FeatureI f = parseLine(s);
078                                        if (f != null) {
079                                                features.add(f);
080                                        }
081                                }
082                        }
083
084                }
085
086                br.close();
087                return features;
088        }
089
090        /**
091         * create Feature from line of GFF file
092         */
093        private static Feature parseLine(String s) {
094                //FIXME update to use regex split on tabs
095                //FIXME better errors on parse failures
096                int start = 0;
097                int end = 0;
098
099                start = end;
100                end = s.indexOf('\t', start);
101                String seqname = s.substring(start, end).trim();
102
103                start = end + 1;
104                end = s.indexOf('\t', start);
105                String source = s.substring(start, end).trim();
106
107                start = end + 1;
108                end = s.indexOf('\t', start);
109                String type = s.substring(start, end);
110
111                start = end + 1;
112                end = s.indexOf('\t', start);
113                String locStart = s.substring(start, end);
114
115                start = end + 1;
116                end = s.indexOf('\t', start);
117                String locEnd = s.substring(start, end);
118
119                Double score;
120                start = end + 1;
121                end = s.indexOf('\t', start);
122                try {
123                        score = Double.parseDouble(s.substring(start, end));
124                } catch (Exception e) {
125                        score = 0.0;
126                }
127
128                start = end + 1;
129                end = s.indexOf('\t', start);
130                char strand = s.charAt(end - 1);
131
132                Location location = Location.fromBio(Integer.parseInt(locStart), Integer.parseInt(locEnd), strand);
133
134                assert (strand == '-') == location.isNegative();
135
136                int frame;
137                start = end + 1;
138                end = s.indexOf('\t', start);
139                try {
140                        frame = Integer.parseInt(s.substring(start, end));
141                } catch (Exception e) {
142                        frame = -1;
143                }
144
145                //grab everything until end of line (or # comment)
146                start = end + 1;
147                end = s.indexOf('#', start);
148                String attributes = null;
149                if (end < 0) {
150                        attributes = new String(s.substring(start));
151                } else {
152                        attributes = new String(s.substring(start, end));
153                }
154
155                return new Feature(seqname, source, type, location, score, frame, attributes);
156
157        }
158/*
159
160        public static void write(FeatureList features, String filename) throws IOException {
161                logger.info("Writing: {}", filename);
162
163                BufferedWriter bw = new BufferedWriter(new FileWriter(filename));
164
165                ListIterator iter = features.listIterator();
166                while (iter.hasNext()) {
167                        Feature feature = (Feature) iter.next();
168                        writeLine(feature, bw);
169                }
170
171                bw.close();
172        }
173
174        private static void writeLine(Feature f, BufferedWriter bw) throws IOException {
175                String s = f.seqname() + '\t';
176                s += f.source() + '\t';
177                s += f.type() + '\t';
178
179                s += f.location().bioStart() + "\t";
180                s += f.location().bioEnd() + "\t";
181                s += Double.toString(f.score()) + "\t";
182                s += f.location().bioStrand() + "\t";
183
184                if (f.frame() == -1) {
185                        s += ".\t";
186                } else {
187                        s += f.frame() + "\t";
188                }
189
190                s += f.attributes();
191
192                bw.write(s);
193                bw.newLine();
194        }
195 */
196
197        public static void main(String[] args) throws Exception {
198
199                FeatureList listGenes = GeneMarkGTFReader.read("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/genemark_hmm.gtf");
200
201                for(FeatureI feature : listGenes){
202                        logger.info("Gene Feature: {}", feature);
203                }
204//        logger.info(listGenes);
205                //      GeneMarkGTFReader.write( list, args[1] );
206        }
207}