001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.genome.parsers.gff;
022
023import org.slf4j.Logger;
024import org.slf4j.LoggerFactory;
025
026import java.io.*;
027import java.util.ListIterator;
028
029/**
030 * http://www.bioperl.org/wiki/GTF
031 * Read and write FeatureLists as GFF/GTF formatted files.
032 *<br><br>
033 * The GFF moniker is applied to a variety of tab-delimited formats
034 * that mock the notion of a standard. This class should parse most files
035 * bearing at least a passing resemblance to any of the formats. You will, however, need
036 * to research the semantics of the files you encounter. Generally,
037 * the format consists of 9 tab-delimited fields:
038 * <br>
039 * <pre>
040 * seqname   source   featureType   start   end   score   strand   frame   attributes
041 * </pre>
042 * The 9th field consists of key-value pairs separated by semicolons, the first of which JavaGene interprets
043 * as the group id (as used in GFF1). It is the precise meaning of this 9th field that
044 * varies from week to week. The Feature and FeatureList objects provide various utility methods to
045 * ease the task of accessing and using the attributes. The proper interpretation of any
046 * particular attribute, however, is left to you.
047 *
048 * @author Hanno Hinsch
049 */
050public class GeneIDGFF2Reader {
051
052        private static final Logger logger = LoggerFactory.getLogger(GeneIDGFF2Reader.class);
053
054        /**
055         * Read a file into a FeatureList. Each line of the file becomes one Feature object.
056         *
057         * @param filename The path to the GFF file.
058         * @return A FeatureList.
059         * @throws IOException Something went wrong -- check exception detail message.
060         */
061        public static FeatureList read(String filename) throws IOException {
062                logger.info("Reading: {}", filename);
063
064                FeatureList features = new FeatureList();
065                BufferedReader br = new BufferedReader(new FileReader(filename));
066
067                String s;
068                for (s = br.readLine(); null != s; s = br.readLine()) {
069                        s = s.trim();
070
071                        if (s.length() > 0) {
072                                if (s.charAt(0) == '#') {
073                                        //ignore comment lines
074                                } else {
075
076                                        FeatureI f = parseLine(s);
077                                        if (f != null) {
078                                                features.add(f);
079                                        }
080                                }
081                        }
082
083                }
084
085                br.close();
086                return features;
087        }
088
089        /**
090         * create Feature from line of GFF file
091         */
092        private static Feature parseLine(String s) {
093                //FIXME update to use regex split on tabs
094                //FIXME better errors on parse failures
095
096                int start = 0;
097                int end = 0;
098
099                start = end;
100                end = s.indexOf('\t', start);
101                String seqname = s.substring(start, end).trim();
102
103                start = end + 1;
104                end = s.indexOf('\t', start);
105                String source = s.substring(start, end).trim();
106
107                start = end + 1;
108                end = s.indexOf('\t', start);
109                String type = s.substring(start, end);
110
111                start = end + 1;
112                end = s.indexOf('\t', start);
113                String locStart = s.substring(start, end);
114
115                start = end + 1;
116                end = s.indexOf('\t', start);
117                String locEnd = s.substring(start, end);
118
119                Double score;
120                start = end + 1;
121                end = s.indexOf('\t', start);
122                try {
123                        score = Double.parseDouble(s.substring(start, end));
124                } catch (Exception e) {
125                        score = 0.0;
126                }
127
128                start = end + 1;
129                end = s.indexOf('\t', start);
130                char strand = s.charAt(end - 1);
131
132                Location location = Location.fromBio(Integer.parseInt(locStart), Integer.parseInt(locEnd), strand);
133
134                assert (strand == '-') == location.isNegative();
135
136                int frame;
137                start = end + 1;
138                end = s.indexOf('\t', start);
139                try {
140                        frame = Integer.parseInt(s.substring(start, end));
141                } catch (Exception e) {
142                        frame = -1;
143                }
144
145                //grab everything until end of line (or # comment)
146                start = end + 1;
147                end = s.indexOf('#', start);
148                String attributes = null;
149                if (end < 0) {
150                        attributes = new String(s.substring(start));
151                } else {
152                        attributes = new String(s.substring(start, end));
153                }
154                //need to add in attribute assignment for geneid where it just provides a gene name and will make it gtf like
155                attributes = "gene_id " + '"' + attributes + '"' + ";";
156                return new Feature(seqname, source, type, location, score, frame, attributes);
157
158        }
159
160        /**
161         * Write features in FeatureList to file. Each Feature becomes one line in the file.
162         * The userMap() data in the features is not written to file.
163         *
164         * @param features The list of features to write.
165         * @param filename The path to the file.
166         * @throws IOException Something went wrong -- check exception detail message.
167         */
168        public static void write(FeatureList features, String filename) throws IOException {
169                logger.info("Writing: {}", filename);
170
171                BufferedWriter bw = new BufferedWriter(new FileWriter(filename));
172
173                ListIterator<FeatureI> iter = features.listIterator();
174                while (iter.hasNext()) {
175                        Feature feature = (Feature) iter.next();
176                        writeLine(feature, bw);
177                }
178
179                bw.close();
180        }
181
182        private static void writeLine(Feature f, BufferedWriter bw) throws IOException {
183                String s = f.seqname() + '\t';
184                s += f.source() + '\t';
185                s += f.type() + '\t';
186
187                s += f.location().bioStart() + "\t";
188                s += f.location().bioEnd() + "\t";
189                s += Double.toString(f.score()) + "\t";
190                s += f.location().bioStrand() + "\t";
191
192                if (f.frame() == -1) {
193                        s += ".\t";
194                } else {
195                        s += f.frame() + "\t";
196                }
197
198                s += f.attributes();
199
200                bw.write(s);
201                bw.newLine();
202        }
203
204
205        public static void main(String[] args) throws Exception {
206
207                FeatureList listGenes = GeneIDGFF2Reader.read("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/genemark_hmm.gtf");
208
209                for(FeatureI feature : listGenes){
210                        logger.info("Gene Feature: {}", feature);
211                }
212//        logger.info(listGenes);
213                //      GeneMarkGTF.write( list, args[1] );
214        }
215}