/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.genome.parsers.gff;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

/**
 * http://www.bioperl.org/wiki/GTF
 * Read and write FeatureLists as GFF/GTF formatted files.
 * (Only reading is currently enabled; the writer code in this class is commented out.)
 *<br><br>
 * The GFF moniker is applied to a variety of tab-delimited formats
 * that mock the notion of a standard. This class should parse most files
 * bearing at least a passing resemblance to any of the formats. You will, however, need
 * to research the semantics of the files you encounter. Generally,
 * the format consists of 9 tab-delimited fields:
 * <br>
 * <pre>
 * seqname   source   featureType   start   end   score   strand   frame   attributes
 * </pre>
 * The 9th field consists of key-value pairs separated by semicolons, the first of which JavaGene interprets
 * as the group id (as used in GFF1). It is the precise meaning of this 9th field that
 * varies from week to week. The Feature and FeatureList objects provide various utility methods to
 * ease the task of accessing and using the attributes. The proper interpretation of any
 * particular attribute, however, is left to you.
 *
 * @author Hanno Hinsch
 */
public class GeneMarkGTFReader {

    private static final Logger logger = LoggerFactory.getLogger(GeneMarkGTFReader.class);

    /** Number of leading columns (seqname through frame) every feature line must provide. */
    private static final int REQUIRED_FIELDS = 8;

    /** The required columns plus the free-form attributes column. */
    private static final int MAX_FIELDS = 9;

    /** Input used by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_INPUT =
            "/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/genemark_hmm.gtf";

    /**
     * Read a file into a FeatureList. Each non-blank line of the file that does not
     * start with {@code #} becomes one Feature object; lines are trimmed before parsing.
     *
     * @param filename The path to the GFF file.
     * @return A FeatureList.
     * @throws IOException Something went wrong -- check exception detail message.
     * @throws IllegalArgumentException if a feature line has fewer than 8 tab-delimited
     *         fields, an empty strand field, or start/end coordinates that are not
     *         integers (reported as {@link NumberFormatException}).
     */
    public static FeatureList read(String filename) throws IOException {
        logger.info("Reading: {}", filename);

        FeatureList features = new FeatureList();
        try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
            for (String raw = br.readLine(); raw != null; raw = br.readLine()) {
                String line = raw.trim();

                // skip blank lines and comment lines
                if (line.isEmpty() || line.charAt(0) == '#') {
                    continue;
                }

                FeatureI feature = parseLine(line);
                features.add(feature);
            }
        }
        return features;
    }

    /**
     * Create a Feature from one line of a GFF/GTF file.
     * <p>
     * The line is split on tabs into at most nine columns, so any tab appearing after
     * the eighth one stays inside the attributes column. The attributes column is
     * optional; when it is absent the Feature is created with an empty attributes string.
     * Anything from the first {@code #} in the attributes column onwards is dropped.
     *
     * @param s a trimmed, non-empty, non-comment line
     * @return the parsed Feature (never null)
     * @throws IllegalArgumentException if the line has fewer than 8 fields or an empty strand field
     * @throws NumberFormatException if the start or end column is not an integer
     */
    private static Feature parseLine(String s) {
        String[] fields = s.split("\t", MAX_FIELDS);
        if (fields.length < REQUIRED_FIELDS) {
            throw new IllegalArgumentException("Malformed GFF/GTF line: expected at least "
                    + REQUIRED_FIELDS + " tab-delimited fields but found " + fields.length
                    + ": " + s);
        }

        String seqname = fields[0].trim();
        String source = fields[1].trim();
        String type = fields[2];

        // The strand is the last character of the 7th column.
        String strandField = fields[6];
        if (strandField.isEmpty()) {
            throw new IllegalArgumentException("Malformed GFF/GTF line: empty strand field: " + s);
        }
        char strand = strandField.charAt(strandField.length() - 1);

        Location location = Location.fromBio(
                Integer.parseInt(fields[3]), Integer.parseInt(fields[4]), strand);

        assert (strand == '-') == location.isNegative();

        Double score = parseScore(fields[5]);
        int frame = parseFrame(fields[7]);

        // grab everything until end of line (or # comment)
        String attributes = fields.length > REQUIRED_FIELDS ? fields[REQUIRED_FIELDS] : "";
        int commentStart = attributes.indexOf('#');
        if (commentStart >= 0) {
            attributes = attributes.substring(0, commentStart);
        }

        return new Feature(seqname, source, type, location, score, frame, attributes);
    }

    /** Parse the score column; a non-numeric value such as "." yields 0.0. */
    private static Double parseScore(String field) {
        try {
            return Double.parseDouble(field);
        } catch (NumberFormatException ignored) {
            // a score that is not a number is treated as 0.0
            return 0.0;
        }
    }

    /** Parse the frame column; a non-integer value such as "." yields -1. */
    private static int parseFrame(String field) {
        try {
            return Integer.parseInt(field);
        } catch (NumberFormatException ignored) {
            // a frame that is not an integer is recorded as -1
            return -1;
        }
    }

/*
    Writer support, currently disabled.

    public static void write(FeatureList features, String filename) throws IOException {
        logger.info("Writing: {}", filename);

        BufferedWriter bw = new BufferedWriter(new FileWriter(filename));

        ListIterator iter = features.listIterator();
        while (iter.hasNext()) {
            Feature feature = (Feature) iter.next();
            writeLine(feature, bw);
        }

        bw.close();
    }

    private static void writeLine(Feature f, BufferedWriter bw) throws IOException {
        String s = f.seqname() + '\t';
        s += f.source() + '\t';
        s += f.type() + '\t';

        s += f.location().bioStart() + "\t";
        s += f.location().bioEnd() + "\t";
        s += Double.toString(f.score()) + "\t";
        s += f.location().bioStrand() + "\t";

        if (f.frame() == -1) {
            s += ".\t";
        } else {
            s += f.frame() + "\t";
        }

        s += f.attributes();

        bw.write(s);
        bw.newLine();
    }
*/

    /**
     * Reads a GTF file and logs every feature found in it.
     *
     * @param args optional; {@code args[0]} is the path of the file to read. When no
     *             argument is given the built-in default path is used.
     * @throws Exception if the file cannot be read or parsed
     */
    public static void main(String[] args) throws Exception {
        String filename = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;

        FeatureList listGenes = GeneMarkGTFReader.read(filename);

        for (FeatureI feature : listGenes) {
            logger.info("Gene Feature: {}", feature);
        }
        // logger.info(listGenes);
        // GeneMarkGTFReader.write( list, args[1] );
    }
}