001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.genome.parsers.gff; 022 023import org.slf4j.Logger; 024import org.slf4j.LoggerFactory; 025 026import java.io.*; 027import java.util.ListIterator; 028 029/** 030 * http://www.bioperl.org/wiki/GTF 031 * Read and write FeatureLists as GFF/GTF formatted files. 032 *<br><br> 033 * The GFF moniker is applied to a variety of tab-delimited formats 034 * that mock the notion of a standard. This class should parse most files 035 * bearing at least a passing resemblance to any of the formats. You will, however, need 036 * to research the semantics of the files you encounter. Generally, 037 * the format consists of 9 tab-delimited fields: 038 * <br> 039 * <pre> 040 * seqname source featureType start end score strand frame attributes 041 * </pre> 042 * The 9th field consists of key-value pairs separated by semicolons, the first of which JavaGene interprets 043 * as the group id (as used in GFF1). It is the precise meaning of this 9th field that 044 * varies from week to week. The Feature and FeatureList objects provide various utility methods to 045 * ease the task of accessing and using the attributes. The proper interpretation of any 046 * particular attribute, however, is left to you. 047 * 048 * @author Hanno Hinsch 049 */ 050public class GeneIDGFF2Reader { 051 052 private static final Logger logger = LoggerFactory.getLogger(GeneIDGFF2Reader.class); 053 054 /** 055 * Read a file into a FeatureList. Each line of the file becomes one Feature object. 056 * 057 * @param filename The path to the GFF file. 058 * @return A FeatureList. 059 * @throws IOException Something went wrong -- check exception detail message. 060 */ 061 public static FeatureList read(String filename) throws IOException { 062 logger.info("Reading: {}", filename); 063 064 FeatureList features = new FeatureList(); 065 try (BufferedReader br = new BufferedReader(new FileReader(filename))) { 066 067 String s; 068 for (s = br.readLine(); null != s; s = br.readLine()) { 069 s = s.trim(); 070 071 if (s.length() > 0) { 072 if (s.charAt(0) == '#') { 073 //ignore comment lines 074 } else { 075 076 FeatureI f = parseLine(s); 077 if (f != null) { 078 features.add(f); 079 } 080 } 081 } 082 083 } 084 085 } 086 return features; 087 } 088 089 /** 090 * create Feature from line of GFF file 091 */ 092 private static Feature parseLine(String s) { 093 //FIXME update to use regex split on tabs 094 //FIXME better errors on parse failures 095 096 int start = 0; 097 int end = 0; 098 099 start = end; 100 end = s.indexOf('\t', start); 101 String seqname = s.substring(start, end).trim(); 102 103 start = end + 1; 104 end = s.indexOf('\t', start); 105 String source = s.substring(start, end).trim(); 106 107 start = end + 1; 108 end = s.indexOf('\t', start); 109 String type = s.substring(start, end); 110 111 start = end + 1; 112 end = s.indexOf('\t', start); 113 String locStart = s.substring(start, end); 114 115 start = end + 1; 116 end = s.indexOf('\t', start); 117 String locEnd = s.substring(start, end); 118 119 Double score; 120 start = end + 1; 121 end = s.indexOf('\t', start); 122 try { 123 score = Double.parseDouble(s.substring(start, end)); 124 } catch (Exception e) { 125 score = 0.0; 126 } 127 128 start = end + 1; 129 end = s.indexOf('\t', start); 130 char strand = s.charAt(end - 1); 131 132 Location location = Location.fromBio(Integer.parseInt(locStart), Integer.parseInt(locEnd), strand); 133 134 assert (strand == '-') == location.isNegative(); 135 136 int frame; 137 start = end + 1; 138 end = s.indexOf('\t', start); 139 try { 140 frame = Integer.parseInt(s.substring(start, end)); 141 } catch (Exception e) { 142 frame = -1; 143 } 144 145 //grab everything until end of line (or # comment) 146 start = end + 1; 147 end = s.indexOf('#', start); 148 String attributes = null; 149 if (end < 0) { 150 attributes = s.substring(start); 151 } else { 152 attributes = s.substring(start, end); 153 } 154 //need to add in attribute assignment for geneid where it just provides a gene name and will make it gtf like 155 attributes = "gene_id " + '"' + attributes + '"' + ";"; 156 return new Feature(seqname, source, type, location, score, frame, attributes); 157 158 } 159 160 /** 161 * Write features in FeatureList to file. Each Feature becomes one line in the file. 162 * The userMap() data in the features is not written to file. 163 * 164 * @param features The list of features to write. 165 * @param filename The path to the file. 166 * @throws IOException Something went wrong -- check exception detail message. 167 */ 168 public static void write(FeatureList features, String filename) throws IOException { 169 logger.info("Writing: {}", filename); 170 171 BufferedWriter bw = new BufferedWriter(new FileWriter(filename)); 172 173 ListIterator<FeatureI> iter = features.listIterator(); 174 while (iter.hasNext()) { 175 Feature feature = (Feature) iter.next(); 176 writeLine(feature, bw); 177 } 178 179 bw.close(); 180 } 181 182 private static void writeLine(Feature f, BufferedWriter bw) throws IOException { 183 String s = f.seqname() + '\t'; 184 s += f.source() + '\t'; 185 s += f.type() + '\t'; 186 187 s += f.location().bioStart() + "\t"; 188 s += f.location().bioEnd() + "\t"; 189 s += Double.toString(f.score()) + "\t"; 190 s += f.location().bioStrand() + "\t"; 191 192 if (f.frame() == -1) { 193 s += ".\t"; 194 } else { 195 s += f.frame() + "\t"; 196 } 197 198 s += f.attributes(); 199 200 bw.write(s); 201 bw.newLine(); 202 } 203 204 205 public static void main(String[] args) throws Exception { 206 207 FeatureList listGenes = GeneIDGFF2Reader.read("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/genemark_hmm.gtf"); 208 209 for(FeatureI feature : listGenes){ 210 logger.info("Gene Feature: {}", feature); 211 } 212// logger.info(listGenes); 213 // GeneMarkGTF.write( list, args[1] ); 214 } 215}