/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.genome.parsers.gff;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;


/**
 * http://www.bioperl.org/wiki/GTF
 * Read and write FeatureLists as GFF/GTF formatted files.
 *<br><br>
 * The GFF moniker is applied to a variety of tab-delimited formats
 * that mock the notion of a standard. This class should parse most files
 * bearing at least a passing resemblance to any of the formats. You will, however, need
 * to research the semantics of the files you encounter. Generally,
 * the format consists of 9 tab-delimited fields:
 * <br>
 * <pre>
 * seqname   source   featureType   start   end   score   strand   frame   attributes
 * </pre>
 * The 9th field consists of key-value pairs separated by semicolons, the first of which JavaGene interprets
 * as the group id (as used in GFF1). It is the precise meaning of this 9th field that
 * varies from week to week. The Feature and FeatureList objects provide various utility methods to
 * ease the task of accessing and using the attributes. The proper interpretation of any
 * particular attribute, however, is left to you.
 *
 * @author Hanno Hinsch
 */
public class GFF3Reader {

    private static final Logger logger = LoggerFactory.getLogger(GFF3Reader.class);

    /** Field separator of a GFF record. */
    private static final Pattern p = Pattern.compile("\t");

    /** Directive that starts the embedded sequence section; matched case-insensitively. */
    private static final String FASTA_DIRECTIVE = "##FASTA";

    /** Fields that must be present on a record: everything up to and including the frame. */
    private static final int MIN_FIELDS = 8;

    /** Zero-based index of the (optional) attributes field. */
    private static final int ATTRIBUTES_FIELD = 8;

    /** Input used by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_INPUT = "/home/melo/workspace/release/stdout.combined.checked2.gtf";

    /**
     * Read a file into a FeatureList. Each non-blank, non-comment line of the file becomes one
     * Feature object. Lines starting with {@code #} are skipped, and reading stops at the
     * {@code ##FASTA} directive (in any letter case) so that embedded sequence data is never
     * handed to the record parser.
     *
     * @param filename The path to the GFF file.
     * @param indexes Passed to {@code FeatureList.addIndexes} before any feature is added.
     * @return A FeatureList.
     * @throws IOException Something went wrong -- check exception detail message.
     * @throws IllegalArgumentException if a record line is malformed; the message quotes the line.
     */
    public static FeatureList read(String filename, List<String> indexes) throws IOException {
        logger.info("Reading: {}", filename);

        FeatureList features = new FeatureList();
        features.addIndexes(indexes);

        // try-with-resources: the reader is closed even when a malformed line makes parseLine throw.
        try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
            for (String s = br.readLine(); s != null; s = br.readLine()) {
                s = s.trim();
                if (s.isEmpty()) {
                    continue;
                }

                if (s.charAt(0) == '#') {
                    // Comment or directive line. Only the FASTA directive matters: everything
                    // after it is sequence data, not features.
                    if (s.regionMatches(true, 0, FASTA_DIRECTIVE, 0, FASTA_DIRECTIVE.length())) {
                        break;
                    }
                    continue;
                }

                FeatureI f = parseLine(s);
                if (f != null) {
                    features.add(f);
                }
            }
        }

        return features;
    }


    /**
     * Read a file into a FeatureList without requesting any indexes.
     *
     * @param filename The path to the GFF file.
     * @return A FeatureList.
     * @throws IOException Something went wrong -- check exception detail message.
     */
    public static FeatureList read(String filename) throws IOException {
        return read(filename, new ArrayList<String>(0));
    }


    /**
     * Create a Feature from one (already trimmed) record line of a GFF file.
     * <p>
     * An unparsable score is read as {@code 0.0} and an unparsable frame as {@code -1}
     * (both are commonly given as "."). The attributes field may be absent; it is then
     * treated as empty. Anything from the first {@code #} in the attributes field onwards
     * is dropped as a trailing comment.
     *
     * @param s the record line
     * @return the parsed Feature
     * @throws IllegalArgumentException if the line has fewer than eight tab-delimited fields,
     *         the strand field is empty, or start/end is not an integer
     *         (the latter as a {@link NumberFormatException})
     */
    private static Feature parseLine(String s) {
        String[] line = p.split(s);
        if (line.length < MIN_FIELDS) {
            throw new IllegalArgumentException("Expected at least " + MIN_FIELDS
                    + " tab-delimited fields but found " + line.length + " in GFF line: " + s);
        }

        String seqname = line[0].trim();
        String source = line[1].trim();
        String type = line[2].trim();

        Double score;
        try {
            score = Double.parseDouble(line[5].trim());
        } catch (NumberFormatException ignored) {
            // no usable score on this record
            score = 0.0;
        }

        String strandField = line[6].trim();
        if (strandField.isEmpty()) {
            throw new IllegalArgumentException("Missing strand (field 7) in GFF line: " + s);
        }
        char strand = strandField.charAt(0);

        //added by scooter willis to deal with glimmer predictions that
        //have the start after the end but is a negative strand
        int locationStart = parseCoordinate(line[3].trim(), "start", s);
        int locationEnd = parseCoordinate(line[4].trim(), "end", s);
        if (locationStart > locationEnd) {
            int temp = locationStart;
            locationStart = locationEnd;
            locationEnd = temp;
        }
        Location location = Location.fromBio(locationStart, locationEnd, strand);

        assert (strand == '-') == location.isNegative();

        int frame;
        try {
            frame = Integer.parseInt(line[7].trim());
        } catch (NumberFormatException ignored) {
            // no usable frame on this record
            frame = -1;
        }

        // The caller trims the line and Pattern.split drops trailing empty fields, so a record
        // whose attributes column is empty or missing arrives here with only eight fields.
        String attributes = line.length > ATTRIBUTES_FIELD ? line[ATTRIBUTES_FIELD] : "";

        // Keep everything before the first '#'. Unlike split("#")[0], this also copes with a
        // field made up solely of '#' characters, for which split returns an empty array.
        int commentStart = attributes.indexOf('#');
        if (commentStart >= 0) {
            attributes = attributes.substring(0, commentStart);
        }

        return new Feature(seqname, source, type, location, score, frame, attributes);
    }


    /**
     * Parse a start/end coordinate, reporting the offending field and line on failure.
     *
     * @param text the trimmed field content
     * @param fieldName name of the field used in the error message
     * @param rawLine the complete record, quoted in the error message
     * @return the coordinate as an int
     * @throws NumberFormatException if {@code text} is not a valid integer
     */
    private static int parseCoordinate(String text, String fieldName, String rawLine) {
        try {
            return Integer.parseInt(text);
        } catch (NumberFormatException e) {
            NumberFormatException detailed = new NumberFormatException(
                    "Invalid " + fieldName + " coordinate '" + text + "' in GFF line: " + rawLine);
            detailed.initCause(e);
            throw detailed;
        }
    }


    /**
     * Load a GFF/GTF file and log how long loading took, in milliseconds.
     *
     * @param args optional; {@code args[0]} is the file to load. Without arguments a
     *             built-in default path is used.
     * @throws Exception if the file cannot be read or parsed
     */
    public static void main(String[] args) throws Exception {
        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;

        long start = System.currentTimeMillis();
        @SuppressWarnings("unused")
        FeatureList listGenes = GFF3Reader.read(input);
        long stop = System.currentTimeMillis();
        logger.info("Loading = {}", stop - start);
    }
}