001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.genome.parsers.gff; 022 023import java.nio.file.Files; 024import java.nio.file.Path; 025import java.nio.file.Paths; 026import org.slf4j.Logger; 027import org.slf4j.LoggerFactory; 028 029import java.io.BufferedReader; 030import java.io.IOException; 031import java.util.ArrayList; 032import java.util.List; 033import java.util.regex.Pattern; 034 035 036/** 037 * http://www.bioperl.org/wiki/GTF 038 * Read and write FeatureLists as GFF/GTF formatted files. 039 *<br><br> 040 * The GFF moniker is applied to a variety of tab-delimited formats 041 * that mock the notion of a standard. This class should parse most files 042 * bearing at least a passing resemblance to any of the formats. You will, however, need 043 * to research the semantics of the files you encounter. Generally, 044 * the format consists of 9 tab-delimited fields: 045 * <br> 046 * <pre> 047 * seqname source featureType start end score strand frame attributes 048 * </pre> 049 * The 9th field consists of key-value pairs separated by semicolons, the first of which JavaGene interprets 050 * as the group id (as used in GFF1). It is the precise meaning of this 9th field that 051 * varies from week to week. The Feature and FeatureList objects provide various utility methods to 052 * ease the task of accessing and using the attributes. The proper interpretation of any 053 * particular attribute, however, is left to you. 054 * 055 * @author Hanno Hinsch 056 */ 057public class GFF3Reader { 058 059 private static final Logger logger = LoggerFactory.getLogger(GFF3Reader.class); 060 061 private static final Pattern p = Pattern.compile("\t"); 062 063 /** 064 * Read a file into a FeatureList. Each line of the file becomes one Feature object. 065 * 066 * @param filename The path to the GFF file. 067 * @return A FeatureList. 068 * @throws IOException Something went wrong -- check exception detail message. 069 */ 070 public static FeatureList read(String filename, List<String> indexes) throws IOException { 071 return read(Paths.get(filename), indexes); 072 } 073 074 /** 075 * Read a file into a FeatureList. Each line of the file becomes one Feature object. 076 * 077 * @param path The path to the GFF file. 078 * @return A FeatureList. 079 * @throws IOException Something went wrong -- check exception detail message. 080 */ 081 public static FeatureList read(Path path, List<String> indexes) throws IOException { 082 logger.info("Reading: {}", path.toString()); 083 084 FeatureList features = new FeatureList(); 085 features.addIndexes(indexes); 086 BufferedReader br = Files.newBufferedReader(path); 087 088 String s; 089 for (s = br.readLine(); null != s; s = br.readLine()) { 090 s = s.trim(); 091 092 if (s.length() > 0) { 093 if (s.charAt(0) == '#') { 094 //ignore comment lines 095 if(s.startsWith("##fasta")) 096 break; 097 } else { 098 099 FeatureI f = parseLine(s); 100 if (f != null) { 101 features.add(f); 102 103 } 104 } 105 } 106 107 } 108 109 br.close(); 110 return features; 111 } 112 113 114 public static FeatureList read(String filename) throws IOException { 115 return read(filename,new ArrayList<String>(0)); 116 } 117 118 public static FeatureList read(Path path) throws IOException { 119 return read(path,new ArrayList<String>(0)); 120 } 121 122 123 /** 124 * create Feature from line of GFF file 125 */ 126 private static Feature parseLine(String s) { 127 //FIXME update to use regex split on tabs 128 //FIXME better errors on parse failures 129 String[] line = p.split(s); 130 String seqname =line[0].trim(); 131 132 String source =line[1].trim(); 133 134 String type =line[2].trim(); 135 136 137 String locStart =line[3].trim(); 138 139 String locEnd =line[4].trim(); 140 141 Double score; 142 143 try { 144 score = Double.parseDouble(line[5].trim()); 145 } catch (Exception e) { 146 score = 0.0; 147 } 148 149 150 char strand = line[6].trim().charAt(0); 151 //added by scooter willis to deal with glimmer predictions that 152 //have the start after the end but is a negative strand 153 int locationStart = Integer.parseInt(locStart); 154 int locationEnd = Integer.parseInt(locEnd); 155 if(locationStart > locationEnd){ 156 int temp = locationStart; 157 locationStart = locationEnd; 158 locationEnd = temp; 159 160 } 161 Location location = Location.fromBio(locationStart, locationEnd, strand); 162 163 assert (strand == '-') == location.isNegative(); 164 165 int frame; 166 try { 167 frame = Integer.parseInt(line[7].trim()); 168 } catch (Exception e) { 169 frame = -1; 170 } 171 String attributes=line[8]; 172 /* //grab everything until end of line (or # comment) 173 start = end + 1; 174 end = s.indexOf('#', start); 175 String attributes = null; 176 if (end < 0) { 177 attributes = new String(s.substring(start)); 178 } else { 179 attributes = new String(s.substring(start, end)); 180 } 181 */ 182 return new Feature(seqname, source, type, location, score, frame, attributes.split("#")[0]); 183 184 } 185 186 187 188 189 public static void main(String[] args) throws Exception { 190 long start = System.currentTimeMillis(); 191 @SuppressWarnings("unused") 192 FeatureList listGenes = GFF3Reader.read("/home/melo/workspace/release/stdout.combined.checked2.gtf"); 193 long stop = System.currentTimeMillis(); 194 logger.info("Loading = {}", stop-start); 195// logger.info(listGenes); 196 // GeneMarkGTF.write( list, args[1] ); 197 } 198}