/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.genome.parsers.gff;

import java.util.HashMap;


/**
 * A Feature corresponds to a single row in a GFF file.
 * <p>
 * The raw attribute column is kept verbatim (see {@link #attributes()}) and is
 * additionally parsed once, at construction time, into a key/value map
 * (see {@link #getAttribute(String)} and {@link #getAttributes()}).
 *
 * @author Hanno Hinsch
 */
public class Feature implements FeatureI {

    private Location mLocation;
    private String mSeqname;
    private String mSource;
    private String mType;
    private double mScore;
    private int mFrame;          // 0, 1 or 2; -1 is written as "." by toString()
    private String mAttributes;  // raw attribute column, never null after construction
    private HashMap<String, String> mUserMap;

    // Parsed view of mAttributes, filled by initAttributeHashMap().
    HashMap<String, String> attributeHashMap = new HashMap<>();

    /**
     * Get the sequence name. (GFF field 1). Note that feature objects have
     * no link or reference to the actual sequence object to which
     * they refer; they are completely uncoupled.
     *
     * @return Sequence name.
     */
    @Override
    public String seqname() {
        return mSeqname;
    }

    /**
     * Get source (aka method). (GFF field 2). This is often the name of
     * the program or procedure that created the features.
     *
     * @return Source field.
     */
    public String source() {
        return mSource;
    }

    /**
     * Get feature type, such as "exon" or "CDS". (GFF field 3).
     *
     * @return Feature type.
     */
    @Override
    public String type() {
        return mType;
    }

    /**
     * Get location of feature. Note that feature objects have
     * no link or reference to the actual sequence object to which
     * they refer; they are completely uncoupled.
     *
     * @return Location of feature.
     */
    @Override
    public Location location() {
        return mLocation;
    }

    /**
     * Get score. (GFF field 6). The meaning of the score varies from file to file.
     *
     * @return Score value.
     */
    public double score() {
        return mScore;
    }

    /**
     * Get frame (aka phase). (GFF field 8). Specifies the offset of the
     * first nucleotide of the first in-frame codon, assuming this feature
     * is a dna/rna sequence that codes for a protein. If you
     * intend to use this field, you probably want to look it up on the web first.
     *
     * @return The frame (0, 1, 2). A value of -1 is rendered as "." by {@link #toString()}.
     */
    public int frame() {
        return mFrame;
    }

    /**
     * Get the string of key/value attributes. (GFF field 9). The format and
     * meaning of this field varies from flavor to flavor of GFF/GTF. This method
     * simply returns the whole string. Other methods in this class make assumptions
     * about its format and provide additional utility.
     *
     * @return The attribute string; empty (never null) if none was supplied.
     */
    public String attributes() {
        return mAttributes;
    }

    // Not available to callers: a Feature must be built from data or from another Feature.
    @SuppressWarnings("unused")
    private Feature() {
    }

    /**
     * Make a copy of the specified feature. The mappings in the userData() HashMap
     * are copied, so each feature has independent user data. Note, however, that the
     * actual objects in the HashMap are shared (not copied), so a change to such an object may
     * affect multiple features. The attribute map is rebuilt from the copied attribute string.
     *
     * @param feature Feature to clone. Must not be null.
     */
    public Feature(Feature feature) {
        mSeqname = feature.mSeqname;
        mSource = feature.mSource;
        mType = feature.mType;
        mLocation = feature.mLocation;
        mScore = feature.mScore;
        mFrame = feature.mFrame;
        // The source's own private no-arg constructor leaves this null; normalize it here too.
        mAttributes = (feature.mAttributes == null) ? "" : feature.mAttributes;
        initAttributeHashMap();
        mUserMap = new HashMap<>(feature.mUserMap);
    }

    /**
     * Construct a new Feature from raw data (usually a GFF row).
     *
     * @param seqname The sequence name field (field 1).
     * @param source The source or method field (field 2).
     * @param type The type of feature field (field 3).
     * @param location The location of the feature. (calculated from GFF start, end and strand fields).
     * @param score The score field (field 6). Must not be null, because it is unboxed
     *              into a primitive double.
     * @param frame The frame or phase field (field 8).
     * @param attributes A string of key/value pairs separated by semicolons (field 9).
     *                   A null value is treated as an empty attribute string.
     */
    public Feature(String seqname, String source, String type, Location location, Double score, int frame, String attributes) {
        mSeqname = seqname;
        mSource = source;
        mType = type;
        mLocation = location;
        mScore = score;
        mFrame = frame;
        // Normalize null so that parsing, group() and getAttributeOld() never dereference null.
        mAttributes = (attributes == null) ? "" : attributes;
        initAttributeHashMap();
        mUserMap = new HashMap<>();
    }

    /**
     * Get HashMap of user data. Each Feature object has a Java HashMap object
     * which can be used to annotate the Feature. This class does not use or interpret
     * the keys or values; both are strings.
     * <br><br>
     * If a Feature is constructed from data fields, the initial HashMap has no mappings (is empty).
     * If a Feature is constructed from another Feature, a copy of the mappings is made.
     * Thus removing or adding a mapping to one Feature will not affect the other.
     *
     * @return The user HashMap (the live map, not a copy).
     */
    @Override
    public HashMap<String, String> userData() {
        return mUserMap;
    }

    /**
     * Parse the raw attribute string into {@link #attributeHashMap}.
     * <p>
     * The string is split on semicolons. Within each chunk the key is separated from
     * the value by the first '=' (GFF3 style) or, if the chunk contains no '=', by the
     * first space (GTF style). Everything after that first separator is the value, so
     * values that themselves contain '=' or spaces are kept whole. Double quotes are
     * removed from the value and both key and value are trimmed. A chunk without any
     * separator is stored as a key with an empty value. Empty chunks are skipped.
     */
    private void initAttributeHashMap() {
        for (String chunk : mAttributes.split(";")) {
            String attribute = chunk.trim();
            if (attribute.isEmpty()) {
                // e.g. empty attribute column or "a=b; ;c=d": nothing to record
                continue;
            }

            // gff3 uses '=' between key and value, gtf uses a space
            int separator = attribute.indexOf('=');
            if (separator == -1) {
                separator = attribute.indexOf(' ');
            }

            String key;
            String value;
            if (separator == -1) {
                // an attribute may have no value at all
                key = attribute;
                value = "";
            } else {
                key = attribute.substring(0, separator).trim();
                value = attribute.substring(separator + 1).replace("\"", "").trim();
            }
            attributeHashMap.put(key, value);
        }
    }

    /**
     * Get value of specified attribute key. Returns null if the attribute key has no value (does not exist).
     * Keys are case-sensitive. Assumes attributes are correctly formatted in GFF style.
     * Known bug: a semicolon within a quoted value will cause parse failure.
     *
     * @param key The key.
     * @return The corresponding value. Null if the key has no value defined.
     */
    @Override
    public String getAttribute(String key) {
        return attributeHashMap.get(key);
    }

    /**
     * Legacy attribute lookup that scans the raw attribute string instead of the parsed map.
     * <p>
     * It only recognizes pairs written as <code>key value;</code> (key and value separated by
     * a space), expects consecutive pairs to be separated by a semicolon followed by exactly
     * one character, and only examines pairs that are terminated by a semicolon. Surrounding
     * double quotes are stripped from the returned value.
     *
     * @param key The key.
     * @return The corresponding value, or null if no matching pair is found.
     */
    public String getAttributeOld(String key) {
        int start = 0;

        int end = mAttributes.indexOf(';');
        while (0 < end) {
            // find the first word (up to space) in chunk,
            // see if it is this key
            int i = mAttributes.indexOf(' ', start);
            if (0 < i && i < end) {
                if (mAttributes.substring(start, i).equals(key)) {
                    // remove quotes, if needed
                    if (mAttributes.charAt(i + 1) == '\"' && mAttributes.charAt(end - 1) == '\"') {
                        return mAttributes.substring(i + 2, end - 1);
                    } else {
                        return mAttributes.substring(i + 1, end);
                    }
                }
            }
            start = end + 2; // skip required semicolon and single space
            end = mAttributes.indexOf(';', start);
        }

        return null;
    }

    /**
     * Check whether the parsed attributes contain the given key.
     *
     * @param key The key (case-sensitive).
     * @return true if the key is present, even with an empty value.
     */
    @Override
    public boolean hasAttribute(String key) {
        return attributeHashMap.containsKey(key);
    }

    /**
     * Check whether the parsed attributes map the given key to exactly the given value.
     *
     * @param key The key (case-sensitive).
     * @param value The expected value.
     * @return true if the key is present and its value equals <code>value</code>.
     */
    @Override
    public boolean hasAttribute(String key, String value) {
        String data = getAttribute(key);
        return data != null && data.equals(value);
    }

    /**
     * Get the first item (everything before first semicolon, if it has one)
     * in the attribute field, which is assumed to
     * be a group identifier. This is appropriate for GFF1 files and variants. It is not
     * appropriate for GTF and GFF2 files, although they may use a named attribute key,
     * such as "gene_id" or "transcript_id", for grouping.
     *
     * @return The group id. Everything before the first semicolon in the attributes string
     *         (with surrounding whitespace trimmed).
     */
    @Override
    public String group() {
        int i = mAttributes.indexOf(';');
        return (i < 0) ? mAttributes.trim() : mAttributes.substring(0, i).trim();
    }

    /**
     * Render this feature as a tab-separated row: sequence name, source, type,
     * location start, location end, score, frame ("." when the frame is -1) and the
     * raw attribute string, in that order.
     *
     * @return The tab-separated representation.
     */
    @Override
    public String toString() {
        StringBuilder row = new StringBuilder();
        row.append(mSeqname).append('\t');
        row.append(mSource).append('\t');
        row.append(mType).append('\t');
        row.append(mLocation.start()).append('\t');
        row.append(mLocation.end()).append('\t');
        row.append(Double.toString(mScore)).append('\t');

        if (mFrame == -1) {
            row.append(".\t");
        } else {
            row.append(mFrame).append('\t');
        }

        row.append(mAttributes);

        return row.toString();
    }

    /**
     * Get the parsed attribute key/value pairs.
     *
     * @return The attribute HashMap (the live map, not a copy).
     */
    @Override
    public HashMap<String, String> getAttributes() {
        return attributeHashMap;
    }
}