/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
/**
 *
 */
package org.biojava.nbio.core.sequence.io;

import org.biojava.nbio.core.sequence.Strand;
import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
import org.biojava.nbio.core.sequence.features.FeatureInterface;
import org.biojava.nbio.core.sequence.features.Qualifier;
import org.biojava.nbio.core.sequence.location.template.AbstractLocation;
import org.biojava.nbio.core.sequence.location.template.Location;
import org.biojava.nbio.core.sequence.location.template.Point;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.Compound;
import org.biojava.nbio.core.util.StringManipulationHelper;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Formatter;
import java.util.List;
import java.util.Locale;

/**
 * Shared text-layout helpers for writing INSDC style (GenBank/EMBL) feature
 * tables: feature key lines, wrapped location strings, wrapped qualifiers and
 * word-wrapped free text.
 * <p>
 * Every line produced here is terminated with the {@code %n} format specifier
 * rather than a literal newline, and qualifier text is {@code %}-escaped to
 * match. NOTE(review): this implies the assembled text is expanded by a
 * {@link Formatter} further downstream - confirm against the callers.
 *
 * @author mckeee1
 *
 */
public class GenericInsdcHeaderFormat<S extends AbstractSequence<C>, C extends Compound> {
    /** Maximum number of characters on one output line. */
    protected static final int MAX_WIDTH = 80;
    /** Number of leading columns before a location or qualifier starts. */
    protected static final int QUALIFIER_INDENT = 21;
    /** Continuation indent: exactly {@link #QUALIFIER_INDENT} (21) spaces. */
    protected static final String QUALIFIER_INDENT_STR = "                     ";
    /**
     * Feature key template: 5 spaces, the key, then 16 spaces of padding, so
     * that the formatted text is always at least {@link #QUALIFIER_INDENT}
     * characters long before it is cut to that width.
     */
    protected static final String QUALIFIER_INDENT_TMP = "     %s                ";
    /** Line terminator, kept as a format specifier (see the class comment). */
    private static final String lineSep = "%n";

    /**
     * Formats {@code template} with {@link Locale#US} and always releases the
     * formatter, even when formatting fails.
     *
     * @param template a {@link Formatter} format string
     * @param args the arguments referenced by the template
     * @return the formatted text
     */
    private static String formatUS(String template, Object... args) {
        Formatter formatter = new Formatter(new StringBuilder(), Locale.US);
        try {
            formatter.format(template, args);
            return formatter.toString();
        } finally {
            formatter.close();
        }
    }

    /**
     * Rejects a child whose strand is not negative while its parent is.
     *
     * @param parent strand of the parent feature (only used in the message)
     * @param child strand of the child location or feature
     * @throws RuntimeException if {@code child} is not {@link Strand#NEGATIVE}
     */
    private static void requireNegativeStrand(Strand parent, Strand child) {
        if (child != Strand.NEGATIVE) {
            throw new RuntimeException(
                    formatUS("Inconsistent strands: %s for parent, %s for child", parent, child));
        }
    }

    /**
     * Format a feature qualifier using the MAX_WIDTH (default 80), wrapping
     * onto indented continuation lines when it does not fit.
     *
     * @param key the qualifier name, written after a leading {@code /}
     * @param value the qualifier value, or {@code null} for a flag-style
     *        qualifier that is written as {@code /key} only
     * @param quote whether the value is wrapped in double quotes
     * @return one or more lines, each terminated by the line separator
     */
    private String _write_feature_qualifier(String key, String value, boolean quote) {
        String prefix = QUALIFIER_INDENT_STR + "/" + key;
        if (null == value) {
            return prefix + lineSep;
        }
        // Numeric values are conventionally written unquoted, so callers are
        // expected to pass quote == false for them.
        String line = quote ? prefix + "=\"" + value + "\"" : prefix + "=" + value;
        if (line.length() <= MAX_WIDTH) {
            return line + lineSep;
        }

        StringBuilder wrapped = new StringBuilder();
        while (!"".equals(line.replaceAll("^\\s+", ""))) {
            if (line.length() <= MAX_WIDTH) {
                wrapped.append(line).append(lineSep);
                break;
            }
            // Look backwards from the width limit for a space to break on,
            // never searching into the indent itself.
            int index;
            for (index = Math.min(line.length() - 1, MAX_WIDTH); index > QUALIFIER_INDENT; index--) {
                if (' ' == line.charAt(index)) {
                    break;
                }
            }
            if (' ' != line.charAt(index)) {
                // No nice place to break: cut hard at the width limit.
                index = MAX_WIDTH;
            }
            assert index <= MAX_WIDTH;
            wrapped.append(line, 0, index).append(lineSep);
            // Re-indent the remainder; its leading whitespace is dropped so
            // the continuation always starts right after the indent.
            line = QUALIFIER_INDENT_STR + line.substring(index).replaceAll("^\\s+", "");
        }
        return wrapped.toString();
    }

    /**
     * Split a feature location into lines (break at commas).
     * <p>
     * Each chunk is at most {@code MAX_WIDTH - QUALIFIER_INDENT} characters
     * and ends with a comma; a remainder that contains no comma within that
     * width is emitted unbroken.
     *
     * @param location the complete location string
     * @return the location with line separators and continuation indents
     *         inserted
     */
    private String _wrap_location(String location) {
        int length = MAX_WIDTH - QUALIFIER_INDENT;
        StringBuilder wrapped = new StringBuilder();
        String rest = location;
        while (rest.length() > length) {
            // Last comma inside the first `length` characters.
            int index = rest.lastIndexOf(',', length - 1);
            if (-1 == index) {
                // No good place to split (!)
                break;
            }
            wrapped.append(rest, 0, index + 1).append(lineSep).append(QUALIFIER_INDENT_STR);
            rest = rest.substring(index + 1);
        }
        return wrapped.append(rest).toString();
    }

    /**
     * Write a single SeqFeature object to features table.
     * <p>
     * The first line holds the feature key (spaces replaced by underscores,
     * cut to the key column) followed by the wrapped location; one entry per
     * qualifier follows. {@code %} in qualifier names and values is doubled
     * because the result still contains format specifiers.
     *
     * @param feature the feature to write
     * @param record_length the length of the record the feature belongs to
     * @return the formatted feature block
     */
    protected String _write_feature(FeatureInterface<AbstractSequence<C>, C> feature, int record_length) {
        String location = _insdc_feature_location_string(feature, record_length);
        String f_type = feature.getType().replace(" ", "_");
        String keyColumn = formatUS(QUALIFIER_INDENT_TMP, f_type).substring(0, QUALIFIER_INDENT);

        StringBuilder out = new StringBuilder(keyColumn);
        out.append(_wrap_location(location)).append(lineSep);

        //Now the qualifiers...
        for (List<Qualifier> qualifiers : feature.getQualifiers().values()) {
            for (Qualifier q : qualifiers) {
                String name = q.getName().replace("%", "%%");
                String value;
                if (q instanceof DBReferenceInfo) {
                    DBReferenceInfo db = (DBReferenceInfo) q;
                    value = db.getDatabase().replace("%", "%%") + ":" + db.getId().replace("%", "%%");
                } else {
                    // A qualifier without a value is written as a bare
                    // "/name" flag, so null must reach the writer untouched.
                    String raw = q.getValue();
                    value = (raw == null) ? null : raw.replace("%", "%%");
                }
                out.append(_write_feature_qualifier(name, value, q.needsQuotes()));
            }
        }
        return out.toString();
    }

    /**
     * Build a GenBank/EMBL location string from a SeqFeature (PRIVATE).
     * <p>
     * There is a choice of how to show joins on the reverse complement strand,
     * GenBank used "complement(join(1,10),(20,100))" while EMBL used to use
     * "join(complement(20,100),complement(1,10))" instead (but appears to have
     * now adopted the GenBank convention). Notice that the order of the entries
     * is reversed! This function therefore uses the first form. In this
     * situation we expect the parent feature and the two children to all be
     * marked as negative strand.
     * <p>
     * Also need to consider dual-strand examples like these from the
     * Arabidopsis thaliana chloroplast NC_000932:
     * join(complement(69611..69724),139856..140650) gene ArthCp047,
     * GeneID:844801 or its CDS (protein NP_051038.1 GI:7525057) which is
     * further complicated by a splice:
     * join(complement(69611..69724),139856..140087,140625..140650)
     * <p>
     * For this mixed strand feature, the parent SeqFeature should have no
     * strand while the child features should have either strand as
     * appropriate, and be listed in the order given here. Note that the
     * per-part strand is not written by this method: parts are rendered
     * ignoring their strand.
     *
     * @param feature the feature whose location is rendered
     * @param record_length the length of the record the feature belongs to
     * @return the location string, e.g. {@code complement(join(1..10,20..100))}
     * @throws RuntimeException if the feature is on the negative strand but one
     *         of its sub-locations or child features is not
     */
    private String _insdc_feature_location_string(FeatureInterface<AbstractSequence<C>, C> feature, int record_length) {
        AbstractLocation parent = feature.getLocations();
        boolean negative = parent.getStrand() == Strand.NEGATIVE;
        // Kept as ArrayList because that is the type the original handed to
        // StringManipulationHelper.join.
        ArrayList<String> parts = new ArrayList<String>();

        if (feature.getChildrenFeatures().isEmpty()) {
            if (parent.getSubLocations().isEmpty()) {
                //Non-recursive.
                String simple = _insdc_location_string_ignoring_strand_and_subfeatures(parent, record_length);
                return negative ? formatUS("complement(%s)", simple) : simple;
            }
            // As noted above, treat reverse complement strand features
            // carefully: validate every part before rendering any of them.
            if (negative) {
                for (Location sub : parent.getSubLocations()) {
                    requireNegativeStrand(parent.getStrand(), sub.getStrand());
                }
            }
            for (Location sub : parent.getSubLocations()) {
                parts.add(_insdc_location_string_ignoring_strand_and_subfeatures((AbstractLocation) sub, record_length));
            }
        } else {
            if (negative) {
                for (FeatureInterface<?, ?> child : feature.getChildrenFeatures()) {
                    requireNegativeStrand(parent.getStrand(), child.getLocations().getStrand());
                }
            }
            for (FeatureInterface<AbstractSequence<C>, C> child : feature.getChildrenFeatures()) {
                parts.add(_insdc_location_string_ignoring_strand_and_subfeatures(child.getLocations(), record_length));
            }
        }

        // Forward (and mixed) strand features are a plain join; negative
        // strand features wrap the whole join in a single complement().
        String joined = formatUS("%s(%s)", /*feature.location_operator*/ "join",
                StringManipulationHelper.join(parts, ","));
        return negative ? formatUS("complement(%s)", joined) : joined;
    }

    /**
     * Renders one location as INSDC text, ignoring its strand and any
     * sub-locations.
     *
     * @param sequenceLocation the location to render
     * @param record_length the length of the record, used for the
     *        between-position at the very end of a sequence
     * @return the location text, e.g. {@code 12..15}, {@code 12}, {@code 12^13}
     *         or {@code <1..15}
     * @throws RuntimeException if both the start and the end are unknown
     */
    private String _insdc_location_string_ignoring_strand_and_subfeatures(
            //SequenceLocation<AbstractSequence<C>, C> sequenceLocation,
            AbstractLocation sequenceLocation,
            int record_length) {
        // No cross-record reference prefix is supported; kept as a
        // placeholder so the format strings stay aligned with their origin.
        String ref = "";
        Point start = sequenceLocation.getStart();
        Point end = sequenceLocation.getEnd();
        boolean bothCertain = !start.isUncertain() && !end.isUncertain();

        // NOTE(review): this compares the two Point references, not their
        // positions, so it only matches when start and end are the same
        // object - confirm whether a position comparison was intended.
        if (bothCertain && start == end) {
            //Special case: a zero length slice, meaning the point between
            //two letters
            if (end.getPosition() == record_length) {
                //Very special case, for a between position at the end of a
                //sequence (used on some circular genomes, Bug 3098) we have
                //N:N so return N^1
                return formatUS("%s%d^1", ref, record_length);
            }
            return formatUS("%s%d^%d", ref, start.getPosition(), end.getPosition());
        }

        if (bothCertain && start.getPosition() + 1 == end.getPosition()) {
            //Special case: adjacent start/end is written as the end position
            //alone. NOTE(review): this rule appears to come from half-open
            //coordinates - confirm it matches this library's convention.
            return formatUS("%s%d", ref, end.getPosition());
        }

        if (start.isUnknown() || end.isUnknown()) {
            //Special case for features from SwissProt/UniProt files
            if (start.isUnknown() && end.isUnknown()) {
                throw new RuntimeException("Feature with unknown location");
            }
            if (start.isUnknown()) {
                //Treat the unknown start position as a BeforePosition
                return formatUS("%s<%d..%s", ref, end.getPosition(), _insdc_feature_position_string(end));
            }
            //Treat the unknown end position as an AfterPosition
            return formatUS("%s%s..>%d", ref, _insdc_feature_position_string(start), start.getPosition());
        }

        //Typical case: start..end, with partial markers where flagged
        String startText = _insdc_feature_position_string(start);
        String endText = _insdc_feature_position_string(end);
        if (sequenceLocation.isPartial()) {
            if (sequenceLocation.isPartialOn5prime()) {
                startText = "<" + startText;
            }
            if (sequenceLocation.isPartialOn3prime()) {
                endText = ">" + endText;
            }
        }
        return ref + startText + ".." + endText;
    }

    /**
     * Build a GenBank/EMBL position string with no offset applied.
     *
     * @param location the point to render
     * @return the position as text
     */
    private String _insdc_feature_position_string(Point location) {
        return _insdc_feature_position_string(location, 0);
    }

    /**
     * Build a GenBank/EMBL position string (PRIVATE).
     * <p>
     * Only the plain numeric position is rendered; fuzzy forms such as
     * within, between or one-of positions are not produced here.
     *
     * @param location the point to render
     * @param increment offset added to the point's position
     * @return the shifted position as text
     */
    private String _insdc_feature_position_string(Point location, int increment) {
        return formatUS("%s", location.getPosition() + increment);
    }

    /**
     * Returns a list of strings, word-wrapping {@code text} to
     * {@code max_len} characters per entry.
     * <p>
     * Any single words which are too long get returned as a whole line
     * (e.g. URLs) without an exception or warning.
     *
     * @param text the text to wrap; surrounding whitespace is ignored
     * @param max_len the maximum length of each returned line
     * @return the wrapped lines, in order
     */
    protected ArrayList<String> _split_multi_line(String text, int max_len) {
        ArrayList<String> output = new ArrayList<String>();
        text = text.trim();
        if (text.length() <= max_len) {
            output.add(text);
            return output;
        }

        ArrayList<String> words = new ArrayList<String>();
        Collections.addAll(words, text.split("\\s+"));
        // Walk the words with a cursor instead of repeatedly removing the
        // head of the list, which is linear per removal.
        int next = 0;
        while (next < words.size()) {
            String current = words.get(next++);
            while (next < words.size() && (current.length() + 1 + words.get(next).length()) <= max_len) {
                current = (current + " " + words.get(next++)).trim();
            }
            output.add(current);
        }
        assert next == words.size();
        return output;
    }


}