/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.core.sequence.io;

import org.biojava.nbio.core.sequence.Strand;
import org.biojava.nbio.core.sequence.features.FeatureInterface;
import org.biojava.nbio.core.sequence.features.Qualifier;
import org.biojava.nbio.core.sequence.location.template.Point;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.Compound;
import org.biojava.nbio.core.util.StringManipulationHelper;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Formatter;
import java.util.List;
import java.util.Locale;

/**
 * Shared helpers for rendering the feature table of an INSDC style
 * (GenBank/EMBL) flat file: feature keys, location strings and qualifiers.
 * <p>
 * Every line produced here ends with the literal two characters {@code %n}
 * rather than a real line break, so the returned text is a format template;
 * presumably the caller expands it through a {@link Formatter} (not visible
 * in this file, confirm against the subclasses).
 * <p>
 * The logic appears to be a port of the Biopython INSDC writer, which is why
 * the method names use Python style underscores.
 *
 * @param <S> the sequence type the header is written for
 * @param <C> the compound type of that sequence
 * @author mckeee1
 */
public class GenericInsdcHeaderFormat<S extends AbstractSequence<C>, C extends Compound> {
    /** Maximum width of an output line. */
    protected static final int MAX_WIDTH = 80;
    /** Number of leading columns reserved for the feature key / qualifier indent. */
    protected static final int QUALIFIER_INDENT = 21;
    /** Exactly {@link #QUALIFIER_INDENT} spaces; prefix of qualifier and continuation lines. */
    protected static final String QUALIFIER_INDENT_STR = "                     ";
    /**
     * Template for the feature key column: five spaces, the key, then padding.
     * The formatted result is cut to {@link #QUALIFIER_INDENT} characters, so
     * the sixteen trailing spaces guarantee it is always long enough.
     */
    protected static final String QUALIFIER_INDENT_TMP = "     %s                ";
    /** Line terminator placeholder, expanded later by a formatter. */
    private static final String lineSep = "%n";

    /**
     * Formats {@code template} with {@link Locale#US}, releasing the formatter
     * even when formatting fails.
     */
    private static String formatUs(String template, Object... args) {
        Formatter formatter = new Formatter(new StringBuilder(), Locale.US);
        try {
            return formatter.format(template, args).toString();
        } finally {
            formatter.close();
        }
    }

    /**
     * Format a feature qualifier, wrapping it at {@link #MAX_WIDTH} columns.
     * Lines are broken at the last space that fits; when a line holds no
     * space after the indent it is cut hard at {@link #MAX_WIDTH}.
     *
     * @param key   qualifier name, written as {@code /key}
     * @param value qualifier value, or {@code null} for a flag qualifier
     *              that is written without {@code =value}
     * @param quote whether the value is wrapped in double quotes
     * @return one or more indented lines, each terminated by the line separator
     */
    private String _write_feature_qualifier(String key, String value, boolean quote) {
        if (null == value) {
            return QUALIFIER_INDENT_STR + "/" + key + lineSep;
        }
        String line = QUALIFIER_INDENT_STR + "/" + key + "="
                + (quote ? "\"" + value + "\"" : value);
        if (line.length() <= MAX_WIDTH) {
            return line + lineSep;
        }
        StringBuilder goodlines = new StringBuilder();
        // Keep going while anything other than whitespace is left to emit.
        while (!"".equals(line.replaceAll("^\\s+", ""))) {
            if (line.length() <= MAX_WIDTH) {
                goodlines.append(line).append(lineSep);
                break;
            }
            // Look backwards for a space to break at, never inside the indent.
            int index;
            for (index = Math.min(line.length() - 1, MAX_WIDTH); index > QUALIFIER_INDENT; index--) {
                if (' ' == line.charAt(index)) {
                    break;
                }
            }
            if (' ' != line.charAt(index)) {
                // No nice place to break, so cut at the width limit.
                index = MAX_WIDTH;
            }
            assert index <= MAX_WIDTH;
            goodlines.append(line.substring(0, index)).append(lineSep);
            line = QUALIFIER_INDENT_STR + line.substring(index).replaceAll("^\\s+", "");
        }
        return goodlines.toString();
    }

    /**
     * Split a feature location into lines, breaking after commas so that each
     * piece fits in the columns to the right of the qualifier indent.
     * Continuation lines are prefixed with {@link #QUALIFIER_INDENT_STR}.
     * A piece that is too long but has no comma within the available width is
     * left unwrapped.
     *
     * @param location the complete location string
     * @return the location, possibly with embedded line separators
     */
    private String _wrap_location(String location) {
        int length = MAX_WIDTH - QUALIFIER_INDENT;
        StringBuilder wrapped = new StringBuilder();
        String rest = location;
        while (rest.length() > length) {
            // Last comma among the first `length` characters: the break point
            // has to be measured from the start of the text being split.
            int index = rest.lastIndexOf(',', length - 1);
            if (-1 == index) {
                // No good place to split (!)
                break;
            }
            wrapped.append(rest, 0, index + 1).append(lineSep).append(QUALIFIER_INDENT_STR);
            rest = rest.substring(index + 1);
        }
        return wrapped.append(rest).toString();
    }

    /**
     * Write a single feature, with its location and qualifiers, as feature
     * table text.
     *
     * @param feature       the feature to render; spaces in its type are
     *                      replaced by underscores to form the feature key
     * @param record_length length of the record, used for locations at the
     *                      very end of the sequence
     * @return the key/location line followed by one block per qualifier
     */
    protected String _write_feature(FeatureInterface<AbstractSequence<C>, C> feature, int record_length) {
        String location = _insdc_feature_location_string(feature, record_length);
        String f_type = feature.getType().replace(" ", "_");
        // Cut the padded key to the fixed width so the location starts in a fixed column.
        String keyColumn = formatUs(QUALIFIER_INDENT_TMP, f_type).substring(0, QUALIFIER_INDENT);

        StringBuilder text = new StringBuilder(keyColumn)
                .append(_wrap_location(location))
                .append(lineSep);

        // Now the qualifiers...
        for (List<Qualifier> qualifiers : feature.getQualifiers().values()) {
            for (Qualifier q : qualifiers) {
                text.append(_write_feature_qualifier(q.getName(), q.getValue(), q.needsQuotes()));
            }
        }
        return text.toString();
    }

    /**
     * Build a GenBank/EMBL location string from a feature.
     * <p>
     * There is a choice of how to show joins on the reverse complement strand:
     * GenBank used "complement(join(1,10),(20,100))" while EMBL used to use
     * "join(complement(20,100),complement(1,10))" instead (but appears to have
     * now adopted the GenBank convention). Notice that the order of the entries
     * is reversed! This method therefore uses the first form, and expects the
     * parent feature and all of its children to be on the negative strand.
     * <p>
     * Dual-strand examples also exist, such as these from the Arabidopsis
     * thaliana chloroplast NC_000932: join(complement(69611..69724),139856..140650)
     * gene ArthCp047, GeneID:844801 or its CDS (protein NP_051038.1 GI:7525057)
     * which is further complicated by a splice:
     * join(complement(69611..69724),139856..140087,140625..140650)
     * <p>
     * For such a mixed strand feature the parent should not be on the negative
     * strand, and the children should be listed in the order given here. Note
     * that in that case the strand of each child is ignored by this method.
     *
     * @param feature       the feature whose location is rendered
     * @param record_length length of the record
     * @return the location string
     * @throws RuntimeException if the parent is on the negative strand but a
     *                          child is not
     */
    private String _insdc_feature_location_string(FeatureInterface<AbstractSequence<C>, C> feature, int record_length) {
        if (feature.getChildrenFeatures().isEmpty()) {
            // Non-recursive.
            String location = _insdc_location_string_ignoring_strand_and_subfeatures(feature.getLocations(), record_length);
            if (feature.getLocations().getStrand() == Strand.NEGATIVE) {
                location = formatUs("complement(%s)", location);
            }
            return location;
        }
        // As noted above, treat reverse complement strand features carefully:
        if (feature.getLocations().getStrand() == Strand.NEGATIVE) {
            for (FeatureInterface<?, ?> f : feature.getChildrenFeatures()) {
                if (f.getLocations().getStrand() != Strand.NEGATIVE) {
                    throw new RuntimeException(formatUs(
                            "Inconsistent strands: %s for parent, %s for child",
                            feature.getLocations().getStrand(), f.getLocations().getStrand()));
                }
            }
            return formatUs("complement(%s(%s))", /*feature.location_operator*/ "join",
                    joinChildLocations(feature, record_length));
        }
        // This covers typical forward strand features, and also an evil mixed strand:
        return formatUs("%s(%s)", /*feature.location_operator*/ "join",
                joinChildLocations(feature, record_length));
    }

    /**
     * Renders the location of every child feature, ignoring strand, and joins
     * the results with commas in child order.
     */
    private String joinChildLocations(FeatureInterface<AbstractSequence<C>, C> feature, int record_length) {
        ArrayList<String> locations = new ArrayList<String>();
        for (FeatureInterface<AbstractSequence<C>, C> f : feature.getChildrenFeatures()) {
            locations.add(_insdc_location_string_ignoring_strand_and_subfeatures(f.getLocations(), record_length));
        }
        return StringManipulationHelper.join(locations, ",");
    }

    /**
     * Render one location as text, without strand information and without
     * looking at sub-features.
     *
     * @param sequenceLocation the location to render
     * @param record_length    length of the record, used to detect a
     *                         "between" position at the end of the sequence
     * @return the location string, e.g. {@code 12..15}
     * @throws RuntimeException if both start and end are unknown
     */
    private String _insdc_location_string_ignoring_strand_and_subfeatures(
            //SequenceLocation<AbstractSequence<C>, C> sequenceLocation,
            org.biojava.nbio.core.sequence.location.template.AbstractLocation sequenceLocation,
            int record_length) {
        // Location references ("ref:") are not supported, so the prefix is always empty.
        String ref = "";
        // NOTE(review): the last operand compares the two Point references, not
        // their positions, so this branch is only taken when start and end are
        // the very same object. Confirm the intent before changing it, since
        // that would alter which locations are written in "between" notation.
        if (!sequenceLocation.getStart().isUncertain() && !sequenceLocation.getEnd().isUncertain()
                && sequenceLocation.getStart() == sequenceLocation.getEnd()) {
            // Special case: a zero length slice, meaning the point between two letters.
            if (sequenceLocation.getEnd().getPosition() == record_length) {
                // Very special case, for a between position at the end of a
                // sequence (used on some circular genomes, Bug 3098) we have
                // N:N so return N^1
                return formatUs("%s%d^1", ref, record_length);
            }
            return formatUs("%s%d^%d", ref,
                    sequenceLocation.getStart().getPosition(), sequenceLocation.getEnd().getPosition());
        }
        // NOTE(review): this special case is inherited from the Python original;
        // verify it matches the coordinate convention used by the locations here.
        if (!sequenceLocation.getStart().isUncertain() && !sequenceLocation.getEnd().isUncertain()
                && sequenceLocation.getStart().getPosition() + 1 == sequenceLocation.getEnd().getPosition()) {
            // Special case, for 11:12 return 12 rather than 12..12
            // (a length one slice, meaning a single letter)
            return formatUs("%s%d", ref, sequenceLocation.getEnd().getPosition());
        } else if (sequenceLocation.getStart().isUnknown() || sequenceLocation.getEnd().isUnknown()) {
            // Special case for features from SwissProt/UniProt files
            if (sequenceLocation.getStart().isUnknown() && sequenceLocation.getEnd().isUnknown()) {
                throw new RuntimeException("Feature with unknown location");
            } else if (sequenceLocation.getStart().isUnknown()) {
                // Treat the unknown start position as a BeforePosition
                return formatUs("%s<%d..%s", ref, sequenceLocation.getEnd().getPosition(),
                        _insdc_feature_position_string(sequenceLocation.getEnd()));
            } else {
                // Treat the unknown end position as an AfterPosition
                return formatUs("%s%s..>%d", ref,
                        _insdc_feature_position_string(sequenceLocation.getStart()),
                        sequenceLocation.getStart().getPosition());
            }
        } else {
            // Typical case, e.g. 12..15
            return ref + _insdc_feature_position_string(sequenceLocation.getStart(), 0)
                    + ".." + _insdc_feature_position_string(sequenceLocation.getEnd());
        }
    }

    /**
     * Build a GenBank/EMBL position string with no offset applied.
     *
     * @param location the position to render
     * @return the position as decimal text
     */
    private String _insdc_feature_position_string(Point location) {
        return _insdc_feature_position_string(location, 0);
    }

    /**
     * Build a GenBank/EMBL position string.
     * <p>
     * Only the plain numeric form is produced. The Python code this was
     * ported from also rendered fuzzy positions (within, between, before,
     * after, one-of); none of those are handled here.
     *
     * @param location  the position to render
     * @param increment offset added to the position before rendering
     * @return the shifted position as decimal text
     */
    private String _insdc_feature_position_string(Point location, int increment) {
        return Integer.toString(location.getPosition() + increment);
    }

    /**
     * Split text into lines of at most {@code max_len} characters, breaking
     * only at whitespace.
     * <p>
     * Any single words which are too long get returned as a whole line
     * (e.g. URLs) without an exception or warning.
     *
     * @param text    the text to split; leading and trailing whitespace is dropped
     * @param max_len the maximum line length
     * @return the lines, in order; a single element when the text already fits
     */
    protected ArrayList<String> _split_multi_line(String text, int max_len) {
        ArrayList<String> output = new ArrayList<String>();
        text = text.trim();
        if (text.length() <= max_len) {
            output.add(text);
            return output;
        }

        ArrayList<String> words = new ArrayList<String>();
        Collections.addAll(words, text.split("\\s+"));
        int next = 0;
        while (next < words.size()) {
            // Start a line with the next word, then greedily append words
            // for as long as the line, including the separating space, still fits.
            StringBuilder line = new StringBuilder(words.get(next++));
            while (next < words.size() && line.length() + 1 + words.get(next).length() <= max_len) {
                line.append(' ').append(words.get(next++));
            }
            output.add(line.toString());
        }
        return output;
    }

}