001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021/** 022 * 023 */ 024package org.biojava.nbio.core.sequence.io; 025 026import org.biojava.nbio.core.sequence.Strand; 027import org.biojava.nbio.core.sequence.features.FeatureInterface; 028import org.biojava.nbio.core.sequence.features.Qualifier; 029import org.biojava.nbio.core.sequence.location.template.AbstractLocation; 030import org.biojava.nbio.core.sequence.location.template.Point; 031import org.biojava.nbio.core.sequence.template.AbstractSequence; 032import org.biojava.nbio.core.sequence.template.Compound; 033import org.biojava.nbio.core.util.StringManipulationHelper; 034 035import java.util.ArrayList; 036import java.util.Collections; 037import java.util.Formatter; 038import java.util.List; 039import java.util.Locale; 040 041/** 042 * @author mckeee1 043 * 044 */ 045public class GenericInsdcHeaderFormat<S extends AbstractSequence<C>, C extends Compound> { 046 protected static final int MAX_WIDTH = 80; 047 protected static final int QUALIFIER_INDENT = 21; 048 protected static final String QUALIFIER_INDENT_STR = " "; 049 protected static final String QUALIFIER_INDENT_TMP = " %s "; 050 private static final String lineSep = "%n"; 051 052 /** 053 * Format a feature qualifier using the MAX_WIDTH (default 80) 054 * @param key 055 * @param value 056 * @param quote 057 */ 058 private String _write_feature_qualifier(String key, String value, boolean quote) { 059 String line = ""; 060 if(null == value) { 061 line = QUALIFIER_INDENT_STR + "/" + key + lineSep; 062 return line; 063 } 064 if(quote) { // quote should be true for numerics 065 line = QUALIFIER_INDENT_STR + "/" + key + "=\"" + value + "\""; 066 } else { 067 line = QUALIFIER_INDENT_STR + "/" + key + "=" + value; 068 } 069 if(line.length() <= MAX_WIDTH) { 070 return line + lineSep; 071 } 072 String goodlines = ""; 073 while(!"".equals(line.replaceAll("^\\s+", ""))) { 074 if(line.length() <= MAX_WIDTH) { 075 goodlines += line + lineSep; 076 break; 077 } 078 //Insert line break... 079 int index; 080 for(index = Math.min(line.length()-1, MAX_WIDTH); index > QUALIFIER_INDENT ; index--) { 081 if(' ' == line.charAt(index)) { 082 break; 083 } 084 } 085 if(' ' != line.charAt(index)) { 086 //no nice place to break... 087 index = MAX_WIDTH; 088 } 089 assert index <= MAX_WIDTH; 090 goodlines += line.substring(0,index) + lineSep; 091 line = QUALIFIER_INDENT_STR + line.substring(index).replaceAll("^\\s+", ""); 092 } 093 return goodlines; 094 } 095 /** 096 * Split a feature location into lines (break at commas). 097 * @param location 098 */ 099 private String _wrap_location(String location) { 100 int length = MAX_WIDTH - QUALIFIER_INDENT; 101 if(location.length() <= length) { 102 return location; 103 } 104 int index = location.substring(length).lastIndexOf(","); 105 if(-1 == index) { 106 //No good place to split (!) 107 return location; 108 } 109 return location.substring(0,index+1) + lineSep + QUALIFIER_INDENT_STR + _wrap_location(location.substring(index+1)); 110 } 111 /** 112 * Write a single SeqFeature object to features table. 113 * @param feature 114 * @param record_length 115 */ 116 protected String _write_feature(FeatureInterface<AbstractSequence<C>, C> feature, int record_length) { 117 String location = _insdc_feature_location_string(feature, record_length); 118 String f_type = feature.getType().replace(" ", "_"); 119 StringBuilder sb = new StringBuilder(); 120 Formatter formatter = new Formatter(sb,Locale.US); 121 formatter.format(QUALIFIER_INDENT_TMP, f_type); 122 String line = formatter.toString().substring(0, QUALIFIER_INDENT) + _wrap_location(location) + lineSep; 123 formatter.close(); 124 125 //Now the qualifiers... 126 for(List<Qualifier> qualifiers : feature.getQualifiers().values()) { 127 for(Qualifier q : qualifiers){ 128 line += _write_feature_qualifier(q.getName(), q.getValue(), q.needsQuotes()); 129 } 130 } 131 return line; 132 /* 133 self.handle.write(line) 134 #Now the qualifiers... 135 for key, values in feature.qualifiers.items(): 136 if isinstance(values, list) or isinstance(values, tuple): 137 for value in values: 138 self._write_feature_qualifier(key, value) 139 elif values: 140 #String, int, etc 141 self._write_feature_qualifier(key, values) 142 else: 143 #e.g. a /psuedo entry 144 self._write_feature_qualifier(key) 145 */ 146 } 147 /** 148 * Build a GenBank/EMBL location string from a SeqFeature (PRIVATE). 149 150 There is a choice of how to show joins on the reverse complement strand, 151 GenBank used "complement(join(1,10),(20,100))" while EMBL used to use 152 "join(complement(20,100),complement(1,10))" instead (but appears to have 153 now adopted the GenBank convention). Notice that the order of the entries 154 is reversed! This function therefore uses the first form. In this situation 155 we expect the parent feature and the two children to all be marked as 156 strand == -1, and in the order 0:10 then 19:100. 157 158 Also need to consider dual-strand examples like these from the Arabidopsis 159 thaliana chloroplast NC_000932: join(complement(69611..69724),139856..140650) 160 gene ArthCp047, GeneID:844801 or its CDS (protein NP_051038.1 GI:7525057) 161 which is further complicated by a splice: 162 join(complement(69611..69724),139856..140087,140625..140650) 163 164 For mixed this mixed strand feature, the parent SeqFeature should have 165 no strand (either 0 or None) while the child features should have either 166 strand +1 or -1 as appropriate, and be listed in the order given here. 167 * @param feature 168 * @param record_length 169 */ 170 private String _insdc_feature_location_string(FeatureInterface<AbstractSequence<C>, C> feature, int record_length) { 171 if(feature.getChildrenFeatures().isEmpty()) { 172 //Non-recursive. 173 String location = _insdc_location_string_ignoring_strand_and_subfeatures(feature.getLocations(), record_length); 174 if(feature.getLocations().getStrand() == Strand.NEGATIVE) { 175 StringBuilder sb = new StringBuilder(); 176 Formatter formatter = new Formatter(sb,Locale.US); 177 formatter.format("complement(%s)", location); 178 String output = formatter.toString(); 179 formatter.close(); 180 location = output; 181 } 182 return location; 183 } 184 // As noted above, treat reverse complement strand features carefully: 185 if(feature.getLocations().getStrand() == Strand.NEGATIVE) { 186 for(FeatureInterface<?, ?> f : feature.getChildrenFeatures()) { 187 if(f.getLocations().getStrand() != Strand.NEGATIVE) { 188 StringBuilder sb = new StringBuilder(); 189 Formatter formatter = new Formatter(sb,Locale.US); 190 formatter.format("Inconsistent strands: %s for parent, %s for child", feature.getLocations().getStrand(), f.getLocations().getStrand()); 191 String output = formatter.toString(); 192 formatter.close(); 193 throw new RuntimeException(output); 194 } 195 } 196 StringBuilder sb = new StringBuilder(); 197 Formatter formatter = new Formatter(sb,Locale.US); 198 ArrayList<String> locations = new ArrayList<String>(); 199 for(FeatureInterface<AbstractSequence<C>, C> f : feature.getChildrenFeatures()) { 200 locations.add(_insdc_location_string_ignoring_strand_and_subfeatures(f.getLocations(), record_length)); 201 } 202 String location = StringManipulationHelper.join(locations, ","); 203 formatter.format("complement(%s(%s))", /*feature.location_operator*/ "join", location); 204 String output = formatter.toString(); 205 formatter.close(); 206 return output; 207 } 208 //This covers typical forward strand features, and also an evil mixed strand: 209 StringBuilder sb = new StringBuilder(); 210 Formatter formatter = new Formatter(sb,Locale.US); 211 ArrayList<String> locations = new ArrayList<String>(); 212 for(FeatureInterface<AbstractSequence<C>, C> f : feature.getChildrenFeatures()) { 213 locations.add(_insdc_location_string_ignoring_strand_and_subfeatures(f.getLocations(), record_length)); 214 } 215 String location = StringManipulationHelper.join(locations, ","); 216 formatter.format("%s(%s)", /*feature.location_operator*/ "join", location); 217 String output = formatter.toString(); 218 formatter.close(); 219 return output; 220 } 221 222 private String _insdc_location_string_ignoring_strand_and_subfeatures( 223 //SequenceLocation<AbstractSequence<C>, C> sequenceLocation, 224 AbstractLocation sequenceLocation, 225 int record_length) { 226 /* 227 if location.ref: 228 ref = "%s:" % location.ref 229 else: 230 ref = "" 231 assert not location.ref_db 232 */ 233 String ref = ""; 234 if(!sequenceLocation.getStart().isUncertain() && !sequenceLocation.getEnd().isUncertain() && sequenceLocation.getStart() == sequenceLocation.getEnd()) { 235 //Special case, for 12:12 return 12^13 236 //(a zero length slice, meaning the point between two letters) 237 if(sequenceLocation.getEnd().getPosition() == record_length) { 238 //Very special case, for a between position at the end of a 239 //sequence (used on some circular genomes, Bug 3098) we have 240 //N:N so return N^1 241 StringBuilder sb = new StringBuilder(); 242 Formatter formatter = new Formatter(sb,Locale.US); 243 formatter.format("%s%d^1", ref, record_length); 244 String output = formatter.toString(); 245 formatter.close(); 246 return output; 247 } else { 248 StringBuilder sb = new StringBuilder(); 249 Formatter formatter = new Formatter(sb,Locale.US); 250 formatter.format("%s%d^%d", ref, sequenceLocation.getStart().getPosition(), sequenceLocation.getEnd().getPosition()); 251 String output = formatter.toString(); 252 formatter.close(); 253 return output; 254 } 255 } 256 if(!sequenceLocation.getStart().isUncertain() && !sequenceLocation.getEnd().isUncertain() && sequenceLocation.getStart().getPosition() + 1 == sequenceLocation.getEnd().getPosition()) { 257 //Special case, for 11:12 return 12 rather than 12..12 258 //(a length one slice, meaning a single letter) 259 StringBuilder sb = new StringBuilder(); 260 Formatter formatter = new Formatter(sb,Locale.US); 261 formatter.format("%s%d", ref, sequenceLocation.getEnd().getPosition()); 262 String output = formatter.toString(); 263 formatter.close(); 264 return output; 265 } else if(sequenceLocation.getStart().isUnknown() || sequenceLocation.getEnd().isUnknown()) { 266 //Special case for features from SwissProt/UniProt files 267 if(sequenceLocation.getStart().isUnknown() && sequenceLocation.getEnd().isUnknown()) { 268 throw new RuntimeException("Feature with unknown location"); 269 } else if(sequenceLocation.getStart().isUnknown()) { 270 //Treat the unknown start position as a BeforePosition 271 StringBuilder sb = new StringBuilder(); 272 Formatter formatter = new Formatter(sb,Locale.US); 273 formatter.format("%s<%d..%s", ref, sequenceLocation.getEnd().getPosition(), _insdc_feature_position_string(sequenceLocation.getEnd())); 274 String output = formatter.toString(); 275 formatter.close(); 276 return output; 277 } else { 278 //Treat the unknown start position as an AfterPosition 279 StringBuilder sb = new StringBuilder(); 280 Formatter formatter = new Formatter(sb,Locale.US); 281 formatter.format("%s%s..>%d", ref, _insdc_feature_position_string(sequenceLocation.getStart()), sequenceLocation.getStart().getPosition()); 282 String output = formatter.toString(); 283 formatter.close(); 284 return output; 285 } 286 } else { 287 //Typical case, e.g. 12..15 gets mapped to 11:15 288 return ref + _insdc_feature_position_string(sequenceLocation.getStart(), 0) + ".." + _insdc_feature_position_string(sequenceLocation.getEnd()); 289 } 290 } 291 private String _insdc_feature_position_string(Point location) { 292 // TODO Auto-generated method stub 293 return _insdc_feature_position_string(location, 0); 294 } 295 296 /** 297 * Build a GenBank/EMBL position string (PRIVATE). 298 * @param location 299 * @param increment 300 */ 301 private String _insdc_feature_position_string(Point location, int increment) { 302 StringBuilder sb = new StringBuilder(); 303 Formatter formatter = new Formatter(sb,Locale.US); 304 formatter.format("%s", location.getPosition() + increment); 305 String output = formatter.toString(); 306 formatter.close(); 307 return output; 308 309 /* 310 if isinstance(pos, SeqFeature.ExactPosition): 311 return "%i" % (pos.position+offset) 312 elif isinstance(pos, SeqFeature.WithinPosition): 313 return "(%i.%i)" % (pos.position + offset, 314 pos.position + pos.extension + offset) 315 elif isinstance(pos, SeqFeature.BetweenPosition): 316 return "(%i^%i)" % (pos.position + offset, 317 pos.position + pos.extension + offset) 318 elif isinstance(pos, SeqFeature.BeforePosition): 319 return "<%i" % (pos.position + offset) 320 elif isinstance(pos, SeqFeature.AfterPosition): 321 return ">%i" % (pos.position + offset) 322 elif isinstance(pos, SeqFeature.OneOfPosition): 323 return "one-of(%s)" \ 324 % ",".join([_insdc_feature_position_string(p,offset) \ 325 for p in pos.position_choices]) 326 elif isinstance(pos, SeqFeature.AbstractPosition): 327 raise NotImplementedError("Please report this as a bug in Biopython.") 328 else: 329 raise ValueError("Expected a SeqFeature position object.") 330 */ 331 } 332 333 /** 334 * Returns a list of strings. 335 * 336 * Any single words which are too long get returned as a whole line 337 * (e.g. URLs) without an exception or warning. 338 * @param text 339 * @param max_len 340 */ 341 protected ArrayList<String> _split_multi_line(String text, int max_len) { 342 // TODO Auto-generated method stub 343 ArrayList<String> output = new ArrayList<String>(); 344 text = text.trim(); 345 if(text.length() <= max_len) { 346 output.add(text); 347 return output; 348 } 349 350 ArrayList<String> words = new ArrayList<String>(); 351 Collections.addAll(words, text.split("\\s+")); 352 while(!words.isEmpty()) { 353 text = words.remove(0); 354 while(!words.isEmpty() && (text.length() + 1 + words.get(0).length()) <= max_len) { 355 text += " " + words.remove(0); 356 text = text.trim(); 357 } 358 output.add(text); 359 } 360 assert words.isEmpty(); 361 return output; 362 } 363 364 365}