001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.core.sequence.io; 022 023import org.biojava.nbio.core.sequence.AccessionID; 024import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 025import org.biojava.nbio.core.sequence.compound.DNACompoundSet; 026import org.biojava.nbio.core.sequence.compound.RNACompoundSet; 027import org.biojava.nbio.core.sequence.features.FeatureInterface; 028import org.biojava.nbio.core.sequence.io.template.GenbankHeaderFormatInterface; 029import org.biojava.nbio.core.sequence.template.AbstractSequence; 030import org.biojava.nbio.core.sequence.template.Compound; 031import org.biojava.nbio.core.util.StringManipulationHelper; 032 033import java.text.SimpleDateFormat; 034import java.util.*; 035 036public class GenericGenbankHeaderFormat<S extends AbstractSequence<C>, C extends Compound> 037 extends GenericInsdcHeaderFormat<S, C> implements 038 GenbankHeaderFormatInterface<S, C> { 039 private static final int HEADER_WIDTH = 12; 040 private static final String lineSep = "%n"; 041 private String seqType = null; 042 private boolean useOriginalHeader = false; 043 044 public GenericGenbankHeaderFormat() { 045 seqType = null; 046 } 047 048 public GenericGenbankHeaderFormat(String seqType) { 049 this.seqType = seqType; 050 } 051 052 public GenericGenbankHeaderFormat(boolean useOriginalHeader) { 053 this.useOriginalHeader = useOriginalHeader; 054 } 055 056 /** 057 * Used in the the 'header' of each GenBank record. 058 * 059 * @param tag 060 * @param text 061 */ 062 private String _write_single_line(String tag, String text) { 063 assert tag.length() < HEADER_WIDTH; 064 return StringManipulationHelper.padRight(tag, HEADER_WIDTH) 065 + text.replace('\n', ' ') + lineSep; 066 } 067 068 /** 069 * Used in the the 'header' of each GenBank record. 070 * 071 * @param tag 072 * @param text 073 */ 074 private String _write_multi_line(String tag, String text) { 075 if (text == null) { 076 text = ""; 077 } 078 int max_len = MAX_WIDTH - HEADER_WIDTH; 079 ArrayList<String> lines = _split_multi_line(text, max_len); 080 String output = _write_single_line(tag, lines.get(0)); 081 for (int i = 1; i < lines.size(); i++) { 082 output += _write_single_line("", lines.get(i)); 083 } 084 return output; 085 } 086 087 private String _get_date(S sequence) { 088 Date sysdate = Calendar.getInstance().getTime(); 089 090 // String default_date = 091 // sysdate.get(Calendar.DAY_OF_MONTH)+"-"+sysdate.get(Calendar.MONTH)+"-"+sysdate.get(Calendar.YEAR); 092 String default_date = new SimpleDateFormat("dd-MMM-yyyy") 093 .format(sysdate); 094 return default_date; 095 /* 096 * try : date = record.annotations["date"] except KeyError : return 097 * default #Cope with a list of one string: if isinstance(date, list) 098 * and len(date)==1 : date = date[0] #TODO - allow a Python date object 099 * if not isinstance(date, str) or len(date) != 11 \ or date[2] != "-" 100 * or date[6] != "-" \ or not date[:2].isdigit() or not 101 * date[7:].isdigit() \ or int(date[:2]) > 31 \ or date[3:6] not in 102 * ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", 103 * "OCT", "NOV", "DEC"] : #TODO - Check is a valid date (e.g. not 31 104 * Feb) return default return date 105 */ 106 } 107 108 private String _get_data_division(S sequence) { 109 return UNKNOWN_DNA; 110 /* 111 * try: division = record.annotations["data_file_division"] except 112 * KeyError: division = "UNK" if division in ["PRI", "ROD", "MAM", 113 * "VRT", "INV", "PLN", "BCT", "VRL", "PHG", "SYN", "UNA", "EST", "PAT", 114 * "STS", "GSS", "HTG", "HTC", "ENV", "CON"]: #Good, already GenBank 115 * style # PRI - primate sequences # ROD - rodent sequences # MAM - 116 * other mammalian sequences # VRT - other vertebrate sequences # INV - 117 * invertebrate sequences # PLN - plant, fungal, and algal sequences # 118 * BCT - bacterial sequences [plus archea] # VRL - viral sequences # PHG 119 * - bacteriophage sequences # SYN - synthetic sequences # UNA - 120 * unannotated sequences # EST - EST sequences (expressed sequence tags) 121 * # PAT - patent sequences # STS - STS sequences (sequence tagged 122 * sites) # GSS - GSS sequences (genome survey sequences) # HTG - HTGS 123 * sequences (high throughput genomic sequences) # HTC - HTC sequences 124 * (high throughput cDNA sequences) # ENV - Environmental sampling 125 * sequences # CON - Constructed sequences # #(plus UNK for unknown) 126 * pass else: #See if this is in EMBL style: # Division Code # 127 * ----------------- ---- # Bacteriophage PHG - common # Environmental 128 * Sample ENV - common # Fungal FUN - map to PLN (plants + fungal) # 129 * Human HUM - map to PRI (primates) # Invertebrate INV - common # Other 130 * Mammal MAM - common # Other Vertebrate VRT - common # Mus musculus 131 * MUS - map to ROD (rodent) # Plant PLN - common # Prokaryote PRO - map 132 * to BCT (poor name) # Other Rodent ROD - common # Synthetic SYN - 133 * common # Transgenic TGN - ??? map to SYN ??? # Unclassified UNC - map 134 * to UNK # Viral VRL - common # #(plus XXX for submiting which we can 135 * map to UNK) embl_to_gbk = {"FUN":"PLN", "HUM":"PRI", "MUS":"ROD", 136 * "PRO":"BCT", "UNC":"UNK", "XXX":"UNK", } try: division = 137 * embl_to_gbk[division] except KeyError: division = "UNK" assert 138 * len(division)==3 return division 139 */ 140 } 141 142 /** 143 * Write the LOCUS line. 144 * 145 * @param sequence 146 */ 147 private String _write_the_first_line(S sequence) { 148 /* 149 * locus = record.name if not locus or locus == "<unknown name>": locus 150 * = record.id if not locus or locus == "<unknown id>": locus = 151 * self._get_annotation_str(record, "accession", just_first=True)\ 152 */ 153 String locus; 154 try { 155 locus = sequence.getAccession().getID(); 156 } catch (Exception e) { 157 locus = ""; 158 } 159 if (locus.length() > 16) { 160 throw new RuntimeException("Locus identifier " + locus 161 + " is too long"); 162 } 163 164 String units = ""; 165 String mol_type = ""; 166 if (sequence.getCompoundSet() instanceof DNACompoundSet) { 167 units = "bp"; 168 mol_type = "DNA"; 169 } else if (sequence.getCompoundSet() instanceof RNACompoundSet) { 170 units = "bp"; 171 mol_type = "RNA"; 172 } else if (sequence.getCompoundSet() instanceof AminoAcidCompoundSet) { 173 units = "aa"; 174 mol_type = ""; 175 } else { 176 throw new RuntimeException( 177 "Need a DNACompoundSet, RNACompoundSet, or an AminoAcidCompoundSet"); 178 } 179 180 String division = _get_data_division(sequence); 181 182 if (seqType != null) { 183 division = seqType; 184 } 185 assert units.length() == 2; 186 187 // the next line does not seem right.. seqType == linear 188 // uncommenting for now 189 //assert division.length() == 3; 190 191 StringBuilder sb = new StringBuilder(); 192 Formatter formatter = new Formatter(sb, Locale.US); 193 formatter 194 .format("LOCUS %s %s %s %s %s %s" + lineSep, 195 StringManipulationHelper.padRight(locus, 16), 196 StringManipulationHelper.padLeft( 197 Integer.toString(sequence.getLength()), 11), 198 units, StringManipulationHelper.padRight(mol_type, 6), division, 199 _get_date(sequence)); 200 String output = formatter.toString(); 201 formatter.close(); 202 return output; 203 /* 204 * assert len(line) == 79+1, repr(line) #plus one for new line 205 * 206 * assert line[12:28].rstrip() == locus, \ 'LOCUS line does not contain 207 * the locus at the expected position:\n' + line assert line[28:29] == 208 * " " assert line[29:40].lstrip() == str(len(record)), \ 'LOCUS line 209 * does not contain the length at the expected position:\n' + line 210 * 211 * #Tests copied from Bio.GenBank.Scanner assert line[40:44] in [' bp ', 212 * ' aa '] , \ 'LOCUS line does not contain size units at expected 213 * position:\n' + line assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], 214 * \ 'LOCUS line does not have valid strand type (Single stranded, 215 * ...):\n' + line assert line[47:54].strip() == "" \ or 216 * line[47:54].strip().find('DNA') != -1 \ or 217 * line[47:54].strip().find('RNA') != -1, \ 'LOCUS line does not contain 218 * valid sequence type (DNA, RNA, ...):\n' + line assert line[54:55] == 219 * ' ', \ 'LOCUS line does not contain space at position 55:\n' + line 220 * assert line[55:63].strip() in ['', 'linear', 'circular'], \ 'LOCUS 221 * line does not contain valid entry (linear, circular, ...):\n' + line 222 * assert line[63:64] == ' ', \ 'LOCUS line does not contain space at 223 * position 64:\n' + line assert line[67:68] == ' ', \ 'LOCUS line does 224 * not contain space at position 68:\n' + line assert line[70:71] == 225 * '-', \ 'LOCUS line does not contain - at position 71 in date:\n' + 226 * line assert line[74:75] == '-', \ 'LOCUS line does not contain - at 227 * position 75 in date:\n' + line 228 */ 229 } 230 231 /** 232 * Write the original LOCUS line. 233 * 234 * @param sequence 235 */ 236 private String _write_original_first_line(S sequence) { 237 238 StringBuilder sb = new StringBuilder(); 239 Formatter formatter = new Formatter(sb, Locale.US); 240 formatter.format("LOCUS %s" + lineSep, 241 StringManipulationHelper.padRight(sequence.getOriginalHeader(), 16)); 242 String output = formatter.toString(); 243 formatter.close(); 244 return output; 245 } 246 247 /** 248 * This is a bit complicated due to the range of possible ways people might 249 * have done their annotation... Currently the parser uses a single string 250 * with newlines. A list of lines is also reasonable. A single (long) string 251 * is perhaps the most natural of all. This means we may need to deal with 252 * line wrapping. 253 * 254 * @param sequence 255 */ 256 private String _write_comment(S sequence) { 257 ArrayList<String> comments = sequence.getNotesList(); 258 String output = _write_multi_line("COMMENT", comments.remove(0)); 259 for (String comment : comments) { 260 output += _write_multi_line("", comment); 261 } 262 263 return output; 264 } 265 266 @Override 267 public String getHeader(S sequence) { 268 269 String header; 270 271 if (useOriginalHeader) { 272 header = _write_original_first_line(sequence); 273 } else { 274 header = _write_the_first_line(sequence); 275 } 276 277 AccessionID accessionIdObj = sequence.getAccession(); 278 String accession; 279 String acc_with_version; 280 281 try { 282 accession = accessionIdObj.getID(); 283 284 if (accessionIdObj.getIdentifier() != null) { 285 acc_with_version = sequence.getAccession().getID() + "." + sequence.getAccession().getVersion() + " GI:" 286 + accessionIdObj.getIdentifier(); 287 288 } else { 289 acc_with_version = sequence.getAccession().getID() + "." + sequence.getAccession().getVersion(); 290 } 291 292 } catch (Exception e) { 293 acc_with_version = ""; 294 accession = ""; 295 } 296 String description = sequence.getDescription(); 297 if ("<unknown description>".equals(description) || description == null) { 298 description = "."; 299 } 300 header += _write_multi_line("DEFINITION", description); 301 header += _write_multi_line("ACCESSION", accession); 302 header += _write_multi_line("VERSION", acc_with_version); 303 304 /* 305 * #The NCBI only expect two types of link so far, #e.g. "Project:28471" 306 * and "Trace Assembly Archive:123456" #TODO - Filter the dbxrefs list 307 * to just these? self._write_multi_entries("DBLINK", record.dbxrefs) 308 * 309 * try: #List of strings #Keywords should be given separated with semi 310 * colons, keywords = "; ".join(record.annotations["keywords"]) #with a 311 * trailing period: if not keywords.endswith(".") : keywords += "." 312 * except KeyError: #If no keywords, there should be just a period: 313 * keywords = "." 314 */ 315 316 header += _write_multi_line("KEYWORDS", "."); 317 318 /* 319 * if "segment" in record.annotations: #Deal with SEGMENT line found 320 * only in segmented records, #e.g. AH000819 segment = 321 * record.annotations["segment"] if isinstance(segment, list): assert 322 * len(segment)==1, segment segment = segment[0] 323 * self._write_single_line("SEGMENT", segment) 324 * 325 * self._write_multi_line("SOURCE", \ self._get_annotation_str(record, 326 * "source")) 327 */ 328 329 header += _write_multi_line("SOURCE", sequence.getSource()); 330 331 /* 332 * #The ORGANISM line MUST be a single line, as any continuation is the 333 * taxonomy org = self._get_annotation_str(record, "organism") if 334 * len(org) > self.MAX_WIDTH - self.HEADER_WIDTH: org = 335 * org[:self.MAX_WIDTH - self.HEADER_WIDTH-4]+"..." 336 * self._write_single_line(" ORGANISM", org) try: #List of strings 337 * #Taxonomy should be given separated with semi colons, taxonomy = 338 * "; ".join(record.annotations["taxonomy"]) #with a trailing period: if 339 * not taxonomy.endswith(".") : taxonomy += "." except KeyError: 340 * taxonomy = "." self._write_multi_line("", taxonomy) 341 * 342 * if "references" in record.annotations: self._write_references(record) 343 */ 344 if (!sequence.getNotesList().isEmpty()) { 345 header += _write_comment(sequence); 346 } 347 348 header += "FEATURES Location/Qualifiers" + lineSep; 349 int rec_length = sequence.getLength(); 350 for (FeatureInterface<AbstractSequence<C>, C> feature : sequence 351 .getFeatures()) { 352 header += _write_feature(feature, rec_length); 353 } 354 355 return String.format(header); 356 } 357 358}