/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.core.sequence.io;

import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
import org.biojava.nbio.core.sequence.compound.RNACompoundSet;
import org.biojava.nbio.core.sequence.features.FeatureInterface;
import org.biojava.nbio.core.sequence.io.template.GenbankHeaderFormatInterface;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.Compound;
import org.biojava.nbio.core.util.StringManipulationHelper;

import java.text.SimpleDateFormat;
import java.util.*;

/**
 * Builds the header section of a GenBank record (LOCUS, DEFINITION,
 * ACCESSION, VERSION, KEYWORDS, SOURCE, optional COMMENT and the FEATURES
 * table) for a sequence.
 * <p>
 * Line endings: every line is terminated with {@link #lineSep}, which is the
 * {@code java.util.Formatter} conversion {@code "%n"}. Only the LOCUS line is
 * produced through {@code String.format}; all other lines built in this class
 * are produced by plain concatenation and therefore carry the two characters
 * {@code %n} literally.
 * NOTE(review): presumably the caller passes the returned header through a
 * {@code Formatter} / {@code PrintWriter.format} call that expands them -
 * confirm; if so, a literal {@code %} inside a description or comment would
 * be misinterpreted there.
 *
 * @param <S> the sequence type
 * @param <C> the compound type of the sequence
 */
public class GenericGenbankHeaderFormat<S extends AbstractSequence<C>, C extends Compound>
        extends GenericInsdcHeaderFormat<S, C> implements
        GenbankHeaderFormatInterface<S, C> {

    /** Width of the tag column that starts every header line. */
    private static final int HEADER_WIDTH = 12;

    /** Maximum length of the locus identifier on the LOCUS line. */
    private static final int LOCUS_WIDTH = 16;

    /** Formatter newline conversion appended to every line (see class comment). */
    private static final String lineSep = "%n";

    /**
     * Column layout of the LOCUS line (0-based, end-exclusive), following the
     * positions checked by the Biopython writer this class was ported from:
     * tag [0:12], locus [12:28], length [29:40], units [41:43],
     * molecule type [47:53], division [64:67], date [68:79].
     */
    private static final String LOCUS_FORMAT =
            "LOCUS       %s %s %s    %s           %s %s";

    /**
     * Optional value written in the division column of the LOCUS line instead
     * of the default division; {@code null} means "use the default".
     */
    private final String seqType;

    /** Creates a formatter that writes the default division on the LOCUS line. */
    public GenericGenbankHeaderFormat() {
        this(null);
    }

    /**
     * Creates a formatter that writes {@code seqType} in place of the
     * division on the LOCUS line.
     *
     * @param seqType replacement for the division column, or {@code null} to
     *                keep the default division
     */
    public GenericGenbankHeaderFormat(String seqType) {
        this.seqType = seqType;
    }

    /**
     * Formats one physical header line: the tag padded to
     * {@link #HEADER_WIDTH}, the text with embedded newlines replaced by
     * spaces, and the line terminator.
     *
     * @param tag  the line tag (shorter than {@link #HEADER_WIDTH}); empty
     *             for continuation lines
     * @param text the line content; must not be {@code null}
     * @return the formatted line
     */
    private String _write_single_line(String tag, String text) {
        assert tag.length() < HEADER_WIDTH;
        return StringManipulationHelper.padRight(tag, HEADER_WIDTH)
                + text.replace('\n', ' ') + lineSep;
    }

    /**
     * Formats a header entry that may need wrapping: the first line carries
     * the tag, every following line is a continuation line with an empty tag.
     *
     * @param tag  the entry tag
     * @param text the entry content; {@code null} is treated as empty
     * @return the formatted line(s)
     */
    private String _write_multi_line(String tag, String text) {
        String content = (text == null) ? "" : text;
        List<String> lines = _split_multi_line(content, MAX_WIDTH - HEADER_WIDTH);

        StringBuilder output = new StringBuilder(_write_single_line(tag, lines.get(0)));
        for (int i = 1; i < lines.size(); i++) {
            output.append(_write_single_line("", lines.get(i)));
        }
        return output.toString();
    }

    /*
     * TODO: DBLINK (and similar multi-entry line types) are not written yet.
     * The Biopython original writes the first entry on a tagged line and each
     * further entry on a continuation line; if the list of entries is empty,
     * nothing is written.
     */

    /**
     * Returns the date for the LOCUS line. Currently this is always today's
     * date in {@code dd-MMM-yyyy} form; the sequence is not consulted.
     * <p>
     * The month abbreviation is formatted with {@code Locale.US} so that the
     * output does not depend on the default locale of the machine.
     * <p>
     * TODO: port the Biopython behaviour - use the record's "date" annotation
     * (unwrapping a single-element list) when it is an 11-character
     * {@code DD-MMM-YYYY} string with numeric day (at most 31) and year and a
     * month in JAN, FEB, MAR, APR, MAY, JUN, JUL, AUG, SEP, OCT, NOV, DEC;
     * otherwise fall back to the default. Full calendar validation (e.g.
     * rejecting 31 Feb) is also still open.
     *
     * @param sequence the sequence being written (currently unused)
     * @return the formatted date
     */
    private String _get_date(S sequence) {
        Date now = Calendar.getInstance().getTime();
        return new SimpleDateFormat("dd-MMM-yyyy", Locale.US).format(now);
    }

    /**
     * Returns the data division for the LOCUS line. Currently this is always
     * the inherited {@code UNKNOWN_DNA} value; the sequence is not consulted.
     * <p>
     * TODO: port the Biopython behaviour - read the record's
     * "data_file_division" annotation (default "UNK") and accept the GenBank
     * codes as they are:
     * PRI (primate), ROD (rodent), MAM (other mammalian), VRT (other
     * vertebrate), INV (invertebrate), PLN (plant, fungal, algal),
     * BCT (bacterial, plus archaea), VRL (viral), PHG (bacteriophage),
     * SYN (synthetic), UNA (unannotated), EST (expressed sequence tags),
     * PAT (patent), STS (sequence tagged sites), GSS (genome survey),
     * HTG (high throughput genomic), HTC (high throughput cDNA),
     * ENV (environmental sampling), CON (constructed).
     * EMBL-style codes are mapped: FUN to PLN, HUM to PRI, MUS to ROD,
     * PRO to BCT, UNC to UNK, XXX to UNK (TGN possibly to SYN); anything else
     * becomes UNK. The result must be exactly three characters long.
     *
     * @param sequence the sequence being written (currently unused)
     * @return the division code
     */
    private String _get_data_division(S sequence) {
        return UNKNOWN_DNA;
    }

    /**
     * Best-effort lookup of the accession ID of a sequence.
     *
     * @param sequence the sequence being written
     * @return the accession ID, or an empty string when the sequence has no
     *         accession, the ID is {@code null}, or the lookup fails
     */
    private String accessionIdOrEmpty(S sequence) {
        try {
            String id = sequence.getAccession().getID();
            return (id == null) ? "" : id;
        } catch (Exception ignored) {
            // Deliberately lenient: a record without a usable accession is
            // still written, just with a blank identifier.
            return "";
        }
    }

    /**
     * Writes the LOCUS line.
     * <p>
     * TODO: the Biopython original prefers the record name, then the record
     * id, and only then the first accession; here only the accession ID is
     * used.
     *
     * @param sequence the sequence being written
     * @return the formatted LOCUS line, terminated by the platform line
     *         separator
     * @throws RuntimeException if the locus identifier is longer than 16
     *                          characters, or the compound set of the
     *                          sequence is neither DNA, RNA nor amino acid
     */
    private String _write_the_first_line(S sequence) {
        String locus = accessionIdOrEmpty(sequence);
        if (locus.length() > LOCUS_WIDTH) {
            throw new RuntimeException("Locus identifier " + locus
                    + " is too long");
        }

        // Held as Object so the instanceof checks need no extra type import.
        Object compoundSet = sequence.getCompoundSet();
        final String units;
        final String mol_type;
        if (compoundSet instanceof DNACompoundSet) {
            units = "bp";
            mol_type = "DNA";
        } else if (compoundSet instanceof RNACompoundSet) {
            // Previously this branch tested DNACompoundSet a second time and
            // was therefore unreachable; RNA sequences ended in the exception.
            units = "bp";
            mol_type = "RNA";
        } else if (compoundSet instanceof AminoAcidCompoundSet) {
            units = "aa";
            mol_type = "";
        } else {
            throw new RuntimeException(
                    "Need a DNACompoundSet, RNACompoundSet, or an AminoAcidCompoundSet");
        }
        assert units.length() == 2;

        // A configured seqType (e.g. "linear") replaces the division. For that
        // reason the division is not asserted to be three characters long.
        String division = (seqType != null) ? seqType : _get_data_division(sequence);

        // String.format expands the trailing %n, so this line - unlike the
        // other header lines - ends with a real line separator.
        return String.format(Locale.US, LOCUS_FORMAT + lineSep,
                StringManipulationHelper.padRight(locus, LOCUS_WIDTH),
                StringManipulationHelper.padLeft(
                        Integer.toString(sequence.getLength()), 11),
                units,
                StringManipulationHelper.padRight(mol_type, 6),
                division,
                _get_date(sequence));
    }

    /**
     * Writes the COMMENT section from the notes of the sequence. The first
     * note starts on the line tagged COMMENT; every further note starts on a
     * continuation line. Long notes are wrapped.
     * <p>
     * The list returned by the sequence is only read, never modified, so
     * repeated calls produce the same output.
     *
     * @param sequence the sequence being written
     * @return the formatted COMMENT lines, or an empty string if the sequence
     *         has no notes
     */
    private String _write_comment(S sequence) {
        List<String> comments = sequence.getNotesList();
        StringBuilder output = new StringBuilder();
        String tag = "COMMENT";
        for (String comment : comments) {
            output.append(_write_multi_line(tag, comment));
            tag = ""; // only the first note carries the tag
        }
        return output.toString();
    }

    /**
     * Builds the complete GenBank header for a sequence: LOCUS, DEFINITION,
     * ACCESSION, VERSION, KEYWORDS, SOURCE, COMMENT (only if the sequence has
     * notes) and the FEATURES table.
     * <p>
     * ACCESSION is the accession ID up to (not including) its first dot;
     * VERSION is the complete accession ID.
     * <p>
     * TODO (not yet ported from Biopython): the GI number on the VERSION
     * line; DBLINK entries; real keywords (joined with "; " and ending with a
     * period); the SEGMENT line of segmented records; the ORGANISM line
     * (single line, truncated with "..." when too long) followed by the
     * taxonomy; REFERENCE entries.
     *
     * @param sequence the sequence to describe
     * @return the header text (see the class comment about line endings)
     * @throws RuntimeException if the LOCUS line cannot be written, see
     *                          {@link #_write_the_first_line(AbstractSequence)}
     */
    @Override
    public String getHeader(S sequence) {
        StringBuilder header = new StringBuilder(_write_the_first_line(sequence));

        String acc_with_version = accessionIdOrEmpty(sequence);
        // Strip the ".version" suffix. The former split("\\.", 1) never split
        // anything, because a limit of 1 yields the whole input as one element.
        int dot = acc_with_version.indexOf('.');
        String accession = (dot < 0)
                ? acc_with_version
                : acc_with_version.substring(0, dot);

        String description = sequence.getDescription();
        if (description == null || "<unknown description>".equals(description)) {
            description = ".";
        }

        header.append(_write_multi_line("DEFINITION", description));
        header.append(_write_multi_line("ACCESSION", accession));
        header.append(_write_multi_line("VERSION", acc_with_version));

        // No keywords are available yet, so only the mandatory period is written.
        header.append(_write_multi_line("KEYWORDS", "."));

        header.append(_write_multi_line("SOURCE", sequence.getSource()));

        if (!sequence.getNotesList().isEmpty()) {
            header.append(_write_comment(sequence));
        }

        header.append("FEATURES             Location/Qualifiers").append(lineSep);
        int rec_length = sequence.getLength();
        for (FeatureInterface<AbstractSequence<C>, C> feature : sequence
                .getFeatures()) {
            header.append(_write_feature(feature, rec_length));
        }

        return header.toString();
    }

}