001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojavax.bio.seq.io; 023 024import java.io.BufferedInputStream; 025import java.io.BufferedReader; 026import java.io.File; 027import java.io.FileReader; 028import java.io.IOException; 029import java.io.InputStreamReader; 030import java.io.PrintStream; 031import java.util.ArrayList; 032import java.util.HashSet; 033import java.util.Iterator; 034import java.util.List; 035import java.util.Set; 036import java.util.TreeSet; 037import java.util.regex.Matcher; 038import java.util.regex.Pattern; 039 040import org.biojava.bio.seq.Sequence; 041import org.biojava.bio.seq.io.ParseException; 042import org.biojava.bio.seq.io.SeqIOListener; 043import org.biojava.bio.seq.io.SymbolTokenization; 044import org.biojava.bio.symbol.IllegalAlphabetException; 045import org.biojava.bio.symbol.IllegalSymbolException; 046import org.biojava.bio.symbol.SimpleSymbolList; 047import org.biojava.bio.symbol.Symbol; 048import org.biojava.bio.symbol.SymbolList; 049import org.biojava.utils.ChangeVetoException; 050import org.biojavax.Comment; 051import org.biojavax.CrossRef; 052import org.biojavax.DocRef; 053import org.biojavax.DocRefAuthor; 054import org.biojavax.Namespace; 055import org.biojavax.Note; 056import org.biojavax.RankedCrossRef; 057import org.biojavax.RankedDocRef; 058import org.biojavax.RichObjectFactory; 059import org.biojavax.SimpleComment; 060import 
org.biojavax.SimpleCrossRef; 061import org.biojavax.SimpleDocRef; 062import org.biojavax.SimpleRankedCrossRef; 063import org.biojavax.SimpleRankedDocRef; 064import org.biojavax.SimpleRichAnnotation; 065import org.biojavax.bio.seq.CompoundRichLocation; 066import org.biojavax.bio.seq.RichFeature; 067import org.biojavax.bio.seq.RichLocation; 068import org.biojavax.bio.seq.RichSequence; 069import org.biojavax.bio.seq.SimplePosition; 070import org.biojavax.bio.seq.SimpleRichLocation; 071import org.biojavax.bio.taxa.NCBITaxon; 072import org.biojavax.bio.taxa.SimpleNCBITaxon; 073import org.biojavax.ontology.ComparableTerm; 074import org.biojavax.utils.StringTools; 075 076/** 077 * Format reader for GenBank files. This version of Genbank format will generate 078 * and write RichSequence objects. Loosely Based on code from the old, deprecated, 079 * org.biojava.bio.seq.io.GenbankFormat object. 080 * 081 * @author Richard Holland 082 * @author Mark Schreiber 083 * @author David Scott 084 * @author Bubba Puryear 085 * @author George Waldon 086 * @author Deepak Sheoran 087 * @since 1.5 088 */ 089public class GenbankFormat extends RichSequenceFormat.HeaderlessFormat { 090 091 // Register this format with the format auto-guesser. 
static {
        RichSequence.IOTools.registerFormat(GenbankFormat.class);
    }

    /**
     * The name of this format
     */
    public static final String GENBANK_FORMAT = "GENBANK";

    // Keywords that open the sections (and the REFERENCE sub-entries) of a
    // GenBank flat-file record. The reader compares section keys against
    // these, and the writer emits them as line tags.
    protected static final String LOCUS_TAG = "LOCUS";
    protected static final String DEFINITION_TAG = "DEFINITION";
    protected static final String ACCESSION_TAG = "ACCESSION";
    protected static final String VERSION_TAG = "VERSION";
    protected static final String KEYWORDS_TAG = "KEYWORDS";
    // "SEGMENT"
    protected static final String SOURCE_TAG = "SOURCE";
    protected static final String ORGANISM_TAG = "ORGANISM";
    protected static final String REFERENCE_TAG = "REFERENCE";
    protected static final String AUTHORS_TAG = "AUTHORS";
    protected static final String CONSORTIUM_TAG = "CONSRTM";
    protected static final String TITLE_TAG = "TITLE";
    protected static final String JOURNAL_TAG = "JOURNAL";
    protected static final String PUBMED_TAG = "PUBMED";
    protected static final String MEDLINE_TAG = "MEDLINE"; //deprecated
    protected static final String REMARK_TAG = "REMARK";
    protected static final String COMMENT_TAG = "COMMENT";
    protected static final String FEATURE_TAG = "FEATURES";
    protected static final String BASE_COUNT_TAG_FULL = "BASE COUNT"; //deprecated
    protected static final String BASE_COUNT_TAG = "BASE";
    // "CONTIG"
    protected static final String START_SEQUENCE_TAG = "ORIGIN";
    protected static final String END_SEQUENCE_TAG = "//";

    // locus line
    // groups as consumed by readRichSequence: 1 = name, 2 = "bp" or "aa",
    // 3 = optional strandedness prefix (ds-, ms-, ss-), 4 = molecule type,
    // 5 = "circular" or "linear", 6 and 7 = the trailing one or two tokens
    // (division then date when both are present, date only otherwise)
    protected static final Pattern lp = Pattern.compile("^(\\S+)\\s+\\d+\\s+(bp|aa)\\s{1,4}([dms]s-)?(\\S+)?\\s+(circular|linear)?\\s*(\\S+)?\\s*(\\S+)?$");
    // version line
    // groups: 1 = accession, 3 = optional numeric version, 5 = optional GI identifier
    protected static final Pattern vp = Pattern.compile("^(\\S*?)(\\.(\\d+))?(\\s+GI:(\\S+))?$");
    // reference line
    // refRange matches one "<start> to <end>" range; refp matches the whole
    // REFERENCE header: group 1 = rank, group 3 = the ';'-separated list of
    // ranges when a (bases ...) or (residues ...) clause is present
    protected static final Pattern refRange = Pattern.compile("^\\s*(\\d+)\\s+to\\s+(\\d+)$");
    protected static final Pattern refp = Pattern.compile("^(\\d+)\\s*(?:(\\((?:bases|residues)\\s+(\\d+\\s+to\\s+\\d+(\\s*;\\s*\\d+\\s+to\\s+\\d+)*)\\))|\\(sites\\))?");
    // dbxref line
    // groups: 1 = database name (everything before the first ':'), 2 = accession
    protected static final Pattern dbxp = Pattern.compile("^([^:]+):(\\S+)$");
    //sections start at a line and continue till the first line afterwards with a
    //non-whitespace first character
    //we want to match any of the following as a new section within a section
    // \s{0,8} word \s{0,7} value
    // \s{21} /word = value
    // \s{21} /word
    // groups: 2/3 = key and value of a plain tag line, 4/5 = key and value of
    // a "/qualifier=value" line, 6 = key of a value-less "/qualifier" line
    protected static final Pattern sectp = Pattern.compile("^(\\s{0,8}(\\S+)\\s{0,7}(.*)|\\s{21}(/\\S+?)=(.*)|\\s{21}(/\\S+))$");

    // File names accepted without inspecting the content: the name either
    // ends in "gb" or "gp" followed by any number of 'k' characters, or
    // contains a literal dot followed by "gb" or "gp".
    protected static final Pattern readableFiles = Pattern.compile(".*(g[bp]k*$|\\u002eg[bp].*)");
    // First line of a record: anything starting with the word LOCUS.
    protected static final Pattern headerLine = Pattern.compile("^LOCUS.*");

    // Feature qualifier names collected for unquoted output; presumably
    // consulted by isNotQuoted(Note) when writing features (that method is
    // defined elsewhere in this class) - confirm there.
    private final static HashSet isNotQuoted = new HashSet();
    static {
        isNotQuoted.add("anticodon");
        isNotQuoted.add("citation");
        isNotQuoted.add("codon");
        isNotQuoted.add("codon_start");
        isNotQuoted.add("compare");
        isNotQuoted.add("cons_splice");
        isNotQuoted.add("direction");
        isNotQuoted.add("estimated_length");
        isNotQuoted.add("label");
        isNotQuoted.add("mod_base");
        isNotQuoted.add("number");
        isNotQuoted.add("rpt_type");
        isNotQuoted.add("rpt_unit_range");
        isNotQuoted.add("transl_except");
        isNotQuoted.add("transl_table");
    }

    /**
     * Implements some GenBank-specific terms.
166 */ 167 public static class Terms extends RichSequence.Terms { 168 /** 169 * Getter for the Genbank term 170 * @return The genbank Term 171 */ 172 public static ComparableTerm getGenBankTerm() { 173 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("GenBank"); 174 } 175 } 176 177 /** 178 * {@inheritDoc} 179 * A file is in GenBank format if the name ends with gbk, contains the letters egb, or the first line of 180 * the file starts with the word LOCUS 181 */ 182 public boolean canRead(File file) throws IOException { 183 if (readableFiles.matcher(file.getName()).matches()) return true; 184 BufferedReader br = new BufferedReader(new FileReader(file)); 185 final String firstLine = br.readLine(); 186 boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches(); 187 br.close(); 188 return readable; 189 } 190 191 /** 192 * {@inheritDoc} 193 * Returns an dna parser if the letters DNA or RNA appear in the first line of the file. 194 * Otherwise returns a DNA tokenizer. 195 */ 196 public SymbolTokenization guessSymbolTokenization(File file) throws IOException { 197 BufferedReader br = new BufferedReader(new FileReader(file)); 198 String firstLine = br.readLine(); 199 boolean dna = (firstLine.indexOf("DNA") >0 || firstLine.indexOf("RNA") > 0); 200 br.close(); 201 if (dna) return RichSequence.IOTools.getDNAParser(); 202 else return RichSequence.IOTools.getProteinParser(); 203 } 204 205 /** 206 * {@inheritDoc} 207 * A stream is in GenBank format if the first line of the stream starts with the word LOCUS 208 */ 209 public boolean canRead(BufferedInputStream stream) throws IOException { 210 stream.mark(2000); // some streams may not support this 211 BufferedReader br = new BufferedReader(new InputStreamReader(stream)); 212 final String firstLine = br.readLine(); 213 boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches(); 214 // don't close the reader as it'll close the stream too. 
215 // br.close(); 216 stream.reset(); 217 return readable; 218 } 219 220 /** 221 * {@inheritDoc} 222 * Returns an dna parser if the letters DNA or RNA appear in the first line of the stream. 223 * Otherwise returns a DNA tokenizer. 224 */ 225 public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException { 226 stream.mark(2000); // some streams may not support this 227 BufferedReader br = new BufferedReader(new InputStreamReader(stream)); 228 String firstLine = br.readLine(); 229 boolean dna = (firstLine.indexOf("DNA") >0 || firstLine.indexOf("RNA") > 0); 230 // don't close the reader as it'll close the stream too. 231 // br.close(); 232 stream.reset(); 233 if (dna) return RichSequence.IOTools.getDNAParser(); 234 else return RichSequence.IOTools.getProteinParser(); 235 } 236 237 /** 238 * {@inheritDoc} 239 */ 240 public boolean readSequence(BufferedReader reader, 241 SymbolTokenization symParser, 242 SeqIOListener listener) 243 throws IllegalSymbolException, IOException, ParseException { 244 if (!(listener instanceof RichSeqIOListener)) throw new IllegalArgumentException("Only accepting RichSeqIOListeners today"); 245 return this.readRichSequence(reader,symParser,(RichSeqIOListener)listener,null); 246 } 247 248 private String sectionKey = null; 249 private NCBITaxon tax = null; 250 private String organism = null; 251 private String accession = null; 252 private String identifier = null; 253 /** 254 * {@inheritDoc} 255 */ 256 public boolean readRichSequence(BufferedReader reader, 257 SymbolTokenization symParser, 258 RichSeqIOListener rlistener, 259 Namespace ns) 260 throws IllegalSymbolException, IOException, ParseException { 261 262 sectionKey = null; 263 tax = null; 264 organism = null; 265 accession = null; 266 identifier = null; 267 boolean hasAnotherSequence = true; 268 //boolean hasInternalWhitespace = false; 269 270 rlistener.startSequence(); 271 272 if (ns==null) ns=RichObjectFactory.getDefaultNamespace(); 273 
rlistener.setNamespace(ns); 274 275 // Get an ordered list of key->value pairs in array-tuples 276 List section = null; 277 try{ 278 do { 279 section = this.readSection(reader); 280 sectionKey = ((String[])section.get(0))[0]; 281 if(sectionKey == null){ 282 String message = ParseException.newMessage(this.getClass(), accession, identifier, "Section key was null", sectionToString(section)); 283 throw new ParseException(message); 284 } 285 // process section-by-section 286 if (sectionKey.equals(LOCUS_TAG)) { 287 String loc = ((String[])section.get(0))[1]; 288 Matcher m = lp.matcher(loc); 289 if (m.matches()) { 290 rlistener.setName(m.group(1)); 291 accession = m.group(1); // default if no accession found 292 rlistener.setAccession(accession); 293 if (m.group(4)!=null) 294 rlistener.addSequenceProperty(Terms.getMolTypeTerm(),m.group(4)); 295 // Optional extras 296 String stranded = m.group(3); 297 if(stranded!=null && stranded.equals("ss-")) 298 stranded = "single"; 299 else if(stranded!=null && stranded.equals("ms-")) 300 stranded = "mixed"; 301 else if(stranded!=null && stranded.equals("ds-")) 302 stranded = "double"; 303 String circular = m.group(5); 304 String fifth = m.group(6); 305 String sixth = m.group(7); 306 if (stranded!=null) rlistener.addSequenceProperty(Terms.getStrandedTerm(),stranded); 307 if (circular!=null && circular.equalsIgnoreCase("circular")) rlistener.setCircular(true); 308 if (sixth != null) { 309 rlistener.setDivision(fifth); 310 rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(),sixth); 311 } else if (fifth!=null) { 312 rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(),fifth); 313 } 314 } else { 315 String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad locus line", sectionToString(section)); 316 throw new ParseException(message); 317 } 318 } else if (sectionKey.equals(DEFINITION_TAG)) { 319 rlistener.setDescription(((String[])section.get(0))[1]); 320 } else if (sectionKey.equals(ACCESSION_TAG)) { 
321 // if multiple accessions, store only first as accession, 322 // and store rest in annotation 323 String[] accs = ((String[])section.get(0))[1].split("\\s+"); 324 accession = accs[0].trim(); 325 rlistener.setAccession(accession); 326 for (int i = 1; i < accs.length; i++) { 327 rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accs[i].trim()); 328 } 329 } else if (sectionKey.equals(VERSION_TAG)) { 330 String ver = ((String[])section.get(0))[1]; 331 Matcher m = vp.matcher(ver); 332 if (m.matches()) { 333 String verAcc = m.group(1); 334 if (!accession.equals(verAcc)) { 335 // the version refers to a different accession! 336 // believe the version line, and store the original 337 // accession away in the additional accession set 338 rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accession); 339 accession = verAcc; 340 rlistener.setAccession(accession); 341 } 342 if (m.group(3)!=null) rlistener.setVersion(Integer.parseInt(m.group(3))); 343 if (m.group(5)!=null) { 344 identifier = m.group(5); 345 rlistener.setIdentifier(identifier); 346 } 347 } else { 348 String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad version line", sectionToString(section)); 349 throw new ParseException(message); 350 } 351 } else if (sectionKey.equals(KEYWORDS_TAG)) { 352 String val = ((String[])section.get(0))[1]; 353 if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot 354 val = val.replace('\n',' '); //remove newline 355 String[] kws = val.split(";"); 356 357 for (int i = 0; i < kws.length; i++) { 358 String kw = kws[i].trim(); 359 if (kw.length()==0) continue; 360 rlistener.addSequenceProperty(Terms.getKeywordTerm(), kw); 361 } 362 } else if (sectionKey.equals(SOURCE_TAG)) { 363 // ignore - can get all this from the first feature 364 } else if (sectionKey.equals(REFERENCE_TAG) && !this.getElideReferences()) { 365 // first line of section has rank and location 366 int ref_rank; 367 List 
baseRangeList=null; 368 String ref = ((String[])section.get(0))[1]; 369 Matcher m = refp.matcher(ref); 370 if (m.matches()) { 371 ref_rank = Integer.parseInt(m.group(1)); 372 if (m.group(3) != null) baseRangeList=buildBaseRanges(m.group(3)); 373 } else { 374 String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad reference line", sectionToString(section)); 375 throw new ParseException(message); 376 } 377 // rest can be in any order 378 String authors = null; 379 String consortium = null; 380 String title = null; 381 String journal = null; 382 String medline = null; 383 String pubmed = null; 384 String remark = null; 385 for (int i = 1; i < section.size(); i++) { 386 String key = ((String[])section.get(i))[0]; 387 String val = ((String[])section.get(i))[1]; 388 if (key.equals(AUTHORS_TAG)) authors = val.replace('\n',' '); //see #2276 389 else if (key.equals(CONSORTIUM_TAG)) consortium = val.replace('\n',' '); //see #2276 390 else if (key.equals(TITLE_TAG)) title = val.replace('\n',' '); //see #2276 391 else if (key.equals(JOURNAL_TAG)) journal = val.replace('\n',' '); //see #2276 392 else if (key.equals(MEDLINE_TAG)) medline = val; 393 else if (key.equals(PUBMED_TAG)) pubmed = val; 394 else if (key.equals(REMARK_TAG)) remark = val.replace('\n',' '); //see #2276 395 } 396 397 // create the docref object 398 try { 399 // Use consortium as well if present. 400 if (authors==null) authors = consortium + " (consortium)"; 401 else if (consortium!=null) authors = authors + ", " + consortium + " (consortium)"; 402 // Create docref. 
403 DocRef dr = null; 404 // assign either the pubmed or medline to the docref - medline gets priority 405 if (medline != null) { 406 dr = (DocRef) RichObjectFactory.getObject(SimpleDocRef.class, new Object[]{DocRefAuthor.Tools.parseAuthorString(authors), journal, title, Terms.MEDLINE_KEY, medline, new Integer(0)}); 407 if (dr.getCrossref() == null) { 408 dr.setCrossref((CrossRef) RichObjectFactory.getObject(SimpleCrossRef.class, new Object[]{Terms.MEDLINE_KEY, medline, new Integer(0)})); 409 } 410 } else if (pubmed != null) { 411 dr = (DocRef) RichObjectFactory.getObject(SimpleDocRef.class, new Object[]{DocRefAuthor.Tools.parseAuthorString(authors), journal, title, Terms.PUBMED_KEY, pubmed, new Integer(0)}); 412 if (dr.getCrossref() == null) { 413 dr.setCrossref((CrossRef) RichObjectFactory.getObject(SimpleCrossRef.class, new Object[]{Terms.PUBMED_KEY, pubmed, new Integer(0)})); 414 } 415 } else { 416 dr = (DocRef) RichObjectFactory.getObject(SimpleDocRef.class, new Object[]{DocRefAuthor.Tools.parseAuthorString(authors), journal, title}); 417 } 418 // assign the remarks 419 if (!this.getElideComments()) dr.setRemark(remark); 420 // assign the docref to the bioentry: null if no base ranges, Integers if 1 base range - the normal case, joined RichLocation if more than 1 421 RankedDocRef rdr = baseRangeList == null?new SimpleRankedDocRef(dr, null, null, ref_rank):(baseRangeList.size()==1?new SimpleRankedDocRef(dr, new Integer(((RichLocation)baseRangeList.get(0)).getMin()), new Integer(((RichLocation)baseRangeList.get(0)).getMax()), ref_rank):new SimpleRankedDocRef(dr, new CompoundRichLocation(baseRangeList), ref_rank)); 422 rlistener.setRankedDocRef(rdr); 423 } catch (ChangeVetoException e) { 424 throw new ParseException(e+", accession:"+accession); 425 } 426 } else if (sectionKey.equals(COMMENT_TAG) && !this.getElideComments()) { 427 // Set up some comments 428 rlistener.setComment(((String[])section.get(0))[1]); 429 } else if (sectionKey.equals(FEATURE_TAG) && 
!this.getElideFeatures()) { 430 // starting from second line of input, start a new feature whenever we come across 431 // a key that does not start with / 432 boolean seenAFeature = false; 433 int rcrossrefCount = 0; 434 boolean skippingBond = false; 435 for (int i = 1 ; i < section.size(); i++) { 436 String key = ((String[])section.get(i))[0]; 437 String val = ((String[])section.get(i))[1]; 438 if (key.startsWith("/")) { 439 if(!skippingBond) 440 { 441 key = key.substring(1); // strip leading slash 442 val = val.replaceAll("\\s*[\\n\\r]+\\s*"," ").trim(); 443 if (val.endsWith("\"")) val = val.substring(1,val.length()-1); // strip quotes 444 // parameter on old feature 445 if (key.equals("db_xref")) { 446 val = val.replaceAll("\\s+",""); 447 Matcher m = dbxp.matcher(val); 448 if (m.matches()) { 449 String dbname = m.group(1); 450 String raccession = m.group(2); 451 if (dbname.equalsIgnoreCase("taxon")) { 452 // Set the Taxon instead of a dbxref 453 tax = (NCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class, new Object[]{Integer.valueOf(raccession)}); 454 rlistener.setTaxon(tax); 455 try { 456 if (organism!=null) tax.addName(NCBITaxon.SCIENTIFIC,organism.replace('\n', ' '));// readSection can embed new lines 457 } catch (ChangeVetoException e) { 458 throw new ParseException(e+", accession:"+accession); 459 } 460 } else { 461 try { 462 CrossRef cr = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{dbname, raccession, new Integer(0)}); 463 RankedCrossRef rcr = new SimpleRankedCrossRef(cr, ++rcrossrefCount); 464 rlistener.getCurrentFeature().addRankedCrossRef(rcr); 465 } catch (ChangeVetoException e) { 466 throw new ParseException(e+", accession:"+accession); 467 } 468 } 469 } else { 470 String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad dbxref", sectionToString(section)); 471 throw new ParseException(message); 472 } 473 } else if (key.equalsIgnoreCase("organism")) { 474 try { 475 organism = val; 476 
if (tax!=null) tax.addName(NCBITaxon.SCIENTIFIC,organism.replace('\n', ' '));// readSection can embed new lines 477 } catch (ChangeVetoException e) { 478 throw new ParseException(e+", accession:"+accession); 479 } 480 } else { 481 if (key.equalsIgnoreCase("translation")) { 482 // strip spaces from sequence 483 val = val.replaceAll("\\s+",""); 484 } 485 rlistener.addFeatureProperty(RichObjectFactory.getDefaultOntology().getOrCreateTerm(key),val); 486 } 487 } 488 } else { 489 // new feature! 490 // end previous feature 491 if(key.equalsIgnoreCase("bond")) 492 { 493 skippingBond = true; 494 } 495 else 496 { 497 skippingBond = false; 498 if (seenAFeature) { 499 rlistener.endFeature(); 500 } 501 // start next one, with lots of lovely info in it 502 RichFeature.Template templ = new RichFeature.Template(); 503 templ.annotation = new SimpleRichAnnotation(); 504 templ.sourceTerm = Terms.getGenBankTerm(); 505 templ.typeTerm = RichObjectFactory.getDefaultOntology().getOrCreateTerm(key); 506 templ.featureRelationshipSet = new TreeSet(); 507 templ.rankedCrossRefs = new TreeSet(); 508 String tidyLocStr = val.replaceAll("\\s+",""); 509 templ.location = GenbankLocationParser.parseLocation(ns, accession, tidyLocStr); 510 rlistener.startFeature(templ); 511 seenAFeature = true; 512 rcrossrefCount = 0; 513 } 514 515 } 516 } 517 518 if (seenAFeature) { 519 rlistener.endFeature(); 520 } 521 } else if (sectionKey.equals(BASE_COUNT_TAG)) { 522 // ignore - can calculate from sequence content later if needed 523 } else if (sectionKey.equals(START_SEQUENCE_TAG) && !this.getElideSymbols()) { 524 // our first line is ignorable as it is the ORIGIN tag 525 // the second line onwards conveniently have the number as 526 // the [0] tuple, and sequence string as [1] so all we have 527 // to do is concat the [1] parts and then strip out spaces, 528 // and replace '.' and '~' with '-' for our parser. 
529 StringBuffer seq = new StringBuffer(); 530 for (int i = 1 ; i < section.size(); i++) seq.append(((String[])section.get(i))[1]); 531 try { 532 SymbolList sl = new SimpleSymbolList(symParser, 533 seq.toString().replaceAll("\\s+","").replaceAll("[\\.|~]","-")); 534 rlistener.addSymbols(symParser.getAlphabet(), 535 (Symbol[])(sl.toList().toArray(new Symbol[0])), 536 0, sl.length()); 537 } catch (IllegalAlphabetException e) { 538 String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad sequence section", sectionToString(section)); 539 throw new ParseException(e, message); 540 } 541 } 542 } while (!sectionKey.equals(END_SEQUENCE_TAG)); 543 }catch(RuntimeException e){ 544 String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad sequence section", sectionToString(section)); 545 throw new ParseException(e, message); 546 } 547 548 // Allows us to tolerate trailing whitespace without 549 // thinking that there is another Sequence to follow 550 while (true) { 551 reader.mark(1); 552 int c = reader.read(); 553 if (c == -1) { 554 hasAnotherSequence = false; 555 break; 556 } 557 if (Character.isWhitespace((char) c)) { 558 //hasInternalWhitespace = true; 559 continue; 560 } 561 //if (hasInternalWhitespace) 562 // System.err.println("Warning: whitespace found between sequence entries"); 563 reader.reset(); 564 break; 565 } 566 567 // Finish up. 568 rlistener.endSequence(); 569 return hasAnotherSequence; 570 } 571 572 // reads an indented section, combining split lines and creating a list of key->value tuples 573 private List readSection(BufferedReader br) throws ParseException { 574 List section = new ArrayList(); 575 String line = ""; 576 String currKey = null; 577 StringBuffer currVal = new StringBuffer(); 578 boolean done = false; 579 int linecount = 0; 580 581 try { 582 while (!done) { 583 br.mark(320); 584 line = br.readLine(); 585 String firstSecKey = section.isEmpty() ? 
"" : ((String[])section.get(0))[0]; 586 if (line != null && line.matches("\\p{Space}*")) { 587 // regular expression \p{Space}* will match line 588 // having only white space characters 589 continue; 590 } 591 if (line==null || (!line.startsWith(" ") && linecount++>0 && ( !firstSecKey.equals(START_SEQUENCE_TAG) || line.startsWith(END_SEQUENCE_TAG)))) { 592 // dump out last part of section 593 section.add(new String[]{currKey,currVal.toString()}); 594 br.reset(); 595 done = true; 596 } else { 597 if (getElideSymbols() && firstSecKey.equals(START_SEQUENCE_TAG) && !line.startsWith(END_SEQUENCE_TAG)) { 598 continue; 599 } 600 Matcher m = sectp.matcher(line); 601 if (m.matches()) { 602 // new key 603 if (currKey!=null) section.add(new String[]{currKey,currVal.toString()}); 604 // key = group(2) or group(4) or group(6) - whichever is not null 605 currKey = m.group(2)==null?(m.group(4)==null?m.group(6):m.group(4)):m.group(2); 606 currVal = new StringBuffer(); 607 // val = group(3) if group(2) not null, group(5) if group(4) not null, "" otherwise, trimmed 608 currVal.append((m.group(2)==null?(m.group(4)==null?"":m.group(5)):m.group(3)).trim()); 609 } else { 610 // concatted line or SEQ START/END line? 
611 if (line.startsWith(START_SEQUENCE_TAG) || line.startsWith(END_SEQUENCE_TAG)) currKey = line; 612 else { 613 currVal.append("\n"); // newline in between lines - can be removed later 614 currVal.append(currKey.charAt(0)=='/'?line.substring(21):line.substring(12)); 615 } 616 } 617 } 618 } 619 } catch (IOException e) { 620 String message = ParseException.newMessage(this.getClass(), accession, identifier, "", sectionToString(section)); 621 throw new ParseException(e, message); 622 } catch (RuntimeException e){ 623 String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad line", line); 624 throw new ParseException(e, message); 625 } 626 return section; 627 } 628 629 private final List buildBaseRanges(final String theBaseRangeList) throws ParseException { 630 if (theBaseRangeList == null) return null; 631 final List baseRangeList = new ArrayList(); 632 final String[] baseRange = theBaseRangeList.split(";"); 633 try{ 634 for (int r=0; r<baseRange.length; r++) { 635 final Matcher rangeMatch = refRange.matcher(baseRange[r]); 636 if (rangeMatch.matches()) { 637 final int rangeStart = Integer.parseInt(rangeMatch.group(1)); 638 final int rangeEnd = Integer.parseInt(rangeMatch.group(2)); 639 baseRangeList.add(new SimpleRichLocation(new SimplePosition(rangeStart), new SimplePosition(rangeEnd), r)); 640 } else { 641 String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad reference range found", theBaseRangeList); 642 throw new ParseException(message); 643 } 644 } 645 return baseRangeList; 646 }catch(RuntimeException e){ 647 String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad base range", theBaseRangeList); 648 throw new ParseException(e, message); 649 } 650 } 651 652 /** 653 * {@inheritDoc} 654 */ 655 public void writeSequence(Sequence seq, PrintStream os) throws IOException { 656 if (this.getPrintStream()==null) this.setPrintStream(os); 657 this.writeSequence(seq, 
RichObjectFactory.getDefaultNamespace()); 658 } 659 660 /** 661 * {@inheritDoc} 662 */ 663 public void writeSequence(Sequence seq, String format, PrintStream os) throws IOException { 664 if (this.getPrintStream()==null) this.setPrintStream(os); 665 if (!format.equals(this.getDefaultFormat())) throw new IllegalArgumentException("Unknown format: "+format); 666 this.writeSequence(seq, RichObjectFactory.getDefaultNamespace()); 667 } 668 669 /** 670 * {@inheritDoc} 671 * Namespace is ignored as Genbank has no concept of it. 672 */ 673 public void writeSequence(Sequence seq, Namespace ns) throws IOException { 674 RichSequence rs; 675 try { 676 if (seq instanceof RichSequence) rs = (RichSequence)seq; 677 else rs = RichSequence.Tools.enrich(seq); 678 } catch (ChangeVetoException e) { 679 IOException e2 = new IOException("Unable to enrich sequence"); 680 e2.initCause(e); 681 throw e2; 682 } 683 684 SymbolTokenization tok; 685 try { 686 tok = rs.getAlphabet().getTokenization("token"); 687 } catch (Exception e) { 688 throw new RuntimeException("Unable to get alphabet tokenizer",e); 689 } 690 Set<Note> notes = rs.getNoteSet(); 691 String accession = rs.getAccession(); 692 StringBuffer accessions = new StringBuffer(); 693 accessions.append(accession); 694 String stranded = ""; 695 String udat = ""; 696 String moltype = rs.getAlphabet().getName(); 697 if ("PROTEIN-TERM".equals(moltype) || "PROTEIN".equals(moltype)) moltype = null; //a genpept curiosity 698 StringBuffer keywords = new StringBuffer(); 699 for (Iterator<Note> i = notes.iterator(); i.hasNext(); ) { 700 Note n = i.next(); 701 if (n.getTerm().equals(Terms.getStrandedTerm())) { 702 String value = n.getValue(); 703 if(value != null && value.equals("single")) 704 stranded= "ss-"; 705 else if(value != null && value.equals("mixed")) 706 stranded= "ms-"; 707 } 708 else if (n.getTerm().equals(Terms.getDateUpdatedTerm())) udat=n.getValue(); 709 else if (n.getTerm().equals(Terms.getMolTypeTerm())) moltype=n.getValue(); 710 
else if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) { 711 accessions.append(" "); 712 accessions.append(n.getValue()); 713 } else if (n.getTerm().equals(Terms.getKeywordTerm())) { 714 if (n.getValue() != null) { 715 if (keywords.length()>0) keywords.append("; "); 716 keywords.append(n.getValue()); 717 } 718 } 719 } 720 721 //adjust molecule type during format conversion 722 if(moltype!=null && moltype.length()>6) { 723 if(moltype.indexOf("DNA")!=-1) moltype = "DNA"; 724 else if(moltype.indexOf("RNA")!=-1) moltype = "RNA"; 725 else moltype = "NA"; 726 } 727 728 // locus(name) + length + alpha + div + date line 729 StringBuffer locusLine = new StringBuffer(); 730 locusLine.append(StringTools.rightPad(rs.getName(),16));//13->28=15+1=16 731 locusLine.append(" ");//29 732 locusLine.append(StringTools.leftPad(""+rs.length(),11));//30->40=10+1=11 733 locusLine.append(" "+ (moltype==null? "aa":"bp") +" ");//41->44 734 locusLine.append(StringTools.leftPad(stranded,3));//45->47=2+1=3 735 locusLine.append(StringTools.rightPad(moltype==null?"":moltype,6));//48->53=5+1=6 736 locusLine.append(" ");//54->55 737 locusLine.append(StringTools.rightPad(rs.getCircular()?"circular":"linear",8));//56->63=7+1=8 738 locusLine.append(" ");//64->64 739 String div = rs.getDivision()==null?"":rs.getDivision(); 740 if(div.length()>3) div = ""; // Not a GenBank division, maybe UniProt, etc. 
741 locusLine.append(StringTools.rightPad(div,3));//65->67=2+1=3 742 locusLine.append(" ");//68->68 743 locusLine.append(StringTools.rightPad(udat,11));//69->79=10+1=11 744 StringTools.writeKeyValueLine(LOCUS_TAG, locusLine.toString(), 12, this.getLineWidth(), this.getPrintStream()); 745 746 // definition line 747 StringTools.writeKeyValueLine(DEFINITION_TAG, rs.getDescription(), 12, this.getLineWidth(), this.getPrintStream()); 748 749 // accession line 750 StringTools.writeKeyValueLine(ACCESSION_TAG, accessions.toString(), 12, this.getLineWidth(), this.getPrintStream()); 751 752 // version + gi line 753 String version = accession+"."+rs.getVersion(); 754 if (rs.getIdentifier()!=null) version = version + " GI:"+rs.getIdentifier(); 755 StringTools.writeKeyValueLine(VERSION_TAG, version, 12, this.getLineWidth(), this.getPrintStream()); 756 757 // keywords line 758 keywords.append("."); 759 StringTools.writeKeyValueLine(KEYWORDS_TAG, keywords.toString(), 12, this.getLineWidth()-1, this.getPrintStream()); 760 761 // source line (from taxon) 762 // organism line 763 NCBITaxon tax = rs.getTaxon(); 764 if (tax!=null) { 765 StringTools.writeKeyValueLine(SOURCE_TAG, (isMitochondrial(rs)?"mitochondrion ":"")+tax.getDisplayName(), 12, this.getLineWidth(), this.getPrintStream()); 766 StringTools.writeKeyValueLine(" "+ORGANISM_TAG, tax.getDisplayName().split("\\s+\\(")[0]+"\n"+tax.getNameHierarchy(), 12, this.getLineWidth()-1, this.getPrintStream()); 767 } 768 769 // references - rank (bases x to y) 770 for (Iterator<RankedDocRef> r = rs.getRankedDocRefs().iterator(); r.hasNext(); ) { 771 RankedDocRef rdr = r.next(); 772 DocRef d = rdr.getDocumentReference(); 773 StringTools.writeKeyValueLine(REFERENCE_TAG, rdr.getRank()+((rdr.getLocation()==null || rdr.getLocation() ==RichLocation.EMPTY_LOCATION)?"": (moltype==null? 
" (residues ":" (bases ")+makeBaseRange(rdr)+")"), 12, this.getLineWidth(), this.getPrintStream()); 774 // Any authors that were in the input as CONSRTM tags will 775 // be merged into the AUTHORS tag on output. 776 StringTools.writeKeyValueLine(" "+AUTHORS_TAG, d.getAuthors(), 12, this.getLineWidth()-1, this.getPrintStream()); 777 StringTools.writeKeyValueLine(" "+TITLE_TAG, d.getTitle(), 12, this.getLineWidth(), this.getPrintStream()); 778 StringTools.writeKeyValueLine(" "+JOURNAL_TAG, d.getLocation(), 12, this.getLineWidth(), this.getPrintStream()); 779 CrossRef c = d.getCrossref(); 780 if (c!=null) StringTools.writeKeyValueLine(StringTools.leftPad(c.getDbname(),9), c.getAccession(), 12, this.getLineWidth(), this.getPrintStream()); 781 StringTools.writeKeyValueLine(" "+REMARK_TAG, d.getRemark(), 12, this.getLineWidth(), this.getPrintStream()); 782 } 783 784 // comments - if any 785 Set<Comment> comments = rs.getComments(); 786 if (!comments.isEmpty()) { 787 StringBuffer sb = new StringBuffer(); 788 for (Iterator<Comment> i = comments.iterator(); i.hasNext(); ) { 789 Comment c = i.next(); 790 sb.append(c.getComment()); 791 if (i.hasNext()) sb.append("\n"); 792 } 793 StringTools.writeKeyValueLine(COMMENT_TAG, sb.toString(), 12, this.getLineWidth(), this.getPrintStream()); 794 } 795 796 this.getPrintStream().println(FEATURE_TAG+" Location/Qualifiers"); 797 // feature_type location 798 for (Iterator i = rs.getFeatureSet().iterator(); i.hasNext(); ) { 799 RichFeature f = (RichFeature)i.next(); 800 StringTools.writeKeyValueLine(" "+f.getTypeTerm().getName(), GenbankLocationParser.writeLocation((RichLocation)f.getLocation()), 21, this.getLineWidth()-1, ",", this.getPrintStream()); 801 for (Iterator<Note> j = f.getNoteSet().iterator(); j.hasNext(); ) { 802 Note n = j.next(); 803 // /key="val" or just /key if val=="" 804 if (n.getValue()==null || n.getValue().length()==0) StringTools.writeKeyValueLine("", "/"+n.getTerm().getName(), 21, this.getLineWidth(), 
this.getPrintStream()); 805 else if (isNotQuoted(n)) {// doesn't have the value enclosed in quotes 806 StringTools.writeKeyValueLine("", "/"+n.getTerm().getName()+"="+n.getValue(), 21, this.getLineWidth(), this.getPrintStream()); 807 } else if (n.getTerm().getName().equals("translation")) { 808 StringTools.writeKeyValueLine("", "/"+n.getTerm().getName()+"=\""+n.getValue()+"\"", 21, this.getLineWidth()-1, this.getPrintStream()); 809 } else { 810 StringTools.writeKeyValueLine("", "/"+n.getTerm().getName()+"=\""+n.getValue()+"\"", 21, this.getLineWidth(), this.getPrintStream()); 811 } 812 } 813 // add-in to source feature only organism and db_xref="taxon:xyz" where present 814 if (f.getType().equals("source") && tax!=null) { 815 String displayName = tax.getDisplayName(); 816 if (displayName.indexOf('(')>-1) displayName = displayName.substring(0, displayName.indexOf('(')).trim(); 817 StringTools.writeKeyValueLine("", "/organism=\""+displayName+"\"", 21, this.getLineWidth()-1, this.getPrintStream());// AF252370 fits in exactly 80 - but is wrapped 818 for (Iterator<RankedCrossRef> j = f.getRankedCrossRefs().iterator(); j.hasNext(); ) { 819 RankedCrossRef rcr = j.next(); 820 CrossRef cr = rcr.getCrossRef(); 821 StringTools.writeKeyValueLine("", "/db_xref=\""+cr.getDbname()+":"+cr.getAccession()+"\"", 21, this.getLineWidth(), this.getPrintStream()); 822 } 823 StringTools.writeKeyValueLine("", "/db_xref=\"taxon:"+tax.getNCBITaxID()+"\"", 21, this.getLineWidth(), this.getPrintStream()); 824 } else { 825 // add-in other dbxrefs where present 826 for (Iterator<RankedCrossRef> j = f.getRankedCrossRefs().iterator(); j.hasNext(); ) { 827 RankedCrossRef rcr = j.next(); 828 CrossRef cr = rcr.getCrossRef(); 829 StringTools.writeKeyValueLine("", "/db_xref=\""+cr.getDbname()+":"+cr.getAccession()+"\"", 21, this.getLineWidth(), this.getPrintStream()); 830 } 831 } 832 } 833 834 //BASE COUNT obsolete in Genbank flatfile format since October 2003 835 //if 
(rs.getAlphabet()==AlphabetManager.alphabetForName("DNA")) { 836 // // BASE COUNT 1510 a 1074 c 835 g 1609 t 837 // int aCount = 0; 838 // int cCount = 0; 839 // int gCount = 0; 840 // int tCount = 0; 841 // int oCount = 0; 842 // for (int i = 1; i <= rs.length(); i++) { 843 // char c; 844 // try { 845 // c = tok.tokenizeSymbol(rs.symbolAt(i)).charAt(0); 846 // } catch (Exception e) { 847 // throw new RuntimeException("Unable to get symbol at position "+i,e); 848 // } 849 // switch (c) { 850 // case 'a': case 'A': 851 // aCount++; 852 // break; 853 // case 'c': case 'C': 854 // cCount++; 855 // break; 856 // case 'g': case 'G': 857 // gCount++; 858 // break; 859 // case 't': case 'T': 860 // tCount++; 861 // break; 862 // default: 863 // oCount++; 864 // } 865 // } 866 // 867 // this.getPrintStream().print(BASE_COUNT_TAG_FULL+" "); 868 // this.getPrintStream().print(aCount + " a "); 869 // this.getPrintStream().print(cCount + " c "); 870 // this.getPrintStream().print(gCount + " g "); 871 // this.getPrintStream().print(tCount + " t "); 872 // this.getPrintStream().println(oCount + " others"); 873 //} 874 875 this.getPrintStream().println(START_SEQUENCE_TAG); 876 // sequence stuff 877 Symbol[] syms = (Symbol[])rs.toList().toArray(new Symbol[0]); 878 int lines = 0; 879 int symCount = 0; 880 for (int i = 0; i < syms.length; i++) { 881 if (symCount % 60 == 0) { 882 if (lines > 0) this.getPrintStream().print("\n"); // newline from previous line 883 int lineNum = (lines*60) + 1; 884 this.getPrintStream().print(StringTools.leftPad(""+lineNum,9)); 885 lines++; 886 } 887 if (symCount % 10 == 0) this.getPrintStream().print(" "); 888 try { 889 this.getPrintStream().print(tok.tokenizeSymbol(syms[i])); 890 } catch (IllegalSymbolException e) { 891 throw new RuntimeException("Found illegal symbol: "+syms[i]); 892 } 893 symCount++; 894 } 895 if(syms.length>0) //do not create an empty line 896 this.getPrintStream().print("\n"); 897 this.getPrintStream().println(END_SEQUENCE_TAG); 
} // closes the preceding method, whose body lies above this span

    /**
     * {@inheritDoc}
     */
    public String getDefaultFormat() {
        return GENBANK_FORMAT;
    }

    /**
     * Reports whether the sequence is flagged as mitochondrial.
     * <p>
     * Features of type "source" are scanned in iteration order; the first
     * note whose term is named "organelle" decides the answer.
     *
     * @param theSequence the sequence whose feature set is inspected
     * @return {@code true} if that first "organelle" note has the value
     *         "mitochondrion"; {@code false} if its value is anything else
     *         (including {@code null}) or if no such note exists
     */
    private static boolean isMitochondrial(final RichSequence theSequence) {
        for (final Object element : theSequence.getFeatureSet()) {
            final RichFeature feature = (RichFeature) element;
            // Literal-first comparisons so a missing type or value cannot throw.
            if (!"source".equals(feature.getType())) {
                continue;
            }
            for (final Iterator<Note> n = feature.getNoteSet().iterator(); n.hasNext(); ) {
                final Note note = n.next();
                if ("organelle".equals(note.getTerm().getName())) {
                    // A valueless organelle note used to raise a NullPointerException here.
                    return "mitochondrion".equals(note.getValue());
                }
            }
        }
        return false;
    }

    /**
     * Convenience overload: applies {@link #isNotQuoted(String, String)} to
     * the note's term name and value.
     *
     * @param theNote the feature note to test
     * @return the result of the two-argument overload
     */
    private static boolean isNotQuoted(final Note theNote) {
        return isNotQuoted(theNote.getTerm().getName(), theNote.getValue());
    }

    /**
     * Tests whether a qualifier name is a member of the {@code isNotQuoted}
     * collection of this class.
     *
     * @param theName the qualifier (term) name to look up
     * @param theValue the qualifier value; not consulted by this implementation
     * @return {@code true} if {@code theName} is contained in {@code isNotQuoted}
     */
    private static boolean isNotQuoted(final String theName, final String theValue) {
        return isNotQuoted.contains(theName);
    }

    /**
     * Renders the range covered by a ranked document reference.
     *
     * @param theReference the reference to describe
     * @return {@code "<start> to <end>"} built from the reference's start and
     *         end when it has no location, otherwise the block listing
     *         produced by {@link #toString(RichLocation)}
     */
    private static String makeBaseRange(final RankedDocRef theReference) {
        if (theReference.getLocation() == null) {
            return theReference.getStart() + " to " + theReference.getEnd();
        }
        return toString(theReference.getLocation());
    }

    /**
     * Lists the blocks of a location as {@code "<min> to <max>"} entries
     * separated by {@code "; "}.
     *
     * @param theLocation the location whose block iterator is walked
     * @return the formatted listing; empty if the iterator yields no blocks
     */
    private static String toString(final RichLocation theLocation) {
        // The builder never leaves this method, so no synchronisation is needed.
        final StringBuilder list = new StringBuilder();
        final Iterator b = theLocation.blockIterator();
        while (b.hasNext()) {
            final RichLocation block = (RichLocation) b.next();
            list.append(block.getMin()).append(" to ").append(block.getMax());
            if (b.hasNext()) {
                list.append("; ");
            }
        }
        return list.toString();
    }

    /**
     * Converts the current parse section to a String. Useful for debugging.
     * <p>
     * Every element of the list is cast to {@code String[]}; the array
     * elements are concatenated in order, with a single space inserted after
     * the first element of each array.
     *
     * @param section list whose elements are {@code String[]} parts
     * @return the concatenated text of all parts
     */
    String sectionToString(List section) {
        StringBuilder parseBlock = new StringBuilder();
        for (Object entry : section) {
            String[] part = (String[]) entry;
            for (int x = 0; x < part.length; x++) {
                parseBlock.append(part[x]);
                if (x == 0) {
                    parseBlock.append(" "); // the gap will have been trimmed
                }
            }
        }
        return parseBlock.toString();
    }
}