001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojavax.bio.seq.io; 023 024import java.io.BufferedInputStream; 025import java.io.BufferedReader; 026import java.io.File; 027import java.io.FileReader; 028import java.io.IOException; 029import java.io.InputStreamReader; 030import java.io.PrintStream; 031import java.util.ArrayList; 032import java.util.HashSet; 033import java.util.Iterator; 034import java.util.List; 035import java.util.Set; 036import java.util.TreeSet; 037import java.util.regex.Matcher; 038import java.util.regex.Pattern; 039 040import org.biojava.bio.seq.Sequence; 041import org.biojava.bio.seq.io.ParseException; 042import org.biojava.bio.seq.io.SeqIOListener; 043import org.biojava.bio.seq.io.SymbolTokenization; 044import org.biojava.bio.symbol.IllegalAlphabetException; 045import org.biojava.bio.symbol.IllegalSymbolException; 046import org.biojava.bio.symbol.SimpleSymbolList; 047import org.biojava.bio.symbol.Symbol; 048import org.biojava.bio.symbol.SymbolList; 049import org.biojava.utils.ChangeVetoException; 050import org.biojavax.Comment; 051import org.biojavax.CrossRef; 052import org.biojavax.DocRef; 053import org.biojavax.DocRefAuthor; 054import org.biojavax.Namespace; 055import org.biojavax.Note; 056import org.biojavax.RankedCrossRef; 057import org.biojavax.RankedDocRef; 058import org.biojavax.RichObjectFactory; 059import org.biojavax.SimpleComment; 060import 
org.biojavax.SimpleCrossRef; 061import org.biojavax.SimpleDocRef; 062import org.biojavax.SimpleRankedCrossRef; 063import org.biojavax.SimpleRankedDocRef; 064import org.biojavax.SimpleRichAnnotation; 065import org.biojavax.bio.seq.CompoundRichLocation; 066import org.biojavax.bio.seq.RichFeature; 067import org.biojavax.bio.seq.RichLocation; 068import org.biojavax.bio.seq.RichSequence; 069import org.biojavax.bio.seq.SimplePosition; 070import org.biojavax.bio.seq.SimpleRichLocation; 071import org.biojavax.bio.taxa.NCBITaxon; 072import org.biojavax.bio.taxa.SimpleNCBITaxon; 073import org.biojavax.ontology.ComparableTerm; 074import org.biojavax.utils.StringTools; 075 076/** 077 * Format reader for GenBank files. This version of Genbank format will generate 078 * and write RichSequence objects. Loosely Based on code from the old, deprecated, 079 * org.biojava.bio.seq.io.GenbankFormat object. 080 * 081 * @author Richard Holland 082 * @author Mark Schreiber 083 * @author David Scott 084 * @author Bubba Puryear 085 * @author George Waldon 086 * @author Deepak Sheoran 087 * @since 1.5 088 */ 089public class GenbankFormat extends RichSequenceFormat.HeaderlessFormat { 090 091 // Register this format with the format auto-guesser. 
// Register this format with the format auto-guesser.
static {
    RichSequence.IOTools.registerFormat(GenbankFormat.class);
}

/**
 * The name of this format
 */
public static final String GENBANK_FORMAT = "GENBANK";

// Keywords that open the sections (and REFERENCE sub-sections) of a GenBank record.
protected static final String LOCUS_TAG = "LOCUS";
protected static final String DEFINITION_TAG = "DEFINITION";
protected static final String ACCESSION_TAG = "ACCESSION";
protected static final String VERSION_TAG = "VERSION";
protected static final String KEYWORDS_TAG = "KEYWORDS";
//                                                  "SEGMENT"
protected static final String SOURCE_TAG = "SOURCE";
protected static final String ORGANISM_TAG = "ORGANISM";
protected static final String REFERENCE_TAG = "REFERENCE";
protected static final String AUTHORS_TAG = "AUTHORS";
protected static final String CONSORTIUM_TAG = "CONSRTM";
protected static final String TITLE_TAG = "TITLE";
protected static final String JOURNAL_TAG = "JOURNAL";
protected static final String PUBMED_TAG = "PUBMED";
protected static final String MEDLINE_TAG = "MEDLINE"; //deprecated
protected static final String REMARK_TAG = "REMARK";
protected static final String COMMENT_TAG = "COMMENT";
protected static final String FEATURE_TAG = "FEATURES";
protected static final String BASE_COUNT_TAG_FULL = "BASE COUNT"; //deprecated
protected static final String BASE_COUNT_TAG = "BASE";
//                                                  "CONTIG"
protected static final String START_SEQUENCE_TAG = "ORIGIN";
protected static final String END_SEQUENCE_TAG = "//";

// locus line
// groups: 1 = name, 2 = "bp" or "aa", 3 = optional strandedness prefix (ds-/ms-/ss-),
// 4 = optional molecule type, 5 = optional circular/linear, 6 and 7 = the optional
// trailing tokens (the reader treats them as division and date, or as date alone
// when only one of them is present)
protected static final Pattern lp = Pattern.compile("^(\\S+)\\s+\\d+\\s+(bp|aa)\\s{1,4}([dms]s-)?(\\S+)?\\s+(circular|linear)?\\s*(\\S+)?\\s*(\\S+)?$");
// version line
// groups: 1 = accession, 3 = optional numeric version, 5 = optional GI identifier
protected static final Pattern vp = Pattern.compile("^(\\S*?)(\\.(\\d+))?(\\s+GI:(\\S+))?$");
// reference line
// refRange matches a single "<start> to <end>" range (groups 1 and 2);
// refp matches the whole line: group 1 = rank, group 3 = the optional list of
// ';'-separated ranges found inside "(bases ...)" or "(residues ...)"
protected static final Pattern refRange = Pattern.compile("^\\s*(\\d+)\\s+to\\s+(\\d+)$");
protected static final Pattern refp = Pattern.compile("^(\\d+)\\s*(?:(\\((?:bases|residues)\\s+(\\d+\\s+to\\s+\\d+(\\s*;\\s*\\d+\\s+to\\s+\\d+)*)\\))|\\(sites\\))?");
// dbxref line
// groups: 1 = database name (everything before the first ':'), 2 = accession
protected static final Pattern dbxp = Pattern.compile("^([^:]+):(\\S+)$");
//sections start at a line and continue till the first line afterwards with a
//non-whitespace first character
//we want to match any of the following as a new section within a section
//  \s{0,8} word \s{0,7} value
//  \s{21} /word = value
//  \s{21} /word
// groups: 2/3 = key and value of the first form, 4/5 = key and value of the second
// form, 6 = key of the third form
protected static final Pattern sectp = Pattern.compile("^(\\s{0,8}(\\S+)\\s{0,7}(.*)|\\s{21}(/\\S+?)=(.*)|\\s{21}(/\\S+))$");

// File names accepted without looking at the content: names that end in "gb" or "gp"
// followed by any number of 'k' characters, or names that contain ".gb" or ".gp".
protected static final Pattern readableFiles = Pattern.compile(".*(g[bp]k*$|\\u002eg[bp].*)");
// A first line that starts with the word LOCUS.
protected static final Pattern headerLine = Pattern.compile("^LOCUS.*");

// Names of the feature qualifiers that are registered here as "not quoted".
// Typed as a set of Strings so that lookups and insertions are checked by the compiler.
private static final HashSet<String> isNotQuoted = new HashSet<String>();
static {
    isNotQuoted.add("anticodon");
    isNotQuoted.add("citation");
    isNotQuoted.add("codon");
    isNotQuoted.add("codon_start");
    isNotQuoted.add("compare");
    isNotQuoted.add("cons_splice");
    isNotQuoted.add("direction");
    isNotQuoted.add("estimated_length");
    isNotQuoted.add("label");
    isNotQuoted.add("mod_base");
    isNotQuoted.add("number");
    isNotQuoted.add("rpt_type");
    isNotQuoted.add("rpt_unit_range");
    isNotQuoted.add("transl_except");
    isNotQuoted.add("transl_table");
}
166 */ 167 public static class Terms extends RichSequence.Terms { 168 /** 169 * Getter for the Genbank term 170 * @return The genbank Term 171 */ 172 public static ComparableTerm getGenBankTerm() { 173 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("GenBank"); 174 } 175 } 176 177 /** 178 * {@inheritDoc} 179 * A file is in GenBank format if the name ends with gbk, contains the letters egb, or the first line of 180 * the file starts with the word LOCUS 181 */ 182 public boolean canRead(File file) throws IOException { 183 if (readableFiles.matcher(file.getName()).matches()) return true; 184 BufferedReader br = new BufferedReader(new FileReader(file)); 185 final String firstLine = br.readLine(); 186 boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches(); 187 br.close(); 188 return readable; 189 } 190 191 /** 192 * {@inheritDoc} 193 * Returns an dna parser if the letters DNA or RNA appear in the first line of the file. 194 * Otherwise returns a DNA tokenizer. 195 */ 196 public SymbolTokenization guessSymbolTokenization(File file) throws IOException { 197 BufferedReader br = new BufferedReader(new FileReader(file)); 198 String firstLine = br.readLine(); 199 boolean dna = (firstLine.indexOf("DNA") >0 || firstLine.indexOf("RNA") > 0); 200 br.close(); 201 if (dna) return RichSequence.IOTools.getDNAParser(); 202 else return RichSequence.IOTools.getProteinParser(); 203 } 204 205 /** 206 * {@inheritDoc} 207 * A stream is in GenBank format if the first line of the stream starts with the word LOCUS 208 */ 209 public boolean canRead(BufferedInputStream stream) throws IOException { 210 stream.mark(2000); // some streams may not support this 211 BufferedReader br = new BufferedReader(new InputStreamReader(stream)); 212 final String firstLine = br.readLine(); 213 boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches(); 214 // don't close the reader as it'll close the stream too. 
215 // br.close(); 216 stream.reset(); 217 return readable; 218 } 219 220 /** 221 * {@inheritDoc} 222 * Returns an dna parser if the letters DNA or RNA appear in the first line of the stream. 223 * Otherwise returns a DNA tokenizer. 224 */ 225 public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException { 226 stream.mark(2000); // some streams may not support this 227 BufferedReader br = new BufferedReader(new InputStreamReader(stream)); 228 String firstLine = br.readLine(); 229 boolean dna = (firstLine.indexOf("DNA") >0 || firstLine.indexOf("RNA") > 0); 230 // don't close the reader as it'll close the stream too. 231 // br.close(); 232 stream.reset(); 233 if (dna) return RichSequence.IOTools.getDNAParser(); 234 else return RichSequence.IOTools.getProteinParser(); 235 } 236 237 /** 238 * {@inheritDoc} 239 */ 240 public boolean readSequence(BufferedReader reader, 241 SymbolTokenization symParser, 242 SeqIOListener listener) 243 throws IllegalSymbolException, IOException, ParseException { 244 if (!(listener instanceof RichSeqIOListener)) throw new IllegalArgumentException("Only accepting RichSeqIOListeners today"); 245 return this.readRichSequence(reader,symParser,(RichSeqIOListener)listener,null); 246 } 247 248 private String sectionKey = null; 249 private NCBITaxon tax = null; 250 private String organism = null; 251 private String accession = null; 252 private String identifier = null; 253 /** 254 * {@inheritDoc} 255 */ 256 public boolean readRichSequence(BufferedReader reader, 257 SymbolTokenization symParser, 258 RichSeqIOListener rlistener, 259 Namespace ns) 260 throws IllegalSymbolException, IOException, ParseException { 261 262 sectionKey = null; 263 tax = null; 264 organism = null; 265 accession = null; 266 identifier = null; 267 boolean hasAnotherSequence = true; 268 //boolean hasInternalWhitespace = false; 269 270 rlistener.startSequence(); 271 272 if (ns==null) ns=RichObjectFactory.getDefaultNamespace(); 273 
rlistener.setNamespace(ns); 274 275 // Get an ordered list of key->value pairs in array-tuples 276 List section = null; 277 try{ 278 do { 279 section = this.readSection(reader); 280 sectionKey = ((String[])section.get(0))[0]; 281 if(sectionKey == null){ 282 String message = ParseException.newMessage(this.getClass(), accession, identifier, "Section key was null", sectionToString(section)); 283 throw new ParseException(message); 284 } 285 // process section-by-section 286 if (sectionKey.equals(LOCUS_TAG)) { 287 String loc = ((String[])section.get(0))[1]; 288 Matcher m = lp.matcher(loc); 289 if (m.matches()) { 290 rlistener.setName(m.group(1)); 291 accession = m.group(1); // default if no accession found 292 rlistener.setAccession(accession); 293 if (m.group(4)!=null) 294 rlistener.addSequenceProperty(Terms.getMolTypeTerm(),m.group(4)); 295 // Optional extras 296 String stranded = m.group(3); 297 if(stranded!=null && stranded.equals("ss-")) 298 stranded = "single"; 299 else if(stranded!=null && stranded.equals("ms-")) 300 stranded = "mixed"; 301 else if(stranded!=null && stranded.equals("ds-")) 302 stranded = "double"; 303 String circular = m.group(5); 304 String fifth = m.group(6); 305 String sixth = m.group(7); 306 if (stranded!=null) rlistener.addSequenceProperty(Terms.getStrandedTerm(),stranded); 307 if (circular!=null && circular.equalsIgnoreCase("circular")) rlistener.setCircular(true); 308 if (sixth != null) { 309 rlistener.setDivision(fifth); 310 rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(),sixth); 311 } else if (fifth!=null) { 312 rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(),fifth); 313 } 314 } else { 315 String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad locus line", sectionToString(section)); 316 throw new ParseException(message); 317 } 318 } else if (sectionKey.equals(DEFINITION_TAG)) { 319 rlistener.setDescription(((String[])section.get(0))[1]); 320 } else if (sectionKey.equals(ACCESSION_TAG)) { 
321 // if multiple accessions, store only first as accession, 322 // and store rest in annotation 323 String[] accs = ((String[])section.get(0))[1].split("\\s+"); 324 accession = accs[0].trim(); 325 rlistener.setAccession(accession); 326 for (int i = 1; i < accs.length; i++) { 327 rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accs[i].trim()); 328 } 329 } else if (sectionKey.equals(VERSION_TAG)) { 330 String ver = ((String[])section.get(0))[1]; 331 Matcher m = vp.matcher(ver); 332 if (m.matches()) { 333 String verAcc = m.group(1); 334 if (!accession.equals(verAcc)) { 335 // the version refers to a different accession! 336 // believe the version line, and store the original 337 // accession away in the additional accession set 338 rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accession); 339 accession = verAcc; 340 rlistener.setAccession(accession); 341 } 342 if (m.group(3)!=null) rlistener.setVersion(Integer.parseInt(m.group(3))); 343 if (m.group(5)!=null) { 344 identifier = m.group(5); 345 rlistener.setIdentifier(identifier); 346 } 347 } else { 348 String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad version line", sectionToString(section)); 349 throw new ParseException(message); 350 } 351 } else if (sectionKey.equals(KEYWORDS_TAG)) { 352 String val = ((String[])section.get(0))[1]; 353 if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot 354 val = val.replace('\n',' '); //remove newline 355 String[] kws = val.split(";"); 356 357 for (int i = 0; i < kws.length; i++) { 358 String kw = kws[i].trim(); 359 if (kw.length()==0) continue; 360 rlistener.addSequenceProperty(Terms.getKeywordTerm(), kw); 361 } 362 } else if (sectionKey.equals(SOURCE_TAG)) { 363 // ignore - can get all this from the first feature 364 } else if (sectionKey.equals(REFERENCE_TAG) && !this.getElideReferences()) { 365 // first line of section has rank and location 366 int ref_rank; 367 List 
baseRangeList=null; 368 String ref = ((String[])section.get(0))[1]; 369 Matcher m = refp.matcher(ref); 370 if (m.matches()) { 371 ref_rank = Integer.parseInt(m.group(1)); 372 if (m.group(3) != null) baseRangeList=buildBaseRanges(m.group(3)); 373 } else { 374 String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad reference line", sectionToString(section)); 375 throw new ParseException(message); 376 } 377 // rest can be in any order 378 String authors = null; 379 String consortium = null; 380 String title = null; 381 String journal = null; 382 String medline = null; 383 String pubmed = null; 384 String remark = null; 385 for (int i = 1; i < section.size(); i++) { 386 String key = ((String[])section.get(i))[0]; 387 String val = ((String[])section.get(i))[1]; 388 if (key.equals(AUTHORS_TAG)) authors = val.replace('\n',' '); //see #2276 389 else if (key.equals(CONSORTIUM_TAG)) consortium = val.replace('\n',' '); //see #2276 390 else if (key.equals(TITLE_TAG)) title = val.replace('\n',' '); //see #2276 391 else if (key.equals(JOURNAL_TAG)) journal = val.replace('\n',' '); //see #2276 392 else if (key.equals(MEDLINE_TAG)) medline = val; 393 else if (key.equals(PUBMED_TAG)) pubmed = val; 394 else if (key.equals(REMARK_TAG)) remark = val.replace('\n',' '); //see #2276 395 } 396 397 // create the docref object 398 try { 399 // Use consortium as well if present. 400 if (authors==null) authors = consortium + " (consortium)"; 401 else if (consortium!=null) authors = authors + ", " + consortium + " (consortium)"; 402 // Create docref. 
403 DocRef dr = null; 404 // assign either the pubmed or medline to the docref - medline gets priority 405 if (medline != null) { 406 dr = (DocRef) RichObjectFactory.getObject(SimpleDocRef.class, new Object[]{DocRefAuthor.Tools.parseAuthorString(authors), journal, title, Terms.MEDLINE_KEY, medline, new Integer(0)}); 407 if (dr.getCrossref() == null) { 408 dr.setCrossref((CrossRef) RichObjectFactory.getObject(SimpleCrossRef.class, new Object[]{Terms.MEDLINE_KEY, medline, new Integer(0)})); 409 } 410 } else if (pubmed != null) { 411 dr = (DocRef) RichObjectFactory.getObject(SimpleDocRef.class, new Object[]{DocRefAuthor.Tools.parseAuthorString(authors), journal, title, Terms.PUBMED_KEY, pubmed, new Integer(0)}); 412 if (dr.getCrossref() == null) { 413 dr.setCrossref((CrossRef) RichObjectFactory.getObject(SimpleCrossRef.class, new Object[]{Terms.PUBMED_KEY, pubmed, new Integer(0)})); 414 } 415 } else { 416 dr = (DocRef) RichObjectFactory.getObject(SimpleDocRef.class, new Object[]{DocRefAuthor.Tools.parseAuthorString(authors), journal, title}); 417 } 418 // assign the remarks 419 if (!this.getElideComments()) dr.setRemark(remark); 420 // assign the docref to the bioentry: null if no base ranges, Integers if 1 base range - the normal case, joined RichLocation if more than 1 421 RankedDocRef rdr = baseRangeList == null?new SimpleRankedDocRef(dr, null, null, ref_rank):(baseRangeList.size()==1?new SimpleRankedDocRef(dr, new Integer(((RichLocation)baseRangeList.get(0)).getMin()), new Integer(((RichLocation)baseRangeList.get(0)).getMax()), ref_rank):new SimpleRankedDocRef(dr, new CompoundRichLocation(baseRangeList), ref_rank)); 422 rlistener.setRankedDocRef(rdr); 423 } catch (ChangeVetoException e) { 424 throw new ParseException(e+", accession:"+accession); 425 } 426 } else if (sectionKey.equals(COMMENT_TAG) && !this.getElideComments()) { 427 // Set up some comments 428 rlistener.setComment(((String[])section.get(0))[1]); 429 } else if (sectionKey.equals(FEATURE_TAG) && 
!this.getElideFeatures()) { 430 // starting from second line of input, start a new feature whenever we come across 431 // a key that does not start with / 432 boolean seenAFeature = false; 433 int rcrossrefCount = 0; 434 boolean skippingBond = false; 435 for (int i = 1 ; i < section.size(); i++) { 436 String key = ((String[])section.get(i))[0]; 437 String val = ((String[])section.get(i))[1]; 438 if (key.startsWith("/")) { 439 if(!skippingBond) 440 { 441 key = key.substring(1); // strip leading slash 442 val = val.replaceAll("\\s*[\\n\\r]+\\s*"," ").trim(); 443 if (val.endsWith("\"")) val = val.substring(1,val.length()-1); // strip quotes 444 // parameter on old feature 445 if (key.equals("db_xref")) { 446 Matcher m = dbxp.matcher(val); 447 if (m.matches()) { 448 String dbname = m.group(1); 449 String raccession = m.group(2); 450 if (dbname.equalsIgnoreCase("taxon")) { 451 // Set the Taxon instead of a dbxref 452 tax = (NCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class, new Object[]{Integer.valueOf(raccession)}); 453 rlistener.setTaxon(tax); 454 try { 455 if (organism!=null) tax.addName(NCBITaxon.SCIENTIFIC,organism.replace('\n', ' '));// readSection can embed new lines 456 } catch (ChangeVetoException e) { 457 throw new ParseException(e+", accession:"+accession); 458 } 459 } else { 460 try { 461 CrossRef cr = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{dbname, raccession, new Integer(0)}); 462 RankedCrossRef rcr = new SimpleRankedCrossRef(cr, ++rcrossrefCount); 463 rlistener.getCurrentFeature().addRankedCrossRef(rcr); 464 } catch (ChangeVetoException e) { 465 throw new ParseException(e+", accession:"+accession); 466 } 467 } 468 } else { 469 String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad dbxref", sectionToString(section)); 470 throw new ParseException(message); 471 } 472 } else if (key.equalsIgnoreCase("organism")) { 473 try { 474 organism = val; 475 if (tax!=null) 
tax.addName(NCBITaxon.SCIENTIFIC,organism.replace('\n', ' '));// readSection can embed new lines 476 } catch (ChangeVetoException e) { 477 throw new ParseException(e+", accession:"+accession); 478 } 479 } else { 480 if (key.equalsIgnoreCase("translation")) { 481 // strip spaces from sequence 482 val = val.replaceAll("\\s+",""); 483 } 484 rlistener.addFeatureProperty(RichObjectFactory.getDefaultOntology().getOrCreateTerm(key),val); 485 } 486 } 487 } else { 488 // new feature! 489 // end previous feature 490 if(key.equalsIgnoreCase("bond")) 491 { 492 skippingBond = true; 493 } 494 else 495 { 496 skippingBond = false; 497 if (seenAFeature) { 498 rlistener.endFeature(); 499 } 500 // start next one, with lots of lovely info in it 501 RichFeature.Template templ = new RichFeature.Template(); 502 templ.annotation = new SimpleRichAnnotation(); 503 templ.sourceTerm = Terms.getGenBankTerm(); 504 templ.typeTerm = RichObjectFactory.getDefaultOntology().getOrCreateTerm(key); 505 templ.featureRelationshipSet = new TreeSet(); 506 templ.rankedCrossRefs = new TreeSet(); 507 String tidyLocStr = val.replaceAll("\\s+",""); 508 templ.location = GenbankLocationParser.parseLocation(ns, accession, tidyLocStr); 509 rlistener.startFeature(templ); 510 seenAFeature = true; 511 rcrossrefCount = 0; 512 } 513 514 } 515 } 516 517 if (seenAFeature) { 518 rlistener.endFeature(); 519 } 520 } else if (sectionKey.equals(BASE_COUNT_TAG)) { 521 // ignore - can calculate from sequence content later if needed 522 } else if (sectionKey.equals(START_SEQUENCE_TAG) && !this.getElideSymbols()) { 523 // our first line is ignorable as it is the ORIGIN tag 524 // the second line onwards conveniently have the number as 525 // the [0] tuple, and sequence string as [1] so all we have 526 // to do is concat the [1] parts and then strip out spaces, 527 // and replace '.' and '~' with '-' for our parser. 
528 StringBuffer seq = new StringBuffer(); 529 for (int i = 1 ; i < section.size(); i++) seq.append(((String[])section.get(i))[1]); 530 try { 531 SymbolList sl = new SimpleSymbolList(symParser, 532 seq.toString().replaceAll("\\s+","").replaceAll("[\\.|~]","-")); 533 rlistener.addSymbols(symParser.getAlphabet(), 534 (Symbol[])(sl.toList().toArray(new Symbol[0])), 535 0, sl.length()); 536 } catch (IllegalAlphabetException e) { 537 String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad sequence section", sectionToString(section)); 538 throw new ParseException(e, message); 539 } 540 } 541 } while (!sectionKey.equals(END_SEQUENCE_TAG)); 542 }catch(RuntimeException e){ 543 String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad sequence section", sectionToString(section)); 544 throw new ParseException(e, message); 545 } 546 547 // Allows us to tolerate trailing whitespace without 548 // thinking that there is another Sequence to follow 549 while (true) { 550 reader.mark(1); 551 int c = reader.read(); 552 if (c == -1) { 553 hasAnotherSequence = false; 554 break; 555 } 556 if (Character.isWhitespace((char) c)) { 557 //hasInternalWhitespace = true; 558 continue; 559 } 560 //if (hasInternalWhitespace) 561 // System.err.println("Warning: whitespace found between sequence entries"); 562 reader.reset(); 563 break; 564 } 565 566 // Finish up. 567 rlistener.endSequence(); 568 return hasAnotherSequence; 569 } 570 571 // reads an indented section, combining split lines and creating a list of key->value tuples 572 private List readSection(BufferedReader br) throws ParseException { 573 List section = new ArrayList(); 574 String line = ""; 575 String currKey = null; 576 StringBuffer currVal = new StringBuffer(); 577 boolean done = false; 578 int linecount = 0; 579 580 try { 581 while (!done) { 582 br.mark(320); 583 line = br.readLine(); 584 String firstSecKey = section.isEmpty() ? 
"" : ((String[])section.get(0))[0]; 585 if (line != null && line.matches("\\p{Space}*")) { 586 // regular expression \p{Space}* will match line 587 // having only white space characters 588 continue; 589 } 590 if (line==null || (!line.startsWith(" ") && linecount++>0 && ( !firstSecKey.equals(START_SEQUENCE_TAG) || line.startsWith(END_SEQUENCE_TAG)))) { 591 // dump out last part of section 592 section.add(new String[]{currKey,currVal.toString()}); 593 br.reset(); 594 done = true; 595 } else { 596 Matcher m = sectp.matcher(line); 597 if (m.matches()) { 598 // new key 599 if (currKey!=null) section.add(new String[]{currKey,currVal.toString()}); 600 // key = group(2) or group(4) or group(6) - whichever is not null 601 currKey = m.group(2)==null?(m.group(4)==null?m.group(6):m.group(4)):m.group(2); 602 currVal = new StringBuffer(); 603 // val = group(3) if group(2) not null, group(5) if group(4) not null, "" otherwise, trimmed 604 currVal.append((m.group(2)==null?(m.group(4)==null?"":m.group(5)):m.group(3)).trim()); 605 } else { 606 // concatted line or SEQ START/END line? 
607 if (line.startsWith(START_SEQUENCE_TAG) || line.startsWith(END_SEQUENCE_TAG)) currKey = line; 608 else { 609 currVal.append("\n"); // newline in between lines - can be removed later 610 currVal.append(currKey.charAt(0)=='/'?line.substring(21):line.substring(12)); 611 } 612 } 613 } 614 } 615 } catch (IOException e) { 616 String message = ParseException.newMessage(this.getClass(), accession, identifier, "", sectionToString(section)); 617 throw new ParseException(e, message); 618 } catch (RuntimeException e){ 619 String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad line", line); 620 throw new ParseException(e, message); 621 } 622 return section; 623 } 624 625 private final List buildBaseRanges(final String theBaseRangeList) throws ParseException { 626 if (theBaseRangeList == null) return null; 627 final List baseRangeList = new ArrayList(); 628 final String[] baseRange = theBaseRangeList.split(";"); 629 try{ 630 for (int r=0; r<baseRange.length; r++) { 631 final Matcher rangeMatch = refRange.matcher(baseRange[r]); 632 if (rangeMatch.matches()) { 633 final int rangeStart = Integer.parseInt(rangeMatch.group(1)); 634 final int rangeEnd = Integer.parseInt(rangeMatch.group(2)); 635 baseRangeList.add(new SimpleRichLocation(new SimplePosition(rangeStart), new SimplePosition(rangeEnd), r)); 636 } else { 637 String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad reference range found", theBaseRangeList); 638 throw new ParseException(message); 639 } 640 } 641 return baseRangeList; 642 }catch(RuntimeException e){ 643 String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad base range", theBaseRangeList); 644 throw new ParseException(e, message); 645 } 646 } 647 648 /** 649 * {@inheritDoc} 650 */ 651 public void writeSequence(Sequence seq, PrintStream os) throws IOException { 652 if (this.getPrintStream()==null) this.setPrintStream(os); 653 this.writeSequence(seq, 
RichObjectFactory.getDefaultNamespace()); 654 } 655 656 /** 657 * {@inheritDoc} 658 */ 659 public void writeSequence(Sequence seq, String format, PrintStream os) throws IOException { 660 if (this.getPrintStream()==null) this.setPrintStream(os); 661 if (!format.equals(this.getDefaultFormat())) throw new IllegalArgumentException("Unknown format: "+format); 662 this.writeSequence(seq, RichObjectFactory.getDefaultNamespace()); 663 } 664 665 /** 666 * {@inheritDoc} 667 * Namespace is ignored as Genbank has no concept of it. 668 */ 669 public void writeSequence(Sequence seq, Namespace ns) throws IOException { 670 RichSequence rs; 671 try { 672 if (seq instanceof RichSequence) rs = (RichSequence)seq; 673 else rs = RichSequence.Tools.enrich(seq); 674 } catch (ChangeVetoException e) { 675 IOException e2 = new IOException("Unable to enrich sequence"); 676 e2.initCause(e); 677 throw e2; 678 } 679 680 SymbolTokenization tok; 681 try { 682 tok = rs.getAlphabet().getTokenization("token"); 683 } catch (Exception e) { 684 throw new RuntimeException("Unable to get alphabet tokenizer",e); 685 } 686 Set<Note> notes = rs.getNoteSet(); 687 String accession = rs.getAccession(); 688 StringBuffer accessions = new StringBuffer(); 689 accessions.append(accession); 690 String stranded = ""; 691 String udat = ""; 692 String moltype = rs.getAlphabet().getName(); 693 if ("PROTEIN-TERM".equals(moltype) || "PROTEIN".equals(moltype)) moltype = null; //a genpept curiosity 694 StringBuffer keywords = new StringBuffer(); 695 for (Iterator<Note> i = notes.iterator(); i.hasNext(); ) { 696 Note n = i.next(); 697 if (n.getTerm().equals(Terms.getStrandedTerm())) { 698 String value = n.getValue(); 699 if(value != null && value.equals("single")) 700 stranded= "ss-"; 701 else if(value != null && value.equals("mixed")) 702 stranded= "ms-"; 703 } 704 else if (n.getTerm().equals(Terms.getDateUpdatedTerm())) udat=n.getValue(); 705 else if (n.getTerm().equals(Terms.getMolTypeTerm())) moltype=n.getValue(); 706 
else if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) { 707 accessions.append(" "); 708 accessions.append(n.getValue()); 709 } else if (n.getTerm().equals(Terms.getKeywordTerm())) { 710 if (n.getValue() != null) { 711 if (keywords.length()>0) keywords.append("; "); 712 keywords.append(n.getValue()); 713 } 714 } 715 } 716 717 //adjust molecule type during format conversion 718 if(moltype!=null && moltype.length()>6) { 719 if(moltype.indexOf("DNA")!=-1) moltype = "DNA"; 720 else if(moltype.indexOf("RNA")!=-1) moltype = "RNA"; 721 else moltype = "NA"; 722 } 723 724 // locus(name) + length + alpha + div + date line 725 StringBuffer locusLine = new StringBuffer(); 726 locusLine.append(StringTools.rightPad(rs.getName(),16));//13->28=15+1=16 727 locusLine.append(" ");//29 728 locusLine.append(StringTools.leftPad(""+rs.length(),11));//30->40=10+1=11 729 locusLine.append(" "+ (moltype==null? "aa":"bp") +" ");//41->44 730 locusLine.append(StringTools.leftPad(stranded,3));//45->47=2+1=3 731 locusLine.append(StringTools.rightPad(moltype==null?"":moltype,6));//48->53=5+1=6 732 locusLine.append(" ");//54->55 733 locusLine.append(StringTools.rightPad(rs.getCircular()?"circular":"linear",8));//56->63=7+1=8 734 locusLine.append(" ");//64->64 735 String div = rs.getDivision()==null?"":rs.getDivision(); 736 if(div.length()>3) div = ""; // Not a GenBank division, maybe UniProt, etc. 
737 locusLine.append(StringTools.rightPad(div,3));//65->67=2+1=3 738 locusLine.append(" ");//68->68 739 locusLine.append(StringTools.rightPad(udat,11));//69->79=10+1=11 740 StringTools.writeKeyValueLine(LOCUS_TAG, locusLine.toString(), 12, this.getLineWidth(), this.getPrintStream()); 741 742 // definition line 743 StringTools.writeKeyValueLine(DEFINITION_TAG, rs.getDescription(), 12, this.getLineWidth(), this.getPrintStream()); 744 745 // accession line 746 StringTools.writeKeyValueLine(ACCESSION_TAG, accessions.toString(), 12, this.getLineWidth(), this.getPrintStream()); 747 748 // version + gi line 749 String version = accession+"."+rs.getVersion(); 750 if (rs.getIdentifier()!=null) version = version + " GI:"+rs.getIdentifier(); 751 StringTools.writeKeyValueLine(VERSION_TAG, version, 12, this.getLineWidth(), this.getPrintStream()); 752 753 // keywords line 754 keywords.append("."); 755 StringTools.writeKeyValueLine(KEYWORDS_TAG, keywords.toString(), 12, this.getLineWidth()-1, this.getPrintStream()); 756 757 // source line (from taxon) 758 // organism line 759 NCBITaxon tax = rs.getTaxon(); 760 if (tax!=null) { 761 StringTools.writeKeyValueLine(SOURCE_TAG, (isMitochondrial(rs)?"mitochondrion ":"")+tax.getDisplayName(), 12, this.getLineWidth(), this.getPrintStream()); 762 StringTools.writeKeyValueLine(" "+ORGANISM_TAG, tax.getDisplayName().split("\\s+\\(")[0]+"\n"+tax.getNameHierarchy(), 12, this.getLineWidth()-1, this.getPrintStream()); 763 } 764 765 // references - rank (bases x to y) 766 for (Iterator<RankedDocRef> r = rs.getRankedDocRefs().iterator(); r.hasNext(); ) { 767 RankedDocRef rdr = r.next(); 768 DocRef d = rdr.getDocumentReference(); 769 StringTools.writeKeyValueLine(REFERENCE_TAG, rdr.getRank()+((rdr.getLocation()==null || rdr.getLocation() ==RichLocation.EMPTY_LOCATION)?"": (moltype==null? 
" (residues ":" (bases ")+makeBaseRange(rdr)+")"), 12, this.getLineWidth(), this.getPrintStream()); 770 // Any authors that were in the input as CONSRTM tags will 771 // be merged into the AUTHORS tag on output. 772 StringTools.writeKeyValueLine(" "+AUTHORS_TAG, d.getAuthors(), 12, this.getLineWidth()-1, this.getPrintStream()); 773 StringTools.writeKeyValueLine(" "+TITLE_TAG, d.getTitle(), 12, this.getLineWidth(), this.getPrintStream()); 774 StringTools.writeKeyValueLine(" "+JOURNAL_TAG, d.getLocation(), 12, this.getLineWidth(), this.getPrintStream()); 775 CrossRef c = d.getCrossref(); 776 if (c!=null) StringTools.writeKeyValueLine(StringTools.leftPad(c.getDbname(),9), c.getAccession(), 12, this.getLineWidth(), this.getPrintStream()); 777 StringTools.writeKeyValueLine(" "+REMARK_TAG, d.getRemark(), 12, this.getLineWidth(), this.getPrintStream()); 778 } 779 780 // comments - if any 781 Set<Comment> comments = rs.getComments(); 782 if (!comments.isEmpty()) { 783 StringBuffer sb = new StringBuffer(); 784 for (Iterator<Comment> i = comments.iterator(); i.hasNext(); ) { 785 Comment c = i.next(); 786 sb.append(c.getComment()); 787 if (i.hasNext()) sb.append("\n"); 788 } 789 StringTools.writeKeyValueLine(COMMENT_TAG, sb.toString(), 12, this.getLineWidth(), this.getPrintStream()); 790 } 791 792 this.getPrintStream().println(FEATURE_TAG+" Location/Qualifiers"); 793 // feature_type location 794 for (Iterator i = rs.getFeatureSet().iterator(); i.hasNext(); ) { 795 RichFeature f = (RichFeature)i.next(); 796 StringTools.writeKeyValueLine(" "+f.getTypeTerm().getName(), GenbankLocationParser.writeLocation((RichLocation)f.getLocation()), 21, this.getLineWidth()-1, ",", this.getPrintStream()); 797 for (Iterator<Note> j = f.getNoteSet().iterator(); j.hasNext(); ) { 798 Note n = j.next(); 799 // /key="val" or just /key if val=="" 800 if (n.getValue()==null || n.getValue().length()==0) StringTools.writeKeyValueLine("", "/"+n.getTerm().getName(), 21, this.getLineWidth(), 
this.getPrintStream()); 801 else if (isNotQuoted(n)) {// doesn't have the value enclosed in quotes 802 StringTools.writeKeyValueLine("", "/"+n.getTerm().getName()+"="+n.getValue(), 21, this.getLineWidth(), this.getPrintStream()); 803 } else if (n.getTerm().getName().equals("translation")) { 804 StringTools.writeKeyValueLine("", "/"+n.getTerm().getName()+"=\""+n.getValue()+"\"", 21, this.getLineWidth()-1, this.getPrintStream()); 805 } else { 806 StringTools.writeKeyValueLine("", "/"+n.getTerm().getName()+"=\""+n.getValue()+"\"", 21, this.getLineWidth(), this.getPrintStream()); 807 } 808 } 809 // add-in to source feature only organism and db_xref="taxon:xyz" where present 810 if (f.getType().equals("source") && tax!=null) { 811 String displayName = tax.getDisplayName(); 812 if (displayName.indexOf('(')>-1) displayName = displayName.substring(0, displayName.indexOf('(')).trim(); 813 StringTools.writeKeyValueLine("", "/organism=\""+displayName+"\"", 21, this.getLineWidth()-1, this.getPrintStream());// AF252370 fits in exactly 80 - but is wrapped 814 for (Iterator<RankedCrossRef> j = f.getRankedCrossRefs().iterator(); j.hasNext(); ) { 815 RankedCrossRef rcr = j.next(); 816 CrossRef cr = rcr.getCrossRef(); 817 StringTools.writeKeyValueLine("", "/db_xref=\""+cr.getDbname()+":"+cr.getAccession()+"\"", 21, this.getLineWidth(), this.getPrintStream()); 818 } 819 StringTools.writeKeyValueLine("", "/db_xref=\"taxon:"+tax.getNCBITaxID()+"\"", 21, this.getLineWidth(), this.getPrintStream()); 820 } else { 821 // add-in other dbxrefs where present 822 for (Iterator<RankedCrossRef> j = f.getRankedCrossRefs().iterator(); j.hasNext(); ) { 823 RankedCrossRef rcr = j.next(); 824 CrossRef cr = rcr.getCrossRef(); 825 StringTools.writeKeyValueLine("", "/db_xref=\""+cr.getDbname()+":"+cr.getAccession()+"\"", 21, this.getLineWidth(), this.getPrintStream()); 826 } 827 } 828 } 829 830 //BASE COUNT obsolete in Genbank flatfile format since October 2003 831 //if 
(rs.getAlphabet()==AlphabetManager.alphabetForName("DNA")) { 832 // // BASE COUNT 1510 a 1074 c 835 g 1609 t 833 // int aCount = 0; 834 // int cCount = 0; 835 // int gCount = 0; 836 // int tCount = 0; 837 // int oCount = 0; 838 // for (int i = 1; i <= rs.length(); i++) { 839 // char c; 840 // try { 841 // c = tok.tokenizeSymbol(rs.symbolAt(i)).charAt(0); 842 // } catch (Exception e) { 843 // throw new RuntimeException("Unable to get symbol at position "+i,e); 844 // } 845 // switch (c) { 846 // case 'a': case 'A': 847 // aCount++; 848 // break; 849 // case 'c': case 'C': 850 // cCount++; 851 // break; 852 // case 'g': case 'G': 853 // gCount++; 854 // break; 855 // case 't': case 'T': 856 // tCount++; 857 // break; 858 // default: 859 // oCount++; 860 // } 861 // } 862 // 863 // this.getPrintStream().print(BASE_COUNT_TAG_FULL+" "); 864 // this.getPrintStream().print(aCount + " a "); 865 // this.getPrintStream().print(cCount + " c "); 866 // this.getPrintStream().print(gCount + " g "); 867 // this.getPrintStream().print(tCount + " t "); 868 // this.getPrintStream().println(oCount + " others"); 869 //} 870 871 this.getPrintStream().println(START_SEQUENCE_TAG); 872 // sequence stuff 873 Symbol[] syms = (Symbol[])rs.toList().toArray(new Symbol[0]); 874 int lines = 0; 875 int symCount = 0; 876 for (int i = 0; i < syms.length; i++) { 877 if (symCount % 60 == 0) { 878 if (lines > 0) this.getPrintStream().print("\n"); // newline from previous line 879 int lineNum = (lines*60) + 1; 880 this.getPrintStream().print(StringTools.leftPad(""+lineNum,9)); 881 lines++; 882 } 883 if (symCount % 10 == 0) this.getPrintStream().print(" "); 884 try { 885 this.getPrintStream().print(tok.tokenizeSymbol(syms[i])); 886 } catch (IllegalSymbolException e) { 887 throw new RuntimeException("Found illegal symbol: "+syms[i]); 888 } 889 symCount++; 890 } 891 if(syms.length>0) //do not create an empty line 892 this.getPrintStream().print("\n"); 893 this.getPrintStream().println(END_SEQUENCE_TAG); 
}

    /**
     * {@inheritDoc}
     */
    public String getDefaultFormat() {
        return GENBANK_FORMAT;
    }

    /**
     * Decides whether a sequence should be reported as mitochondrial.
     * <p>
     * The features of the sequence are scanned for ones of type
     * <code>source</code>; the first <code>organelle</code> note found on such
     * a feature decides the answer.
     *
     * @param theSequence the sequence whose features are inspected.
     * @return <code>true</code> if the first <code>organelle</code> note found
     * on a <code>source</code> feature has the value <code>mitochondrion</code>,
     * <code>false</code> if it has any other (or no) value, or if no such note
     * exists.
     */
    private final static boolean isMitochondrial(final RichSequence theSequence) {
        for (final Object candidate : theSequence.getFeatureSet()) {
            final RichFeature feature = (RichFeature) candidate;
            if (!feature.getType().equals("source")) continue;
            for (final Iterator<Note> n = feature.getNoteSet().iterator(); n.hasNext(); ) {
                final Note note = n.next();
                if (note.getTerm().getName().equals("organelle")) {
                    // Constant-first comparison: a note may carry no value at
                    // all, which must read as "not mitochondrial" rather than
                    // fail with a NullPointerException.
                    return "mitochondrion".equals(note.getValue());
                }
            }
        }
        return false;
    }

    /**
     * Convenience overload that unpacks the term name and value of a note.
     *
     * @param theNote the note to check.
     * @return the result of {@link #isNotQuoted(String, String)} for the name
     * of the note's term and the note's value.
     */
    private final static boolean isNotQuoted(final Note theNote) {
        return isNotQuoted(theNote.getTerm().getName(), theNote.getValue());
    }

    /**
     * Tells whether a qualifier is listed in the <code>isNotQuoted</code>
     * collection of this class.
     *
     * @param theName the qualifier name to look up.
     * @param theValue the qualifier value; currently not consulted.
     * @return <code>true</code> if the collection contains the name.
     */
    private final static boolean isNotQuoted(final String theName, final String theValue) {
        return isNotQuoted.contains(theName);
    }

    /**
     * Builds the textual range for a ranked document reference.
     *
     * @param theReference the reference to describe.
     * @return <code>start to end</code> taken from the reference itself when it
     * has no location, otherwise the block-by-block rendering of its location
     * as produced by {@link #toString(RichLocation)}.
     */
    private final static String makeBaseRange(final RankedDocRef theReference) {
        final RichLocation location = theReference.getLocation();
        if (location == null) {
            return theReference.getStart()+" to "+theReference.getEnd();
        }
        return toString(location);
    }

    /**
     * Renders every block of a location as <code>min to max</code>, joining
     * consecutive blocks with <code>"; "</code>.
     *
     * @param theLocation the location whose blocks are rendered.
     * @return the joined ranges; empty if the block iterator yields nothing.
     */
    private final static String toString(final RichLocation theLocation) {
        final StringBuilder ranges = new StringBuilder();
        for (final Iterator blocks = theLocation.blockIterator(); blocks.hasNext(); ) {
            final RichLocation block = (RichLocation) blocks.next();
            ranges.append(block.getMin()).append(" to ").append(block.getMax());
            if (blocks.hasNext()) ranges.append("; ");
        }
        return ranges.toString();
    }

    /**
     * Converts the current parse section to a String. Useful for debugging.
     * <p>
     * Every element of the section is cast to <code>String[]</code> and its
     * parts are appended in order, with a gap inserted after the first part.
     * Nothing is inserted between consecutive elements.
     *
     * @param section the parse section; each element must be a
     * <code>String[]</code>.
     * @return the concatenated text of the section.
     * @throws ClassCastException if an element is not a <code>String[]</code>.
     */
    String sectionToString(List section){
        final StringBuilder parseBlock = new StringBuilder();
        for (final Iterator entries = section.iterator(); entries.hasNext(); ) {
            final String[] part = (String[]) entries.next();
            for (int x = 0; x < part.length; x++) {
                parseBlock.append(part[x]);
                if (x == 0) {
                    parseBlock.append(" "); //the gap will have been trimmed
                }
            }
        }
        return parseBlock.toString();
    }
}