001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojavax.bio.seq.io; 023 024import java.io.BufferedInputStream; 025import java.io.BufferedReader; 026import java.io.File; 027import java.io.FileReader; 028import java.io.IOException; 029import java.io.InputStreamReader; 030import java.io.PrintStream; 031import java.util.ArrayList; 032import java.util.Iterator; 033import java.util.List; 034import java.util.Set; 035import java.util.TreeSet; 036import java.util.regex.Matcher; 037import java.util.regex.Pattern; 038 039import org.biojava.bio.seq.Sequence; 040import org.biojava.bio.seq.io.ParseException; 041import org.biojava.bio.seq.io.SeqIOListener; 042import org.biojava.bio.seq.io.SymbolTokenization; 043import org.biojava.bio.symbol.IllegalSymbolException; 044import org.biojava.bio.symbol.SimpleSymbolList; 045import org.biojava.bio.symbol.Symbol; 046import org.biojava.bio.symbol.SymbolList; 047import org.biojava.utils.ChangeVetoException; 048import org.biojavax.Comment; 049import org.biojavax.CrossRef; 050import org.biojavax.DocRef; 051import org.biojavax.DocRefAuthor; 052import org.biojavax.Namespace; 053import org.biojavax.Note; 054import org.biojavax.RankedCrossRef; 055import org.biojavax.RankedDocRef; 056import org.biojavax.RichAnnotation; 057import org.biojavax.RichObjectFactory; 058import org.biojavax.SimpleComment; 059import org.biojavax.SimpleCrossRef; 060import 
org.biojavax.SimpleDocRef; 061import org.biojavax.SimpleDocRefAuthor; 062import org.biojavax.SimpleNote; 063import org.biojavax.SimpleRankedCrossRef; 064import org.biojavax.SimpleRankedDocRef; 065import org.biojavax.SimpleRichAnnotation; 066import org.biojavax.bio.seq.RichFeature; 067import org.biojavax.bio.seq.RichLocation; 068import org.biojavax.bio.seq.RichSequence; 069import org.biojavax.bio.taxa.NCBITaxon; 070import org.biojavax.bio.taxa.SimpleNCBITaxon; 071import org.biojavax.ontology.ComparableTerm; 072import org.biojavax.utils.StringTools; 073 074/** 075 * Format reader for EMBL files. This version of EMBL format will generate 076 * and write RichSequence objects. Loosely Based on code from the old, deprecated, 077 * org.biojava.bio.seq.io.EmblLikeFormat object. 078 * <p> 079 * This format will read both Pre-87 and 87+ versions of EMBL. It will also write 080 * them both. By default, it will write the most recent version. If you want 081 * an earlier one, you must specify the format by passing one of the constants 082 * defined in this class to {@link #writeSequence(Sequence, String, Namespace)}. 083 * 084 * @author Richard Holland 085 * @author Jolyon Holdstock 086 * @author Mark Schreiber 087 * @since 1.5 088 */ 089public class EMBLFormat extends RichSequenceFormat.HeaderlessFormat { 090 091 // Register this format with the format auto-guesser. 
    static {
        RichSequence.IOTools.registerFormat(EMBLFormat.class);
    }
    
    /**
     * The name of the Pre-87 format
     */
    public static final String EMBL_PRE87_FORMAT = "EMBL_PRE87";
    
    /**
     * The name of the current format
     */
    public static final String EMBL_FORMAT = "EMBL";
    
    // Two-character line codes. The parser compares these against the first
    // two characters of each input line, and the writer emits them as line keys.
    protected static final String LOCUS_TAG = "ID";
    protected static final String ACCESSION_TAG = "AC";
    protected static final String VERSION_TAG = "SV";
    protected static final String DEFINITION_TAG = "DE";
    protected static final String DATE_TAG = "DT";
    protected static final String DATABASE_XREF_TAG = "DR";
    protected static final String SOURCE_TAG = "OS";
    protected static final String ORGANISM_TAG = "OC";
    protected static final String ORGANELLE_TAG = "OG";
    protected static final String REFERENCE_TAG = "RN";
    protected static final String REFERENCE_POSITION_TAG = "RP";
    protected static final String REFERENCE_XREF_TAG = "RX";
    protected static final String AUTHORS_TAG = "RA";
    protected static final String CONSORTIUM_TAG = "RG";
    protected static final String TITLE_TAG = "RT";
    protected static final String LOCATOR_TAG = "RL";
    protected static final String REMARK_TAG = "RC";
    protected static final String KEYWORDS_TAG = "KW";
    protected static final String COMMENT_TAG = "CC";
    protected static final String FEATURE_HEADER_TAG = "FH";
    protected static final String FEATURE_TAG = "FT";
    protected static final String CONTIG_TAG = "CO";
    protected static final String TPA_TAG = "AH";
    protected static final String START_SEQUENCE_TAG = "SQ";
    protected static final String DELIMITER_TAG = "XX";
    protected static final String END_SEQUENCE_TAG = "//";
    
    // the date pattern
    // date (Rel. N, Created)
    // date (Rel. N, Last updated, Version M)
    // groups: 1 = date, 3 = release number, 4 = text between the release number
    // and the closing bracket (up to the first digit), 5 = trailing digits.
    // The whole bracketed part (group 2) is optional, so groups 3-5 may be null.
    protected static final Pattern dp = Pattern.compile("([^\\s]+)\\s*(\\(Rel\\.\\s+(\\d+), ([^\\)\\d]+)(\\d*)\\))?$");
    // locus line
    // groups: 1 = name/accession, 2 = version, 3 = linear|circular, 4 = molecule type,
    // 5 = data class, 6 = division, 7 = length, 8 = BP|AA
    protected static final Pattern lp = Pattern.compile("^(\\S+);\\s+SV\\s+(\\d+);\\s+(linear|circular);\\s+(\\S+\\s?\\S+?);\\s+(\\S+);\\s+(\\S+);\\s+(\\d+)\\s+(BP|AA)\\.$");
    // pre-87 locus line
    // groups: 1 = name, 2 = "circular" (optional), 3 = "genomic" (optional),
    // 4 = molecule type, 5 = division; the length is matched but not captured
    protected static final Pattern lpPre87 = Pattern.compile("^(\\S+)\\s+standard;\\s+(circular)?\\s*(genomic)?\\s*(\\S+);\\s+(\\S+);\\s+\\d+\\s+BP\\.$");
    // version line
    // groups: 1 = accession, 2 = version number
    protected static final Pattern vp = Pattern.compile("^(\\S+?)\\.(\\d+)$");
    // reference position line
    // groups: 1 = start of the first range, 3 = end of the first range (optional);
    // any further ranges are matched but their values are not used by the parser
    protected static final Pattern rpp = Pattern.compile("^(\\d+)(-(\\d+))?,?(\\s\\d+-\\d+,?)*$");
    // dbxref line
    // groups: 1 = database name (everything before the first colon), 2 = identifier
    protected static final Pattern dbxp = Pattern.compile("^([^:]+):(\\S+)$");
    
    // file names containing ".em" or ".dat" (\u002e is the regex escape for '.')
    protected static final Pattern readableFileNames = Pattern.compile(".*\\u002e(em|dat).*");
    // any line that starts with the ID line code
    protected static final Pattern headerLine = Pattern.compile("^ID.*");
    
    // Per-record parse state. readRichSequence resets all three to null at the
    // start of each record; accession is also read by readSection when it
    // builds error messages.
    private NCBITaxon tax = null;
    private String organism = null;
    private String accession = null;
    
    /**
     * Implements some EMBL-specific terms.
156 */ 157 public static class Terms extends RichSequence.Terms { 158 159 /** 160 * Getter for the RelUpdatedRecordVersion term 161 * @return The RelUpdatedRecordVersion Term 162 */ 163 public static ComparableTerm getRelUpdatedRecordVersionTerm() { 164 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("RelUpdatedRecordVersion"); 165 } 166 167 /** 168 * Getter for the EMBL term 169 * @return The EMBL Term 170 */ 171 public static ComparableTerm getEMBLTerm() { 172 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("EMBL"); 173 } 174 175 /** 176 * Getter for the Ensembl-specific 'genomic' term 177 * @return The genomic Term 178 */ 179 public static ComparableTerm getGenomicTerm() { 180 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("genomic"); 181 } 182 183 /** 184 * Getter for the Ensembl-specific 'versionLine' term 185 * @return The version line Term 186 */ 187 public static ComparableTerm getVersionLineTerm() { 188 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("versionLine"); 189 } 190 191 /** 192 * Getter for the Ensembl-specific 'dataClass' term 193 * @return The data class Term 194 */ 195 public static ComparableTerm getDataClassTerm() { 196 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("dataClass"); 197 } 198 } 199 200 /** 201 * {@inheritDoc} 202 * A file is in EMBL format if its name contains the word eem or edat, or the first line matches 203 * the EMBL format for the ID line. 
204 */ 205 public boolean canRead(File file) throws IOException { 206 if (readableFileNames.matcher(file.getName()).matches()) return true; 207 BufferedReader br = new BufferedReader(new FileReader(file)); 208 String firstLine = br.readLine(); 209 boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches() && 210 (lp.matcher(firstLine.substring(3).trim()).matches() || 211 lpPre87.matcher(firstLine.substring(3).trim()).matches() 212 ); 213 br.close(); 214 return readable; 215 } 216 217 /** 218 * {@inheritDoc} 219 * Always returns a DNA tokenizer. 220 */ 221 public SymbolTokenization guessSymbolTokenization(File file) throws IOException { 222 return RichSequence.IOTools.getDNAParser(); 223 } 224 225 /** 226 * {@inheritDoc} 227 * A stream is in EMBL format if its first line matches the EMBL format for the ID line. 228 */ 229 public boolean canRead(BufferedInputStream stream) throws IOException { 230 stream.mark(2000); // some streams may not support this 231 BufferedReader br = new BufferedReader(new InputStreamReader(stream)); 232 String firstLine = br.readLine(); 233 boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches() && 234 (lp.matcher(firstLine.substring(3).trim()).matches() || 235 lpPre87.matcher(firstLine.substring(3).trim()).matches() 236 ); 237 // don't close the reader as it'll close the stream too. 238 // br.close(); 239 stream.reset(); 240 return readable; 241 } 242 243 /** 244 * {@inheritDoc} 245 * Always returns a DNA tokenizer. 
246 */ 247 public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException { 248 return RichSequence.IOTools.getDNAParser(); 249 } 250 251 /** 252 * {@inheritDoc} 253 */ 254 public boolean readSequence(BufferedReader reader, 255 SymbolTokenization symParser, 256 SeqIOListener listener) 257 throws IllegalSymbolException, IOException, ParseException { 258 if (!(listener instanceof RichSeqIOListener)) throw new IllegalArgumentException("Only accepting RichSeqIOListeners today"); 259 return this.readRichSequence(reader,symParser,(RichSeqIOListener)listener,null); 260 } 261 262 /** 263 * {@inheritDoc} 264 */ 265 public boolean readRichSequence(BufferedReader reader, 266 SymbolTokenization symParser, 267 RichSeqIOListener rlistener, 268 Namespace ns) 269 throws IllegalSymbolException, IOException, ParseException { 270 tax = null; 271 organism = null; 272 accession = null; 273 boolean hasAnotherSequence = true; 274 //boolean hasInternalWhitespace = false; 275 276 rlistener.startSequence(); 277 278 if (ns==null) ns=RichObjectFactory.getDefaultNamespace(); 279 rlistener.setNamespace(ns); 280 281 // Get an ordered list of key->value pairs in array-tuples 282 String sectionKey = null; 283 do { 284 List section = this.readSection(reader); 285 sectionKey = ((String[])section.get(0))[0]; 286 if(sectionKey == null){ 287 288 String message = ParseException.newMessage(this.getClass(), accession, "No section key", "Not set", sectionToString(section)); 289 throw new ParseException(message); 290 } 291 // process section-by-section 292 if (sectionKey.equals(LOCUS_TAG)) { 293 // entryname dataclass; [circular] molecule; division; sequencelength BP. 
294 String loc = ((String[])section.get(0))[1]; 295 Matcher m = lp.matcher(loc); 296 Matcher mPre87 = lpPre87.matcher(loc); 297 if (m.matches()) { 298 // first token is both name and primary accession 299 rlistener.setName(m.group(1)); 300 rlistener.setAccession(m.group(1)); 301 // second token is version 302 rlistener.setVersion(Integer.parseInt(m.group(2))); 303 // third token is circular/linear 304 rlistener.setCircular(m.group(3).equals("circular")); 305 // fourth token is moltype 306 rlistener.addSequenceProperty(Terms.getMolTypeTerm(),m.group(4)); 307 // fifth token is data class 308 rlistener.addSequenceProperty(Terms.getDataClassTerm(),m.group(5)); 309 // sixth token is taxonomic division 310 rlistener.setDivision(m.group(6)); 311 // seventh token is sequence length, which is ignored 312 // as it is calculated from the sequence data later. 313 } else if (mPre87.matches()) { 314 rlistener.setName(mPre87.group(1)); 315 if (mPre87.group(3)!=null) { 316 // add annotation for 'genomic' (Ensembl-specific term) 317 rlistener.addSequenceProperty(Terms.getGenomicTerm(),null); 318 } 319 rlistener.addSequenceProperty(Terms.getMolTypeTerm(),mPre87.group(4)); 320 rlistener.setDivision(mPre87.group(5)); 321 // Optional extras 322 String circular = mPre87.group(2); 323 if (circular!=null) rlistener.setCircular(true); 324 } else { 325 String message = ParseException.newMessage(this.getClass(),accession,"Not Set","Bad ID line found", sectionToString(section)); 326 throw new ParseException(message); 327 } 328 } else if (sectionKey.equals(DEFINITION_TAG)) { 329 rlistener.setDescription(((String[])section.get(0))[1]); 330 } else if (sectionKey.equals(SOURCE_TAG)) { 331 // only interested in organelle sub-tag 332 for (int i = 1; i < section.size(); i++) { 333 sectionKey = ((String[])section.get(i))[0]; 334 if (sectionKey.equals(ORGANELLE_TAG)) { 335 rlistener.addSequenceProperty(Terms.getOrganelleTerm(), ((String[])section.get(i))[1].trim()); 336 break; // skip out of for loop 
once found 337 } 338 } 339 } else if (sectionKey.equals(DATE_TAG)) { 340 String chunk = ((String[])section.get(0))[1].trim(); 341 Matcher dm = dp.matcher(chunk); 342 if (dm.matches()) { 343 String date = dm.group(1); 344 String rel = dm.group(3); 345 String type = dm.group(4); 346 if (type.equals("Created")) { 347 rlistener.addSequenceProperty(Terms.getDateCreatedTerm(), date); 348 rlistener.addSequenceProperty(Terms.getRelCreatedTerm(), rel); 349 } else if (type.equals("Last updated, Version ")) { 350 rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(), date); 351 rlistener.addSequenceProperty(Terms.getRelUpdatedTerm(), rel); 352 rlistener.addSequenceProperty(Terms.getRelUpdatedRecordVersionTerm(), dm.group(5)); 353 } else { 354 String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad date type found",sectionToString(section)); 355 throw new ParseException(message); 356 } 357 } else { 358 String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad date line found",sectionToString(section)); 359 throw new ParseException(message); 360 361 } 362 } else if (sectionKey.equals(ACCESSION_TAG)) { 363 // if multiple accessions, store only first as accession, 364 // and store rest in annotation 365 String[] accs = ((String[])section.get(0))[1].split(";"); 366 accession = accs[0].trim(); 367 rlistener.setAccession(accession); 368 for (int i = 1; i < accs.length; i++) { 369 rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accs[i].trim()); 370 } 371 } else if (sectionKey.equals(VERSION_TAG)) { 372 String ver = ((String[])section.get(0))[1]; 373 Matcher m = vp.matcher(ver); 374 if (m.matches()) { 375 String verAcc = m.group(1); 376 if (!accession.equals(verAcc)) { 377 // the version refers to a different accession! 
378 // believe the version line, and store the original 379 // accession away in the additional accession set 380 rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accession); 381 accession = verAcc; 382 rlistener.setAccession(accession); 383 } 384 rlistener.setVersion(Integer.parseInt(m.group(2))); 385 } else { 386 rlistener.addSequenceProperty(Terms.getVersionLineTerm(),ver); 387 } 388 } else if (sectionKey.equals(KEYWORDS_TAG)) { 389 String val = ((String[])section.get(0))[1]; 390 val = val.substring(0,val.length()-1); // chomp dot 391 val = val.replace('\n',' '); //remove newline 392 String[] kws = val.split(";"); 393 for (int i = 0; i < kws.length; i++) { 394 String kw = kws[i].trim(); 395 if (kw.length()==0) continue; 396 rlistener.addSequenceProperty(Terms.getKeywordTerm(), kw); 397 } 398 } else if (sectionKey.equals(DATABASE_XREF_TAG)) { 399 String val = ((String[])section.get(0))[1]; 400 val = val.substring(0,val.length()-1); // chomp dot 401 // database_identifier; primary_identifier; secondary_identifier.... 
402 String[] parts = val.split(";"); 403 // construct a DBXREF out of the dbname part[0] and accession part[1] 404 CrossRef crossRef = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{parts[0].trim(),parts[1].trim(), new Integer(0)}); 405 // assign remaining bits of info as annotations 406 for (int j = 2; j < parts.length; j++) { 407 Note note = new SimpleNote(Terms.getAdditionalAccessionTerm(),parts[j].trim(),j-1); 408 try { 409 crossRef.getRichAnnotation().addNote(note); 410 } catch (ChangeVetoException ce) { 411 String message = ParseException.newMessage(this.getClass(),accession,"not set", "Could not annotate identifier terms",sectionToString(section)); 412 ParseException pe = new ParseException(message); 413 pe.initCause(ce); 414 throw pe; 415 } 416 } 417 RankedCrossRef rcrossRef = new SimpleRankedCrossRef(crossRef, 0); 418 rlistener.setRankedCrossRef(rcrossRef); 419 } else if (sectionKey.equals(REFERENCE_TAG) && !this.getElideReferences()) { 420 // first line of section has rank and location 421 String refrank = ((String[])section.get(0))[1]; 422 int ref_rank = Integer.parseInt(refrank.substring(1,refrank.length()-1)); 423 int ref_start = -999; 424 int ref_end = -999; 425 // rest can be in any order 426 String consortium = null; 427 String authors = ""; 428 String title = null; 429 String locator = null; 430 String pubmed = null; 431 String medline = null; 432 String doi = null; 433 String remark = null; 434 for (int i = 1; i < section.size(); i++) { 435 String key = ((String[])section.get(i))[0]; 436 String val = ((String[])section.get(i))[1]; 437 if (key.equals(AUTHORS_TAG)) { 438 if (val.endsWith(";")) val = val.substring(0,val.length()-1); // chomp semicolon 439 authors = val.replace('\n',' '); //see #2276 440 } 441 if (key.equals(CONSORTIUM_TAG)) { 442 if (val.endsWith(";")) val = val.substring(0,val.length()-1); // chomp semicolon 443 consortium = val.replace('\n',' '); //see #2276 444 } 445 if (key.equals(TITLE_TAG)) { 446 if 
(val.length()>1) { 447 if (val.endsWith(";")) val = val.substring(0,val.length()-1); // chomp semicolon 448 if (val.endsWith("\"")) val = val.substring(1,val.length()-1); // chomp quotes 449 title = val.replace('\n',' '); //see #2276 450 } else title=null; // single semi-colon indicates no title 451 } 452 if (key.equals(LOCATOR_TAG)) { 453 if (val.endsWith(".")) val = val.substring(0,val.length()-1); // chomp dot 454 locator = val.replace('\n',' '); //see #2276 455 } 456 if (key.equals(REFERENCE_XREF_TAG)) { 457 // database_identifier; primary_identifier. 458 String[] refs = val.split("\\.(\\s+|$)"); 459 for (int j = 0 ; j < refs.length; j++) { 460 if (refs[j].trim().length()==0) continue; 461 String[] parts = refs[j].split(";"); 462 String db = parts[0]; 463 String ref = parts[1].trim(); 464 if (db.equalsIgnoreCase(Terms.PUBMED_KEY)) pubmed = ref; 465 else if (db.equalsIgnoreCase(Terms.MEDLINE_KEY)) medline = ref; 466 else if (db.equalsIgnoreCase(Terms.DOI_KEY)) doi = ref; 467 } 468 } 469 if (key.equals(REMARK_TAG)) remark = val.replace('\n',' '); //see #2276 470 if (key.equals(REFERENCE_POSITION_TAG)) { 471 // only the first group is taken 472 // if we have multiple lines, only the last line is taken 473 Matcher m = rpp.matcher(val); 474 if (m.matches()) { 475 ref_start = Integer.parseInt(m.group(1)); 476 if(m.group(2) != null) 477 ref_end = Integer.parseInt(m.group(3)); 478 } else { 479 String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad reference line found",sectionToString(section)); 480 throw new ParseException(message); 481 } 482 } 483 } 484 // create the docref object 485 try { 486 List<DocRefAuthor> authSet = DocRefAuthor.Tools.parseAuthorString(authors); 487 if (consortium!=null) authSet.add(new SimpleDocRefAuthor(consortium, true, false)); 488 DocRef dr = (DocRef)RichObjectFactory.getObject(SimpleDocRef.class,new Object[]{authSet,locator,title}); 489 // assign either the pubmed or medline to the docref - medline gets 
priority, then pubmed, then doi 490 if (medline!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.MEDLINE_KEY, medline, new Integer(0)})); 491 else if (pubmed!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.PUBMED_KEY, pubmed, new Integer(0)})); 492 else if (doi!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.DOI_KEY, doi, new Integer(0)})); 493 // assign the remarks 494 if (!this.getElideComments()) dr.setRemark(remark); 495 // assign the docref to the bioentry 496 RankedDocRef rdr = new SimpleRankedDocRef(dr, 497 (ref_start != -999 ? new Integer(ref_start) : null), 498 (ref_end != -999 ? new Integer(ref_end) : null), 499 ref_rank); 500 rlistener.setRankedDocRef(rdr); 501 } catch (ChangeVetoException e) { 502 String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section)); 503 throw new ParseException(e, message); 504 } 505 } else if (sectionKey.equals(COMMENT_TAG) && !this.getElideComments()) { 506 // Set up some comments 507 rlistener.setComment(((String[])section.get(0))[1]); 508 } else if (sectionKey.equals(FEATURE_TAG) && !this.getElideFeatures()) { 509 // starting from second line of input, start a new feature whenever we come across 510 // a key that does not start with / 511 boolean seenAFeature = false; 512 int rcrossrefCount = 0; 513 for (int i = 1 ; i < section.size(); i++) { 514 String key = ((String[])section.get(i))[0]; 515 String val = ((String[])section.get(i))[1]; 516 if (key.startsWith("/")) { 517 key = key.substring(1); // strip leading slash 518 val = val.replaceAll("\\s*[\\n\\r]+\\s*"," ").trim(); 519 if (val.startsWith("\"")) val = val.substring(1,val.length()-1); // strip quotes 520 // parameter on old feature 521 if (key.equalsIgnoreCase("db_xref")) { 522 Matcher m = dbxp.matcher(val); 523 if (m.matches()) { 524 String dbname = m.group(1); 525 
String raccession = m.group(2); 526 if (dbname.equalsIgnoreCase("taxon")) { 527 // Set the Taxon instead of a dbxref 528 tax = (NCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class, new Object[]{Integer.valueOf(raccession)}); 529 rlistener.setTaxon(tax); 530 try { 531 if (organism!=null) tax.addName(NCBITaxon.SCIENTIFIC,organism); 532 } catch (ChangeVetoException e) { 533 String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section)); 534 throw new ParseException(e, message); 535 } 536 } else { 537 try { 538 CrossRef cr = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{dbname, raccession, new Integer(0)}); 539 RankedCrossRef rcr = new SimpleRankedCrossRef(cr, ++rcrossrefCount); 540 rlistener.getCurrentFeature().addRankedCrossRef(rcr); 541 } catch (ChangeVetoException e) { 542 String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section)); 543 throw new ParseException(e, message); 544 } 545 } 546 } else { 547 String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad dbxref found",sectionToString(section)); 548 throw new ParseException(message); 549 } 550 } else if (key.equalsIgnoreCase("organism")) { 551 try { 552 organism = val; 553 if (tax!=null) tax.addName(NCBITaxon.SCIENTIFIC,organism); 554 } catch (ChangeVetoException e) { 555 String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section)); 556 throw new ParseException(message); 557 } 558 } else { 559 if (key.equalsIgnoreCase("translation")) { 560 // strip spaces from sequence 561 val = val.replaceAll("\\s+",""); 562 } 563 rlistener.addFeatureProperty(RichObjectFactory.getDefaultOntology().getOrCreateTerm(key),val); 564 } 565 } else { 566 // new feature! 
567 // end previous feature 568 if (seenAFeature) rlistener.endFeature(); 569 // start next one, with lots of lovely info in it 570 RichFeature.Template templ = new RichFeature.Template(); 571 templ.annotation = new SimpleRichAnnotation(); 572 templ.sourceTerm = Terms.getEMBLTerm(); 573 templ.typeTerm = RichObjectFactory.getDefaultOntology().getOrCreateTerm(key); 574 templ.featureRelationshipSet = new TreeSet(); 575 templ.rankedCrossRefs = new TreeSet(); 576 String tidyLocStr = val.replaceAll("\\s+",""); 577 templ.location = GenbankLocationParser.parseLocation(ns, accession, tidyLocStr); 578 rlistener.startFeature(templ); 579 seenAFeature = true; 580 rcrossrefCount = 0; 581 } 582 } 583 if (seenAFeature) rlistener.endFeature(); 584 } else if (sectionKey.equals(START_SEQUENCE_TAG) && !this.getElideSymbols()) { 585 StringBuffer seq = new StringBuffer(); 586 for (int i = 0 ; i < section.size(); i++) seq.append(((String[])section.get(i))[1]); 587 try { 588 SymbolList sl = new SimpleSymbolList(symParser, 589 seq.toString().replaceAll("\\s+","").replaceAll("[\\.|~]","-")); 590 rlistener.addSymbols(symParser.getAlphabet(), 591 (Symbol[])(sl.toList().toArray(new Symbol[0])), 592 0, sl.length()); 593 } catch (Exception e) { 594 String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad sequence",sectionToString(section)); 595 throw new ParseException(e, message); 596 } 597 } 598 } while (!sectionKey.equals(END_SEQUENCE_TAG)); 599 600 // Allows us to tolerate trailing whitespace without 601 // thinking that there is another Sequence to follow 602 while (true) { 603 reader.mark(1); 604 int c = reader.read(); 605 if (c == -1) { 606 hasAnotherSequence = false; 607 break; 608 } 609 if (Character.isWhitespace((char) c)) { 610 //hasInternalWhitespace = true; 611 continue; 612 } 613 //if (hasInternalWhitespace) 614 // System.err.println("Warning: whitespace found between sequence entries"); 615 reader.reset(); 616 break; 617 } 618 619 // Finish up. 
620 rlistener.endSequence(); 621 return hasAnotherSequence; 622 } 623 624 // reads an indented section, combining split lines and creating a list of key->value tuples 625 private List readSection(BufferedReader br) throws ParseException { 626 List section = new ArrayList(); 627 String line; 628 boolean done = false; 629 630 // while not done 631 try { 632 while (!done) { 633 // mark buffer 634 br.mark(160); 635 // read token 636 line = br.readLine(); 637 if (line.length()<2) { 638 String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad line found",line); 639 throw new ParseException(message); 640 } 641 String token = line.substring(0,2); 642 // READ SEQUENCE SECTION 643 if (token.equals(START_SEQUENCE_TAG)) { 644 // from next line, read sequence until // - leave // on stack 645 StringBuffer sb = new StringBuffer(); 646 while (!done) { 647 br.mark(160); 648 line = br.readLine(); 649 if (line.startsWith(END_SEQUENCE_TAG)) { 650 br.reset(); 651 done = true; 652 } else { 653 // create sequence tag->value pair to return, sans numbers 654 sb.append(line.replaceAll("\\d","")); 655 } 656 } 657 section.add(new String[]{START_SEQUENCE_TAG,sb.toString()}); 658 } 659 // READ FEATURE TABLE SECTION 660 else if (token.equals(FEATURE_HEADER_TAG)) { 661 // create dummy feature tag->value pair and add to return set 662 section.add(new String[]{FEATURE_TAG,null}); 663 // drop next FH line 664 line = br.readLine(); // skip next line too - it is also FH 665 // read all FT lines until XX 666 String currentTag = null; 667 StringBuffer currentVal = null; 668 while (!done) { 669 line = br.readLine(); 670 if (line.startsWith(DELIMITER_TAG)) { 671 done = true; 672 // dump current tag if exists 673 if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); 674 } else { 675 // FT lines: FT word value 676 // or FT /word 677 // or FT /db_xref="taxon:3899.... 678 // ......" 
679 line = line.substring(5); // chomp off "FT " 680 if (!line.startsWith(" ")) { 681 // dump current tag if exists 682 if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); 683 // case 1 : word value - splits into key-value on its own 684 String[] parts = line.trim().split("\\s+"); 685 currentTag = parts[0]; 686 currentVal = new StringBuffer(); 687 currentVal.append(parts[1]); 688 } else { 689 line = line.trim(); 690 if (line.startsWith("/")) { 691 // dump current tag if exists 692 if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); 693 // case 2 : /word[=.....] 694 currentVal = new StringBuffer(); 695 int equalIndex = line.indexOf('='); 696 if (equalIndex>=0) { 697 currentTag = line.substring(0, equalIndex); 698 currentVal.append(line.substring(equalIndex+1)); 699 } else { 700 currentTag = line; 701 } 702 } else { 703 // case 3 : ...." 704 currentVal.append("\n"); 705 currentVal.append(line); 706 } 707 } 708 } 709 } 710 } 711 // READ END OF SEQUENCE 712 else if (token.equals(END_SEQUENCE_TAG)) { 713 section.add(new String[]{END_SEQUENCE_TAG,null}); 714 done = true; 715 } 716 // READ DELIMITER TAG 717 else if (token.equals(DELIMITER_TAG)) { 718 section.add(new String[]{DELIMITER_TAG,null}); 719 done = true; 720 } 721 // READ THIRD PARTY ANNOTATION SECTION 722 else if (token.equals(TPA_TAG)) { 723 // exception = don't know how to do TPA yet 724 String message = ParseException.newMessage(this.getClass(),accession,"not set", "Unable to handle TPAs just yet",sectionToString(section)); 725 throw new ParseException(message); 726 } 727 // READ CONTIG SECTION 728 else if (token.equals(CONTIG_TAG)) { 729 // exception = don't know how to do contigs yet 730 String message = ParseException.newMessage(this.getClass(),accession,"not set", "Unable to handle contig assemblies just yet",sectionToString(section)); 731 throw new ParseException(message); 732 } 733 // READ DOCREF 734 else if (token.equals(DATABASE_XREF_TAG)) 
{ 735 section.add(new String[]{DATABASE_XREF_TAG,line.substring(5).trim()}); 736 done = true; 737 } 738 // READ DATE 739 else if (token.equals(DATE_TAG)) { 740 section.add(new String[]{DATE_TAG,line.substring(5).trim()}); 741 done = true; 742 } 743 // READ NORMAL TAG/VALUE SECTION 744 else { 745 // rewind buffer to mark 746 br.reset(); 747 // read token/values until XX 748 String currentTag = null; 749 StringBuffer currentVal = null; 750 while (!done) { 751 line = br.readLine(); 752 if (line.startsWith(DELIMITER_TAG)) { 753 done = true; 754 // dump current tag if exists 755 if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); 756 } else { 757 try { 758 // merge neighbouring repeated tokens by concatting values 759 // return tag->value pairs 760 String tag = line.substring(0,2); 761 String value = line.substring(5); 762 if (currentTag==null || !tag.equals(currentTag)) { 763 // dump current tag if exists 764 if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); 765 // start new tag 766 currentTag = tag; 767 currentVal = new StringBuffer(); 768 currentVal.append(value); 769 } else { 770 currentVal.append("\n"); 771 currentVal.append(value); 772 } 773 } catch (Exception e) { 774 String message = ParseException.newMessage(this.getClass(), accession, "not set","",sectionToString(section)); 775 throw new ParseException(e, message); 776 } 777 } 778 } 779 } 780 } 781 } catch (IOException e) { 782 String message = ParseException.newMessage(this.getClass(),accession,"not set", "Unable to handle TPAs just yet",sectionToString(section)); 783 throw new ParseException(message); 784 } 785 return section; 786 } 787 788 /** 789 * {@inheritDoc} 790 */ 791 public void writeSequence(Sequence seq, PrintStream os) throws IOException { 792 if (this.getPrintStream()==null) this.setPrintStream(os); 793 this.writeSequence(seq, RichObjectFactory.getDefaultNamespace()); 794 } 795 796 /** 797 * {@inheritDoc} 798 */ 799 public void 
writeSequence(Sequence seq, String format, PrintStream os) throws IOException { 800 if (this.getPrintStream()==null) this.setPrintStream(os); 801 this.writeSequence(seq, format, RichObjectFactory.getDefaultNamespace()); 802 } 803 804 /** 805 * {@inheritDoc} 806 * Namespace is ignored as EMBL has no concept of it. 807 */ 808 public void writeSequence(Sequence seq, Namespace ns) throws IOException { 809 this.writeSequence(seq, this.getDefaultFormat(), ns); 810 } 811 812 /** 813 * As per {@link #writeSequence(Sequence, Namespace)}, except 814 * that it also takes a format parameter. This can be any of the formats 815 * defined as constants in this class. 816 * @param seq see {@link #writeSequence(Sequence, Namespace)} 817 * @param format the format to use. 818 * @param ns see {@link #writeSequence(Sequence, Namespace)} 819 * @throws IOException see {@link #writeSequence(Sequence, Namespace)} 820 */ 821 public void writeSequence(Sequence seq, String format, Namespace ns) throws IOException { 822 if (!format.equals(EMBL_FORMAT) && !format.equals(EMBL_PRE87_FORMAT)) 823 throw new IllegalArgumentException("Format "+format+" not recognised."); 824 825 RichSequence rs; 826 try { 827 if (seq instanceof RichSequence) rs = (RichSequence)seq; 828 else rs = RichSequence.Tools.enrich(seq); 829 } catch (ChangeVetoException e) { 830 IOException e2 = new IOException("Unable to enrich sequence"); 831 e2.initCause(e); 832 throw e2; 833 } 834 835 SymbolTokenization tok; 836 try { 837 tok = rs.getAlphabet().getTokenization("token"); 838 } catch (Exception e) { 839 throw new RuntimeException("Unable to get alphabet tokenizer",e); 840 } 841 842 Set<Note> notes = rs.getNoteSet(); 843 String accession = rs.getAccession(); 844 StringBuffer accessions = new StringBuffer(); 845 accessions.append(accession); 846 accessions.append(";"); 847 String cdat = null; 848 String udat = null; 849 String crel = null; 850 String urel = null; 851 String urecv = null; 852 String organelle = null; 853 String 
versionLine = null; 854 String dataClass = "STD"; 855 boolean genomic = false; 856 String moltype = rs.getAlphabet().getName(); 857 for (Iterator<Note> i = notes.iterator(); i.hasNext(); ) { 858 Note n = i.next(); 859 if (n.getTerm().equals(Terms.getDateCreatedTerm())) cdat=n.getValue(); 860 else if (n.getTerm().equals(Terms.getDateUpdatedTerm())) udat=n.getValue(); 861 else if (n.getTerm().equals(Terms.getRelCreatedTerm())) crel=n.getValue(); 862 else if (n.getTerm().equals(Terms.getRelUpdatedTerm())) urel=n.getValue(); 863 else if (n.getTerm().equals(Terms.getRelUpdatedRecordVersionTerm())) urecv=n.getValue(); 864 else if (n.getTerm().equals(Terms.getMolTypeTerm())) moltype=n.getValue(); 865 else if (n.getTerm().equals(Terms.getVersionLineTerm())) versionLine=n.getValue(); 866 else if (n.getTerm().equals(Terms.getGenomicTerm())) genomic = true; 867 else if (n.getTerm().equals(Terms.getDataClassTerm())) dataClass = n.getValue(); 868 else if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) { 869 accessions.append(" "); 870 accessions.append(n.getValue()); 871 accessions.append(";"); 872 } else if (n.getTerm().equals(Terms.getOrganelleTerm())) organelle=n.getValue(); 873 } 874 875 StringBuffer locusLine = new StringBuffer(); 876 // Division cannot be null 877 String div = rs.getDivision(); 878 if(div==null || div.length()==0 || div.length()>3) 879 div = "UNC"; //Unclassified 880 881 if (format.equals(EMBL_FORMAT)) { 882 // accession; SV version; circular/linear; moltype; dataclass; division; length BP. 
883 locusLine.append(rs.getAccession()); 884 locusLine.append("; SV "); 885 locusLine.append(rs.getVersion()); 886 locusLine.append("; "); 887 locusLine.append(rs.getCircular()?"circular":"linear"); 888 locusLine.append("; "); 889 locusLine.append(moltype); 890 locusLine.append("; "); 891 locusLine.append(dataClass); 892 locusLine.append("; "); 893 locusLine.append(div); 894 locusLine.append("; "); 895 locusLine.append(rs.length()); 896 locusLine.append(" BP."); 897 } else if (format.equals(EMBL_PRE87_FORMAT)) { 898 // entryname dataclass; [circular] molecule; division; sequencelength BP. 899 locusLine.append(StringTools.rightPad(rs.getName(),9)); 900 locusLine.append(" standard; "); 901 locusLine.append(rs.getCircular()?"circular ":""); 902 // if it is Ensembl genomic, add that in too 903 if (genomic==true) locusLine.append("genomic "); 904 locusLine.append(moltype); 905 locusLine.append("; "); 906 locusLine.append(div); 907 locusLine.append("; "); 908 locusLine.append(rs.length()); 909 locusLine.append(" BP."); 910 } 911 StringTools.writeKeyValueLine(LOCUS_TAG, locusLine.toString(), 5, this.getLineWidth(), null, LOCUS_TAG, this.getPrintStream()); 912 this.getPrintStream().println(DELIMITER_TAG+" "); 913 914 // accession line 915 StringTools.writeKeyValueLine(ACCESSION_TAG, accessions.toString(), 5, this.getLineWidth(), null, ACCESSION_TAG, this.getPrintStream()); 916 this.getPrintStream().println(DELIMITER_TAG+" "); 917 918 // version line 919 if (format.equals(EMBL_PRE87_FORMAT)) { 920 if (versionLine!=null) StringTools.writeKeyValueLine(VERSION_TAG, versionLine, 5, this.getLineWidth(), null, VERSION_TAG, this.getPrintStream()); 921 else StringTools.writeKeyValueLine(VERSION_TAG, accession+"."+rs.getVersion(), 5, this.getLineWidth(), null, VERSION_TAG, this.getPrintStream()); 922 this.getPrintStream().println(DELIMITER_TAG+" "); 923 } 924 925 // date line 926 StringTools.writeKeyValueLine(DATE_TAG, (cdat==null?udat:cdat)+" (Rel. 
"+(crel==null?"0":crel)+", Created)", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream()); 927 StringTools.writeKeyValueLine(DATE_TAG, udat+" (Rel. "+(urel==null?"0":urel)+", Last updated, Version "+(urecv==null?"0":urecv)+")", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream()); 928 this.getPrintStream().println(DELIMITER_TAG+" "); 929 930 // definition line 931 StringTools.writeKeyValueLine(DEFINITION_TAG, rs.getDescription(), 5, this.getLineWidth(), null, DEFINITION_TAG, this.getPrintStream()); 932 this.getPrintStream().println(DELIMITER_TAG+" "); 933 934 // keywords line 935 StringBuffer keywords = new StringBuffer(); 936 for (Iterator<Note> n = notes.iterator(); n.hasNext(); ) { 937 Note nt = n.next(); 938 if (nt.getTerm().equals(Terms.getKeywordTerm())) { 939 if (keywords.length()>0) keywords.append("; "); 940 keywords.append(nt.getValue()); 941 } 942 } 943 if (keywords.length()>0) { 944 keywords.append("."); 945 StringTools.writeKeyValueLine(KEYWORDS_TAG, keywords.toString(), 5, this.getLineWidth(), null, KEYWORDS_TAG, this.getPrintStream()); 946 this.getPrintStream().println(DELIMITER_TAG+" "); 947 } else { 948 this.getPrintStream().println(KEYWORDS_TAG+" ."); 949 this.getPrintStream().println(DELIMITER_TAG+" "); 950 } 951 952 // source line (from taxon) 953 // organism line 954 NCBITaxon tax = rs.getTaxon(); 955 if (tax!=null) { 956 StringTools.writeKeyValueLine(SOURCE_TAG, tax.getDisplayName(), 5, this.getLineWidth(), null, SOURCE_TAG, this.getPrintStream()); 957 StringTools.writeKeyValueLine(ORGANISM_TAG, tax.getNameHierarchy(), 5, this.getLineWidth(), null, SOURCE_TAG, this.getPrintStream()); 958 if (organelle!=null) StringTools.writeKeyValueLine(ORGANELLE_TAG, organelle, 5, this.getLineWidth(), null, ORGANELLE_TAG, this.getPrintStream()); 959 this.getPrintStream().println(DELIMITER_TAG+" "); 960 } 961 962 // references - rank (bases x to y) 963 for (Iterator<RankedDocRef> r = rs.getRankedDocRefs().iterator(); r.hasNext(); ) { 964 
RankedDocRef rdr = r.next(); 965 DocRef d = rdr.getDocumentReference(); 966 // RN, RC, RP, RX, RG, RA, RT, RL 967 StringTools.writeKeyValueLine(REFERENCE_TAG, "["+rdr.getRank()+"]", 5, this.getLineWidth(), null, REFERENCE_TAG, this.getPrintStream()); 968 StringTools.writeKeyValueLine(REMARK_TAG, d.getRemark(), 5, this.getLineWidth(), null, REMARK_TAG, this.getPrintStream()); 969 Integer rstart = rdr.getStart(); 970 if (rstart==null) rstart = new Integer(1); 971 Integer rend = rdr.getEnd(); 972 if (rend==null) rend = new Integer(rs.length()); 973 StringTools.writeKeyValueLine(REFERENCE_POSITION_TAG, rstart+"-"+rend, 5, this.getLineWidth(), null, REFERENCE_POSITION_TAG, this.getPrintStream()); 974 CrossRef c = d.getCrossref(); 975 if (c!=null) StringTools.writeKeyValueLine(REFERENCE_XREF_TAG, c.getDbname()+"; "+c.getAccession()+".", 5, this.getLineWidth(), null, REFERENCE_XREF_TAG, this.getPrintStream()); 976 List<DocRefAuthor> auths = d.getAuthorList(); 977 for (Iterator<DocRefAuthor> j = auths.iterator(); j.hasNext(); ) { 978 DocRefAuthor a = j.next(); 979 if (a.isConsortium()) { 980 StringTools.writeKeyValueLine(CONSORTIUM_TAG, a+";", 5, this.getLineWidth(), null, CONSORTIUM_TAG, this.getPrintStream()); 981 j.remove(); 982 } 983 } 984 if (!auths.isEmpty()) StringTools.writeKeyValueLine(AUTHORS_TAG, DocRefAuthor.Tools.generateAuthorString(auths, true)+";", 5, this.getLineWidth(), null, AUTHORS_TAG, this.getPrintStream()); 985 else StringTools.writeKeyValueLine(AUTHORS_TAG, ";", 5, this.getLineWidth(), null, AUTHORS_TAG, this.getPrintStream()); 986 if (d.getTitle()!=null && d.getTitle().length()!=0) StringTools.writeKeyValueLine(TITLE_TAG, "\""+d.getTitle()+"\";", 5, this.getLineWidth(), null, TITLE_TAG, this.getPrintStream()); 987 else StringTools.writeKeyValueLine(TITLE_TAG, ";", 5, this.getLineWidth(), null, TITLE_TAG, this.getPrintStream()); 988 StringTools.writeKeyValueLine(LOCATOR_TAG, d.getLocation()+".", 5, this.getLineWidth(), null, LOCATOR_TAG, 
this.getPrintStream()); 989 this.getPrintStream().println(DELIMITER_TAG+" "); 990 } 991 992 // db references - ranked 993 for (Iterator<RankedCrossRef> r = rs.getRankedCrossRefs().iterator(); r.hasNext(); ) { 994 RankedCrossRef rcr = r.next(); 995 CrossRef c = rcr.getCrossRef(); 996 Set<Note> noteset = c.getNoteSet(); 997 StringBuffer sb = new StringBuffer(); 998 sb.append(c.getDbname()); 999 sb.append("; "); 1000 sb.append(c.getAccession()); 1001 boolean hasSecondary = false; 1002 for (Iterator<Note> i = noteset.iterator(); i.hasNext(); ) { 1003 Note n = i.next(); 1004 if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) { 1005 sb.append("; "); 1006 sb.append(n.getValue()); 1007 hasSecondary = true; 1008 } 1009 } 1010 //if (!hasSecondary) sb.append("; -"); 1011 //sb.append("."); 1012 if (!hasSecondary) sb.append(";"); 1013 else sb.append("."); 1014 StringTools.writeKeyValueLine(DATABASE_XREF_TAG, sb.toString(), 5, this.getLineWidth(), null, DATABASE_XREF_TAG, this.getPrintStream()); 1015 } 1016 if (!rs.getRankedCrossRefs().isEmpty()) 1017 this.getPrintStream().println(DELIMITER_TAG+" "); 1018 1019 // comments - if any 1020 if (!rs.getComments().isEmpty()) { 1021 StringBuffer sb = new StringBuffer(); 1022 for (Iterator<Comment> i = rs.getComments().iterator(); i.hasNext(); ) { 1023 Comment c = i.next(); 1024 sb.append(c.getComment()); 1025 if (i.hasNext()) sb.append("\n"); 1026 } 1027 StringTools.writeKeyValueLine(COMMENT_TAG, sb.toString(), 5, this.getLineWidth(), null, COMMENT_TAG, this.getPrintStream()); 1028 this.getPrintStream().println(DELIMITER_TAG+" "); 1029 } 1030 1031 this.getPrintStream().println(FEATURE_HEADER_TAG+" Key Location/Qualifiers"); 1032 this.getPrintStream().println(FEATURE_HEADER_TAG+" "); 1033 // feature_type location 1034 for (Iterator i = rs.getFeatureSet().iterator(); i.hasNext(); ) { 1035 RichFeature f = (RichFeature)i.next(); 1036 StringTools.writeKeyValueLine(FEATURE_TAG+" "+f.getTypeTerm().getName(), 
GenbankLocationParser.writeLocation((RichLocation)f.getLocation()), 21, this.getLineWidth(), ",", FEATURE_TAG, this.getPrintStream()); 1037 for (Iterator<Note> j = f.getNoteSet().iterator(); j.hasNext(); ) { 1038 Note n = j.next(); 1039 // /key="val" or just /key if val=="" 1040 if (n.getValue()==null || n.getValue().length()==0) StringTools.writeKeyValueLine(FEATURE_TAG, "/"+n.getTerm().getName(), 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream()); 1041 else StringTools.writeKeyValueLine(FEATURE_TAG, "/"+n.getTerm().getName()+"=\""+n.getValue()+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream()); 1042 } 1043 // add-in to source feature only organism and db_xref="taxon:xyz" where present 1044 if (f.getType().equals("source") && tax!=null) { 1045 String displayName = tax.getDisplayName(); 1046 if (displayName.indexOf('(')>-1) displayName = displayName.substring(0, displayName.indexOf('(')).trim(); 1047 StringTools.writeKeyValueLine(FEATURE_TAG, "/organism=\""+displayName+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream()); 1048 StringTools.writeKeyValueLine(FEATURE_TAG, "/db_xref=\"taxon:"+tax.getNCBITaxID()+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream()); 1049 } 1050 // add-in other dbxrefs where present 1051 for (Iterator<RankedCrossRef> j = f.getRankedCrossRefs().iterator(); j.hasNext(); ) { 1052 RankedCrossRef rcr = j.next(); 1053 CrossRef cr = rcr.getCrossRef(); 1054 StringTools.writeKeyValueLine(FEATURE_TAG, "/db_xref=\""+cr.getDbname()+":"+cr.getAccession()+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream()); 1055 } 1056 } 1057 this.getPrintStream().println(DELIMITER_TAG+" "); 1058 1059 // SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; 1060 int aCount = 0; 1061 int cCount = 0; 1062 int gCount = 0; 1063 int tCount = 0; 1064 int oCount = 0; 1065 for (int i = 1; i <= rs.length(); i++) { 1066 char c; 1067 try { 1068 c = 
tok.tokenizeSymbol(rs.symbolAt(i)).charAt(0); 1069 } catch (Exception e) { 1070 throw new RuntimeException("Unable to get symbol at position "+i,e); 1071 } 1072 switch (c) { 1073 case 'a': case 'A': 1074 aCount++; 1075 break; 1076 case 'c': case 'C': 1077 cCount++; 1078 break; 1079 case 'g': case 'G': 1080 gCount++; 1081 break; 1082 case 't': case 'T': 1083 tCount++; 1084 break; 1085 default: 1086 oCount++; 1087 } 1088 } 1089 this.getPrintStream().print(START_SEQUENCE_TAG+" Sequence "+rs.length()+" BP; "); 1090 this.getPrintStream().print(aCount + " A; "); 1091 this.getPrintStream().print(cCount + " C; "); 1092 this.getPrintStream().print(gCount + " G; "); 1093 this.getPrintStream().print(tCount + " T; "); 1094 this.getPrintStream().println(oCount + " other;"); 1095 1096 // sequence stuff 1097 Symbol[] syms = (Symbol[])rs.toList().toArray(new Symbol[0]); 1098 int lineLen = 0; 1099 int symCount = 0; 1100 this.getPrintStream().print(" "); 1101 for (int i = 0; i < syms.length; i++) { 1102 if (symCount % 60 == 0 && symCount>0) { 1103 this.getPrintStream().print(StringTools.leftPad(""+symCount,10)); 1104 this.getPrintStream().print("\n "); 1105 lineLen = 0; 1106 } 1107 if (symCount % 10 == 0) { 1108 this.getPrintStream().print(" "); 1109 lineLen++; 1110 } 1111 try { 1112 this.getPrintStream().print(tok.tokenizeSymbol(syms[i])); 1113 } catch (IllegalSymbolException e) { 1114 throw new RuntimeException("Found illegal symbol: "+syms[i]); 1115 } 1116 symCount++; 1117 lineLen++; 1118 } 1119 this.getPrintStream().print(StringTools.leftPad(""+symCount,(66-lineLen)+10)); 1120 this.getPrintStream().print("\n"); 1121 this.getPrintStream().println(END_SEQUENCE_TAG); 1122 } 1123 1124 /** 1125 * {@inheritDoc} 1126 */ 1127 public String getDefaultFormat() { 1128 return EMBL_FORMAT; 1129 } 1130 1131 1132 /** 1133 * Converts the current parse section to a String. Useful for debugging. 
1134 */ 1135 String sectionToString(List section){ 1136 StringBuffer parseBlock = new StringBuffer(); 1137 for(Iterator i = section.listIterator(); i.hasNext();){ 1138 String[] part = (String[])i.next(); 1139 for(int x = 0; x < part.length; x++){ 1140 parseBlock.append(part[x]); 1141 if(x == 0){ 1142 parseBlock.append(" "); //the gap will have been trimmed 1143 } 1144 } 1145 } 1146 return parseBlock.toString(); 1147 } 1148} 1149