001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojavax.bio.seq.io;
023
024import java.io.BufferedInputStream;
025import java.io.BufferedReader;
026import java.io.File;
027import java.io.FileReader;
028import java.io.IOException;
029import java.io.InputStreamReader;
030import java.io.PrintStream;
031import java.util.ArrayList;
032import java.util.Iterator;
033import java.util.List;
034import java.util.Set;
035import java.util.TreeSet;
036import java.util.regex.Matcher;
037import java.util.regex.Pattern;
038
039import org.biojava.bio.seq.Sequence;
040import org.biojava.bio.seq.io.ParseException;
041import org.biojava.bio.seq.io.SeqIOListener;
042import org.biojava.bio.seq.io.SymbolTokenization;
043import org.biojava.bio.symbol.IllegalSymbolException;
044import org.biojava.bio.symbol.SimpleSymbolList;
045import org.biojava.bio.symbol.Symbol;
046import org.biojava.bio.symbol.SymbolList;
047import org.biojava.utils.ChangeVetoException;
048import org.biojavax.Comment;
049import org.biojavax.CrossRef;
050import org.biojavax.DocRef;
051import org.biojavax.DocRefAuthor;
052import org.biojavax.Namespace;
053import org.biojavax.Note;
054import org.biojavax.RankedCrossRef;
055import org.biojavax.RankedDocRef;
056import org.biojavax.RichAnnotation;
057import org.biojavax.RichObjectFactory;
058import org.biojavax.SimpleComment;
059import org.biojavax.SimpleCrossRef;
060import org.biojavax.SimpleDocRef;
061import org.biojavax.SimpleDocRefAuthor;
062import org.biojavax.SimpleNote;
063import org.biojavax.SimpleRankedCrossRef;
064import org.biojavax.SimpleRankedDocRef;
065import org.biojavax.SimpleRichAnnotation;
066import org.biojavax.bio.seq.RichFeature;
067import org.biojavax.bio.seq.RichLocation;
068import org.biojavax.bio.seq.RichSequence;
069import org.biojavax.bio.taxa.NCBITaxon;
070import org.biojavax.bio.taxa.SimpleNCBITaxon;
071import org.biojavax.ontology.ComparableTerm;
072import org.biojavax.utils.StringTools;
073
074/**
075 * Format reader for EMBL files. This version of EMBL format will generate
076 * and write RichSequence objects. Loosely Based on code from the old, deprecated,
077 * org.biojava.bio.seq.io.EmblLikeFormat object.
078 * <p>
079 * This format will read both Pre-87 and 87+ versions of EMBL. It will also write
080 * them both. By default, it will write the most recent version. If you want
081 * an earlier one, you must specify the format by passing one of the constants
082 * defined in this class to {@link #writeSequence(Sequence, String, Namespace)}.
083 *
084 * @author Richard Holland
085 * @author Jolyon Holdstock
086 * @author Mark Schreiber
087 * @since 1.5
088 */
089public class EMBLFormat extends RichSequenceFormat.HeaderlessFormat {
090    
    // Register this format with the format auto-guesser. Doing this in a
    // static initializer means the registration runs once, when the class
    // is initialized, before any instance is used.
    static {
        RichSequence.IOTools.registerFormat(EMBLFormat.class);
    }
095    
    /**
     * The name of the Pre-87 format
     */
    public static final String EMBL_PRE87_FORMAT = "EMBL_PRE87";
    
    /**
     * The name of the current format
     */
    public static final String EMBL_FORMAT = "EMBL";
    
    // Two-character line codes. readRichSequence compares the key of each
    // section (and of the sub-lines inside a section) against these values to
    // decide how the accompanying text is interpreted.
    protected static final String LOCUS_TAG = "ID";
    protected static final String ACCESSION_TAG = "AC";
    protected static final String VERSION_TAG = "SV";
    protected static final String DEFINITION_TAG = "DE";
    protected static final String DATE_TAG = "DT";
    protected static final String DATABASE_XREF_TAG = "DR";
    protected static final String SOURCE_TAG = "OS";
    protected static final String ORGANISM_TAG = "OC";
    protected static final String ORGANELLE_TAG = "OG";
    // Codes for the lines that make up a literature reference block.
    protected static final String REFERENCE_TAG = "RN";
    protected static final String REFERENCE_POSITION_TAG = "RP";
    protected static final String REFERENCE_XREF_TAG = "RX";
    protected static final String AUTHORS_TAG = "RA";
    protected static final String CONSORTIUM_TAG = "RG";
    protected static final String TITLE_TAG = "RT";
    protected static final String LOCATOR_TAG = "RL";
    protected static final String REMARK_TAG = "RC";
    protected static final String KEYWORDS_TAG = "KW";
    protected static final String COMMENT_TAG = "CC";
    protected static final String FEATURE_HEADER_TAG = "FH";
    protected static final String FEATURE_TAG = "FT";
    protected static final String CONTIG_TAG = "CO";
    protected static final String TPA_TAG = "AH";
    protected static final String START_SEQUENCE_TAG = "SQ";
    protected static final String DELIMITER_TAG = "XX";
    // Key of the section that terminates an entry; readRichSequence stops
    // reading sections once it has seen this key.
    protected static final String END_SEQUENCE_TAG = "//";
    
    // the date pattern
    // date (Rel. N, Created)
    // date (Rel. N, Last updated, Version M)
    // Groups as used by readRichSequence: 1 = date, 3 = release number N,
    // 4 = the text after the comma up to the first digit or ')',
    // 5 = trailing digits (the version M, empty for "Created").
    // The whole parenthesised part is optional, so groups 3-5 may be null.
    protected static final Pattern dp = Pattern.compile("([^\\s]+)\\s*(\\(Rel\\.\\s+(\\d+), ([^\\)\\d]+)(\\d*)\\))?$");
    // locus line
    // Current layout. Groups: 1 = name/primary accession, 2 = sequence version,
    // 3 = linear|circular, 4 = molecule type, 5 = data class, 6 = division,
    // 7 = sequence length, 8 = BP|AA.
    protected static final Pattern lp = Pattern.compile("^(\\S+);\\s+SV\\s+(\\d+);\\s+(linear|circular);\\s+(\\S+\\s?\\S+?);\\s+(\\S+);\\s+(\\S+);\\s+(\\d+)\\s+(BP|AA)\\.$");
    // Pre-87 layout. Groups: 1 = name, 2 = "circular" if present,
    // 3 = "genomic" if present, 4 = molecule type, 5 = division.
    protected static final Pattern lpPre87 = Pattern.compile("^(\\S+)\\s+standard;\\s+(circular)?\\s*(genomic)?\\s*(\\S+);\\s+(\\S+);\\s+\\d+\\s+BP\\.$");
    // version line
    // accession.version - group 1 = accession, group 2 = version number.
    protected static final Pattern vp = Pattern.compile("^(\\S+?)\\.(\\d+)$");
    // reference position line
    // One or more start-end ranges; only the first is captured:
    // group 1 = start, group 3 = end (null when no "-end" part is present).
    protected static final Pattern rpp = Pattern.compile("^(\\d+)(-(\\d+))?,?(\\s\\d+-\\d+,?)*$");
    // dbxref line
    // dbname:accession - group 1 = database name, group 2 = accession.
    protected static final Pattern dbxp = Pattern.compile("^([^:]+):(\\S+)$");
    
    // File names accepted by canRead(File) without looking at the content:
    // any name containing ".em" or ".dat" (the dot is written as a regex
    // unicode escape in the pattern).
    protected static final Pattern readableFileNames = Pattern.compile(".*\\u002e(em|dat).*");
    // A first line that starts with the ID line code.
    protected static final Pattern headerLine = Pattern.compile("^ID.*");
    
    // State for the entry currently being parsed. readRichSequence resets all
    // three at the start of every entry. Because they are instance fields, a
    // single EMBLFormat object presumably should not parse two streams at the
    // same time - NOTE(review): confirm against callers.
    private NCBITaxon tax = null;
    private String organism = null;
    private String accession = null;
153    
154    /**
155     * Implements some EMBL-specific terms.
156     */
157    public static class Terms extends RichSequence.Terms {
158        
159        /**
160         * Getter for the RelUpdatedRecordVersion term
161         * @return The RelUpdatedRecordVersion Term
162         */
163        public static ComparableTerm getRelUpdatedRecordVersionTerm() {
164            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("RelUpdatedRecordVersion");
165        }
166        
167        /**
168         * Getter for the EMBL term
169         * @return The EMBL Term
170         */
171        public static ComparableTerm getEMBLTerm() {
172            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("EMBL");
173        }
174        
175        /**
176         * Getter for the Ensembl-specific 'genomic' term
177         * @return The genomic Term
178         */
179        public static ComparableTerm getGenomicTerm() {
180            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("genomic");
181        }
182        
183        /**
184         * Getter for the Ensembl-specific 'versionLine' term
185         * @return The version line Term
186         */
187        public static ComparableTerm getVersionLineTerm() {
188            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("versionLine");
189        }
190        
191        /**
192         * Getter for the Ensembl-specific 'dataClass' term
193         * @return The data class Term
194         */
195        public static ComparableTerm getDataClassTerm() {
196            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("dataClass");
197        }
198    }
199    
200    /**
201     * {@inheritDoc}
202     * A file is in EMBL format if its name contains the word eem or edat, or the first line matches
203     * the EMBL format for the ID line.
204     */
205    public boolean canRead(File file) throws IOException {
206        if (readableFileNames.matcher(file.getName()).matches()) return true;
207        BufferedReader br = new BufferedReader(new FileReader(file));
208        String firstLine = br.readLine();
209        boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches() &&
210                (lp.matcher(firstLine.substring(3).trim()).matches() ||
211                lpPre87.matcher(firstLine.substring(3).trim()).matches()
212                );
213        br.close();
214        return readable;
215    }
216    
217    /**
218     * {@inheritDoc}
219     * Always returns a DNA tokenizer.
220     */
221    public SymbolTokenization guessSymbolTokenization(File file) throws IOException {
222        return RichSequence.IOTools.getDNAParser();
223    }
224    
225    /**
226     * {@inheritDoc}
227     * A stream is in EMBL format if its first line matches the EMBL format for the ID line.
228     */
229    public boolean canRead(BufferedInputStream stream) throws IOException {
230        stream.mark(2000); // some streams may not support this
231        BufferedReader br = new BufferedReader(new InputStreamReader(stream));
232        String firstLine = br.readLine();
233        boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches() &&
234                (lp.matcher(firstLine.substring(3).trim()).matches() ||
235                lpPre87.matcher(firstLine.substring(3).trim()).matches()
236                );
237        // don't close the reader as it'll close the stream too.
238        // br.close();
239        stream.reset();
240        return readable;
241    }
242    
243    /**
244     * {@inheritDoc}
245     * Always returns a DNA tokenizer.
246     */
247    public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException {
248        return RichSequence.IOTools.getDNAParser();
249    }
250    
251    /**
252     * {@inheritDoc}
253     */
254    public boolean readSequence(BufferedReader reader,
255            SymbolTokenization symParser,
256            SeqIOListener listener)
257            throws IllegalSymbolException, IOException, ParseException {
258        if (!(listener instanceof RichSeqIOListener)) throw new IllegalArgumentException("Only accepting RichSeqIOListeners today");
259        return this.readRichSequence(reader,symParser,(RichSeqIOListener)listener,null);
260    }
261    
262    /**
263     * {@inheritDoc}
264     */
265    public boolean readRichSequence(BufferedReader reader,
266            SymbolTokenization symParser,
267            RichSeqIOListener rlistener,
268            Namespace ns)
269            throws IllegalSymbolException, IOException, ParseException {
270        tax = null;
271        organism = null;
272        accession = null;
273        boolean hasAnotherSequence = true;
274        //boolean hasInternalWhitespace = false;
275        
276        rlistener.startSequence();
277        
278        if (ns==null) ns=RichObjectFactory.getDefaultNamespace();
279        rlistener.setNamespace(ns);
280        
281        // Get an ordered list of key->value pairs in array-tuples
282        String sectionKey = null;
283        do {
284            List section = this.readSection(reader);
285            sectionKey = ((String[])section.get(0))[0];
286            if(sectionKey == null){
287                
288                String message = ParseException.newMessage(this.getClass(), accession, "No section key", "Not set", sectionToString(section));
289                throw new ParseException(message);
290            }
291            // process section-by-section
292            if (sectionKey.equals(LOCUS_TAG)) {
293                // entryname  dataclass; [circular] molecule; division; sequencelength BP.
294                String loc = ((String[])section.get(0))[1];
295                Matcher m = lp.matcher(loc);
296                Matcher mPre87 = lpPre87.matcher(loc);
297                if (m.matches()) {
298                    // first token is both name and primary accession
299                    rlistener.setName(m.group(1));
300                    rlistener.setAccession(m.group(1));
301                    // second token is version
302                    rlistener.setVersion(Integer.parseInt(m.group(2)));
303                    // third token is circular/linear
304                    rlistener.setCircular(m.group(3).equals("circular"));
305                    // fourth token is moltype
306                    rlistener.addSequenceProperty(Terms.getMolTypeTerm(),m.group(4));
307                    // fifth token is data class
308                    rlistener.addSequenceProperty(Terms.getDataClassTerm(),m.group(5));
309                    // sixth token is taxonomic division
310                    rlistener.setDivision(m.group(6));
311                    // seventh token is sequence length, which is ignored
312                    // as it is calculated from the sequence data later.
313                } else if (mPre87.matches()) {
314                    rlistener.setName(mPre87.group(1));
315                    if (mPre87.group(3)!=null) {
316                        // add annotation for 'genomic' (Ensembl-specific term)
317                        rlistener.addSequenceProperty(Terms.getGenomicTerm(),null);
318                    }
319                    rlistener.addSequenceProperty(Terms.getMolTypeTerm(),mPre87.group(4));
320                    rlistener.setDivision(mPre87.group(5));
321                    // Optional extras
322                    String circular = mPre87.group(2);
323                    if (circular!=null) rlistener.setCircular(true);
324                } else {
325                    String message = ParseException.newMessage(this.getClass(),accession,"Not Set","Bad ID line found", sectionToString(section));
326                    throw new ParseException(message);
327                }
328            } else if (sectionKey.equals(DEFINITION_TAG)) {
329                rlistener.setDescription(((String[])section.get(0))[1]);
330            } else if (sectionKey.equals(SOURCE_TAG)) {
331                // only interested in organelle sub-tag
332                for (int i = 1; i < section.size(); i++) {
333                    sectionKey = ((String[])section.get(i))[0];
334                    if (sectionKey.equals(ORGANELLE_TAG)) {
335                        rlistener.addSequenceProperty(Terms.getOrganelleTerm(), ((String[])section.get(i))[1].trim());
336                        break; // skip out of for loop once found
337                    }
338                }
339            } else if (sectionKey.equals(DATE_TAG)) {
340                String chunk = ((String[])section.get(0))[1].trim();
341                Matcher dm = dp.matcher(chunk);
342                if (dm.matches()) {
343                    String date = dm.group(1);
344                    String rel = dm.group(3);
345                    String type = dm.group(4);
346                    if (type.equals("Created")) {
347                        rlistener.addSequenceProperty(Terms.getDateCreatedTerm(), date);
348                        rlistener.addSequenceProperty(Terms.getRelCreatedTerm(), rel);
349                    } else if (type.equals("Last updated, Version ")) {
350                        rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(), date);
351                        rlistener.addSequenceProperty(Terms.getRelUpdatedTerm(), rel);
352                        rlistener.addSequenceProperty(Terms.getRelUpdatedRecordVersionTerm(), dm.group(5));
353                    } else {
354                        String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad date type found",sectionToString(section));
355                        throw new ParseException(message);
356                    }
357                } else {
358                    String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad date line found",sectionToString(section));
359                    throw new ParseException(message);
360                    
361                }
362            } else if (sectionKey.equals(ACCESSION_TAG)) {
363                // if multiple accessions, store only first as accession,
364                // and store rest in annotation
365                String[] accs = ((String[])section.get(0))[1].split(";");
366                accession = accs[0].trim();
367                rlistener.setAccession(accession);
368                for (int i = 1; i < accs.length; i++) {
369                    rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accs[i].trim());
370                }
371            } else if (sectionKey.equals(VERSION_TAG)) {
372                String ver = ((String[])section.get(0))[1];
373                Matcher m = vp.matcher(ver);
374                if (m.matches()) {
375                    String verAcc = m.group(1);
376                    if (!accession.equals(verAcc)) {
377                        // the version refers to a different accession!
378                        // believe the version line, and store the original
379                        // accession away in the additional accession set
380                        rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accession);
381                        accession = verAcc;
382                        rlistener.setAccession(accession);
383                    }
384                    rlistener.setVersion(Integer.parseInt(m.group(2)));
385                } else {
386                    rlistener.addSequenceProperty(Terms.getVersionLineTerm(),ver);
387                }
388            } else if (sectionKey.equals(KEYWORDS_TAG)) {
389                String val = ((String[])section.get(0))[1];
390                val = val.substring(0,val.length()-1); // chomp dot
391                val = val.replace('\n',' '); //remove newline
392                String[] kws = val.split(";");
393                for (int i = 0; i < kws.length; i++) {
394                    String kw = kws[i].trim();
395                    if (kw.length()==0) continue;
396                    rlistener.addSequenceProperty(Terms.getKeywordTerm(), kw);
397                }
398            } else if (sectionKey.equals(DATABASE_XREF_TAG)) {
399                String val = ((String[])section.get(0))[1];
400                val = val.substring(0,val.length()-1); // chomp dot
401                // database_identifier; primary_identifier; secondary_identifier....
402                String[] parts = val.split(";");
403                // construct a DBXREF out of the dbname part[0] and accession part[1]
404                CrossRef crossRef = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{parts[0].trim(),parts[1].trim(), new Integer(0)});
405                // assign remaining bits of info as annotations
406                for (int j = 2; j < parts.length; j++) {
407                    Note note = new SimpleNote(Terms.getAdditionalAccessionTerm(),parts[j].trim(),j-1);
408                    try {
409                        crossRef.getRichAnnotation().addNote(note);
410                    } catch (ChangeVetoException ce) {
411                        String message = ParseException.newMessage(this.getClass(),accession,"not set", "Could not annotate identifier terms",sectionToString(section));
412                        ParseException pe = new ParseException(message);
413                        pe.initCause(ce);
414                        throw pe;
415                    }
416                }
417                RankedCrossRef rcrossRef = new SimpleRankedCrossRef(crossRef, 0);
418                rlistener.setRankedCrossRef(rcrossRef);
419            } else if (sectionKey.equals(REFERENCE_TAG) && !this.getElideReferences()) {
420                // first line of section has rank and location
421                String refrank = ((String[])section.get(0))[1];
422                int ref_rank = Integer.parseInt(refrank.substring(1,refrank.length()-1));
423                int ref_start = -999;
424                int ref_end = -999;
425                // rest can be in any order
426                String consortium = null;
427                String authors = "";
428                String title = null;
429                String locator = null;
430                String pubmed = null;
431                String medline = null;
432                String doi = null;
433                String remark = null;
434                for (int i = 1; i < section.size(); i++) {
435                    String key = ((String[])section.get(i))[0];
436                    String val = ((String[])section.get(i))[1];
437                    if (key.equals(AUTHORS_TAG)) {
438                        if (val.endsWith(";")) val = val.substring(0,val.length()-1); // chomp semicolon
439                        authors = val.replace('\n',' '); //see #2276
440                    }
441                    if (key.equals(CONSORTIUM_TAG)) {
442                        if (val.endsWith(";")) val = val.substring(0,val.length()-1); // chomp semicolon
443                        consortium = val.replace('\n',' '); //see #2276
444                    }
445                    if (key.equals(TITLE_TAG)) {
446                        if (val.length()>1) {
447                            if (val.endsWith(";")) val = val.substring(0,val.length()-1); // chomp semicolon
448                            if (val.endsWith("\"")) val = val.substring(1,val.length()-1); // chomp quotes
449                            title = val.replace('\n',' '); //see #2276
450                        } else title=null; // single semi-colon indicates no title
451                    }
452                    if (key.equals(LOCATOR_TAG)) {
453                        if (val.endsWith(".")) val = val.substring(0,val.length()-1); // chomp dot
454                        locator = val.replace('\n',' '); //see #2276
455                    }
456                    if (key.equals(REFERENCE_XREF_TAG)) {
457                        // database_identifier; primary_identifier.
458                        String[] refs = val.split("\\.(\\s+|$)");
459                        for (int j = 0 ; j < refs.length; j++) {
460                            if (refs[j].trim().length()==0) continue;
461                            String[] parts = refs[j].split(";");
462                            String db = parts[0];
463                            String ref = parts[1].trim();
464                            if (db.equalsIgnoreCase(Terms.PUBMED_KEY)) pubmed = ref;
465                            else if (db.equalsIgnoreCase(Terms.MEDLINE_KEY)) medline = ref;
466                            else if (db.equalsIgnoreCase(Terms.DOI_KEY)) doi = ref;
467                        }
468                    }
469                    if (key.equals(REMARK_TAG)) remark = val.replace('\n',' '); //see #2276
470                    if (key.equals(REFERENCE_POSITION_TAG)) {
471                        // only the first group is taken
472                        // if we have multiple lines, only the last line is taken
473                        Matcher m = rpp.matcher(val);
474                        if (m.matches()) {
475                            ref_start = Integer.parseInt(m.group(1));
476                            if(m.group(2) != null)
477                                ref_end = Integer.parseInt(m.group(3));
478                        } else {
479                            String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad reference line found",sectionToString(section));
480                            throw new ParseException(message);
481                        }
482                    }
483                }
484                // create the docref object
485                try {
486                    List<DocRefAuthor> authSet = DocRefAuthor.Tools.parseAuthorString(authors);
487                    if (consortium!=null) authSet.add(new SimpleDocRefAuthor(consortium, true, false));
488                    DocRef dr = (DocRef)RichObjectFactory.getObject(SimpleDocRef.class,new Object[]{authSet,locator,title});
489                    // assign either the pubmed or medline to the docref - medline gets priority, then pubmed, then doi
490                    if (medline!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.MEDLINE_KEY, medline, new Integer(0)}));
491                    else if (pubmed!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.PUBMED_KEY, pubmed, new Integer(0)}));
492                    else if (doi!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.DOI_KEY, doi, new Integer(0)}));
493                    // assign the remarks
494                    if (!this.getElideComments()) dr.setRemark(remark);
495                    // assign the docref to the bioentry
496                    RankedDocRef rdr = new SimpleRankedDocRef(dr,
497                            (ref_start != -999 ? new Integer(ref_start) : null),
498                            (ref_end != -999 ? new Integer(ref_end) : null),
499                            ref_rank);
500                    rlistener.setRankedDocRef(rdr);
501                } catch (ChangeVetoException e) {
502                    String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section));
503                    throw new ParseException(e, message);
504                }
505            } else if (sectionKey.equals(COMMENT_TAG) && !this.getElideComments()) {
506                // Set up some comments
507                rlistener.setComment(((String[])section.get(0))[1]);
508            } else if (sectionKey.equals(FEATURE_TAG) && !this.getElideFeatures()) {
509                // starting from second line of input, start a new feature whenever we come across
510                // a key that does not start with /
511                boolean seenAFeature = false;
512                int rcrossrefCount = 0;
513                for (int i = 1 ; i < section.size(); i++) {
514                    String key = ((String[])section.get(i))[0];
515                    String val = ((String[])section.get(i))[1];
516                    if (key.startsWith("/")) {
517                        key = key.substring(1); // strip leading slash
518                        val = val.replaceAll("\\s*[\\n\\r]+\\s*"," ").trim();
519                        if (val.startsWith("\"")) val = val.substring(1,val.length()-1); // strip quotes
520                        // parameter on old feature
521                        if (key.equalsIgnoreCase("db_xref")) {
522                            Matcher m = dbxp.matcher(val);
523                            if (m.matches()) {
524                                String dbname = m.group(1);
525                                String raccession = m.group(2);
526                                if (dbname.equalsIgnoreCase("taxon")) {
527                                    // Set the Taxon instead of a dbxref
528                                    tax = (NCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class, new Object[]{Integer.valueOf(raccession)});
529                                    rlistener.setTaxon(tax);
530                                    try {
531                                        if (organism!=null) tax.addName(NCBITaxon.SCIENTIFIC,organism);
532                                    } catch (ChangeVetoException e) {
533                                        String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section));
534                                        throw new ParseException(e, message);
535                                    }
536                                } else {
537                                    try {
538                                        CrossRef cr = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{dbname, raccession, new Integer(0)});
539                                        RankedCrossRef rcr = new SimpleRankedCrossRef(cr, ++rcrossrefCount);
540                                        rlistener.getCurrentFeature().addRankedCrossRef(rcr);
541                                    } catch (ChangeVetoException e) {
542                                        String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section));
543                                        throw new ParseException(e, message);
544                                    }
545                                }
546                            } else {
547                                String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad dbxref found",sectionToString(section));
548                                throw new ParseException(message);
549                            }
550                        } else if (key.equalsIgnoreCase("organism")) {
551                            try {
552                                organism = val;
553                                if (tax!=null) tax.addName(NCBITaxon.SCIENTIFIC,organism);
554                            } catch (ChangeVetoException e) {
555                                String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section));
556                                throw new ParseException(message);
557                            }
558                        } else {
559                            if (key.equalsIgnoreCase("translation")) {
560                                // strip spaces from sequence
561                                val = val.replaceAll("\\s+","");
562                            }
563                            rlistener.addFeatureProperty(RichObjectFactory.getDefaultOntology().getOrCreateTerm(key),val);
564                        }
565                    } else {
566                        // new feature!
567                        // end previous feature
568                        if (seenAFeature) rlistener.endFeature();
569                        // start next one, with lots of lovely info in it
570                        RichFeature.Template templ = new RichFeature.Template();
571                        templ.annotation = new SimpleRichAnnotation();
572                        templ.sourceTerm = Terms.getEMBLTerm();
573                        templ.typeTerm = RichObjectFactory.getDefaultOntology().getOrCreateTerm(key);
574                        templ.featureRelationshipSet = new TreeSet();
575                        templ.rankedCrossRefs = new TreeSet();
576                        String tidyLocStr = val.replaceAll("\\s+","");
577                        templ.location = GenbankLocationParser.parseLocation(ns, accession, tidyLocStr);
578                        rlistener.startFeature(templ);
579                        seenAFeature = true;
580                        rcrossrefCount = 0;
581                    }
582                }
583                if (seenAFeature) rlistener.endFeature();
584            } else if (sectionKey.equals(START_SEQUENCE_TAG) && !this.getElideSymbols()) {
585                StringBuffer seq = new StringBuffer();
586                for (int i = 0 ; i < section.size(); i++) seq.append(((String[])section.get(i))[1]);
587                try {
588                    SymbolList sl = new SimpleSymbolList(symParser,
589                            seq.toString().replaceAll("\\s+","").replaceAll("[\\.|~]","-"));
590                    rlistener.addSymbols(symParser.getAlphabet(),
591                            (Symbol[])(sl.toList().toArray(new Symbol[0])),
592                            0, sl.length());
593                } catch (Exception e) {
594                    String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad sequence",sectionToString(section));
595                    throw new ParseException(e, message);
596                }
597            }
598        } while (!sectionKey.equals(END_SEQUENCE_TAG));
599        
600        // Allows us to tolerate trailing whitespace without
601        // thinking that there is another Sequence to follow
602        while (true) {
603            reader.mark(1);
604            int c = reader.read();
605            if (c == -1) {
606                hasAnotherSequence = false;
607                break;
608            }
609            if (Character.isWhitespace((char) c)) {
610                //hasInternalWhitespace = true;
611                continue;
612            }
613            //if (hasInternalWhitespace)
614            //    System.err.println("Warning: whitespace found between sequence entries");
615            reader.reset();
616            break;
617        }
618        
619        // Finish up.
620        rlistener.endSequence();
621        return hasAnotherSequence;
622    }
623    
624    // reads an indented section, combining split lines and creating a list of key->value tuples
625    private List readSection(BufferedReader br) throws ParseException {
626        List section = new ArrayList();
627        String line;
628        boolean done = false;
629        
630        // while not done
631        try {
632            while (!done) {
633                // mark buffer
634                br.mark(160);
635                // read token
636                line = br.readLine();
637                if (line.length()<2) {
638                    String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad line found",line);
639                    throw new ParseException(message);
640                }
641                String token = line.substring(0,2);
642                // READ SEQUENCE SECTION
643                if (token.equals(START_SEQUENCE_TAG)) {
644                    //      from next line, read sequence until // - leave // on stack
645                    StringBuffer sb = new StringBuffer();
646                    while (!done) {
647                        br.mark(160);
648                        line = br.readLine();
649                        if (line.startsWith(END_SEQUENCE_TAG)) {
650                            br.reset();
651                            done = true;
652                        } else {
653                            //      create sequence tag->value pair to return, sans numbers
654                            sb.append(line.replaceAll("\\d",""));
655                        }
656                    }
657                    section.add(new String[]{START_SEQUENCE_TAG,sb.toString()});
658                }
659                // READ FEATURE TABLE SECTION
660                else if (token.equals(FEATURE_HEADER_TAG)) {
661                    //      create dummy feature tag->value pair and add to return set
662                    section.add(new String[]{FEATURE_TAG,null});
663                    //      drop next FH line
664                    line = br.readLine(); // skip next line too - it is also FH
665                    //      read all FT lines until XX
666                    String currentTag = null;
667                    StringBuffer currentVal = null;
668                    while (!done) {
669                        line = br.readLine();
670                        if (line.startsWith(DELIMITER_TAG)) {
671                            done = true;
672                            // dump current tag if exists
673                            if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
674                        } else {
675                            //         FT lines:   FT   word            value
676                            //         or          FT                   /word
677                            //         or          FT                   /db_xref="taxon:3899....
678                            //                                          ......"
679                            line = line.substring(5); // chomp off "FT   "
680                            if (!line.startsWith(" ")) {
681                                // dump current tag if exists
682                                if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
683                                // case 1 : word value - splits into key-value on its own
684                                String[] parts = line.trim().split("\\s+");
685                                currentTag = parts[0];
686                                currentVal = new StringBuffer();
687                                currentVal.append(parts[1]);
688                            } else {
689                                line = line.trim();
690                                if (line.startsWith("/")) {
691                                    // dump current tag if exists
692                                    if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
693                                    // case 2 : /word[=.....]
694                                    currentVal = new StringBuffer();
695                                    int equalIndex = line.indexOf('=');
696                                    if (equalIndex>=0) {
697                                        currentTag = line.substring(0, equalIndex);
698                                        currentVal.append(line.substring(equalIndex+1));
699                                    } else {
700                                        currentTag = line;
701                                    }
702                                } else {
703                                    // case 3 : ...."
704                                    currentVal.append("\n");
705                                    currentVal.append(line);
706                                }
707                            }
708                        }
709                    }
710                }
711                // READ END OF SEQUENCE
712                else if (token.equals(END_SEQUENCE_TAG)) {
713                    section.add(new String[]{END_SEQUENCE_TAG,null});
714                    done = true;
715                }
716                // READ DELIMITER TAG
717                else if (token.equals(DELIMITER_TAG)) {
718                    section.add(new String[]{DELIMITER_TAG,null});
719                    done = true;
720                }
721                // READ THIRD PARTY ANNOTATION SECTION
722                else if (token.equals(TPA_TAG)) {
723                    //      exception = don't know how to do TPA yet
724                    String message = ParseException.newMessage(this.getClass(),accession,"not set", "Unable to handle TPAs just yet",sectionToString(section));
725                    throw new ParseException(message);
726                }
727                // READ CONTIG SECTION
728                else if (token.equals(CONTIG_TAG)) {
729                    //      exception = don't know how to do contigs yet
730                    String message = ParseException.newMessage(this.getClass(),accession,"not set", "Unable to handle contig assemblies just yet",sectionToString(section));
731                    throw new ParseException(message);
732                }
733                // READ DOCREF
734                else if (token.equals(DATABASE_XREF_TAG)) {
735                    section.add(new String[]{DATABASE_XREF_TAG,line.substring(5).trim()});
736                    done = true;
737                }
738                // READ DATE
739                else if (token.equals(DATE_TAG)) {
740                    section.add(new String[]{DATE_TAG,line.substring(5).trim()});
741                    done = true;
742                }
743                // READ NORMAL TAG/VALUE SECTION
744                else {
745                    //      rewind buffer to mark
746                    br.reset();
747                    //      read token/values until XX
748                    String currentTag = null;
749                    StringBuffer currentVal = null;
750                    while (!done) {
751                        line = br.readLine();
752                        if (line.startsWith(DELIMITER_TAG)) {
753                            done = true;
754                            // dump current tag if exists
755                            if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
756                        } else {
757                            try {
758                                //      merge neighbouring repeated tokens by concatting values
759                                //      return tag->value pairs
760                                String tag = line.substring(0,2);
761                                String value = line.substring(5);
762                                if (currentTag==null || !tag.equals(currentTag)) {
763                                    // dump current tag if exists
764                                    if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
765                                    // start new tag
766                                    currentTag = tag;
767                                    currentVal = new StringBuffer();
768                                    currentVal.append(value);
769                                } else {
770                                    currentVal.append("\n");
771                                    currentVal.append(value);
772                                }
773                            } catch (Exception e) {
774                                String message = ParseException.newMessage(this.getClass(), accession, "not set","",sectionToString(section));
775                                throw new ParseException(e, message);
776                            }
777                        }
778                    }
779                }
780            }
781        } catch (IOException e) {
782            String message = ParseException.newMessage(this.getClass(),accession,"not set", "Unable to handle TPAs just yet",sectionToString(section));
783            throw new ParseException(message);
784        }
785        return section;
786    }
787    
788    /**
789     * {@inheritDoc}
790     */
791    public void writeSequence(Sequence seq, PrintStream os) throws IOException {
792        if (this.getPrintStream()==null) this.setPrintStream(os);
793        this.writeSequence(seq, RichObjectFactory.getDefaultNamespace());
794    }
795    
796    /**
797     * {@inheritDoc}
798     */
799    public void writeSequence(Sequence seq, String format, PrintStream os) throws IOException {
800        if (this.getPrintStream()==null) this.setPrintStream(os);
801        this.writeSequence(seq, format, RichObjectFactory.getDefaultNamespace());
802    }
803    
804    /**
805     * {@inheritDoc}
806     * Namespace is ignored as EMBL has no concept of it.
807     */
808    public void writeSequence(Sequence seq, Namespace ns) throws IOException {
809        this.writeSequence(seq, this.getDefaultFormat(), ns);
810    }
811    
812    /**
813     * As per {@link #writeSequence(Sequence, Namespace)}, except
814     * that it also takes a format parameter. This can be any of the formats
815     * defined as constants in this class.
816     * @param seq see {@link #writeSequence(Sequence, Namespace)}
817     * @param format the format to use.
818     * @param ns see {@link #writeSequence(Sequence, Namespace)}
819     * @throws IOException see {@link #writeSequence(Sequence, Namespace)}
820     */
821    public void writeSequence(Sequence seq, String format, Namespace ns) throws IOException {
822        if (!format.equals(EMBL_FORMAT) && !format.equals(EMBL_PRE87_FORMAT))
823            throw new IllegalArgumentException("Format "+format+" not recognised.");
824        
825        RichSequence rs;
826        try {
827            if (seq instanceof RichSequence) rs = (RichSequence)seq;
828            else rs = RichSequence.Tools.enrich(seq);
829        } catch (ChangeVetoException e) {
830            IOException e2 = new IOException("Unable to enrich sequence");
831            e2.initCause(e);
832            throw e2;
833        }
834        
835        SymbolTokenization tok;
836        try {
837            tok = rs.getAlphabet().getTokenization("token");
838        } catch (Exception e) {
839            throw new RuntimeException("Unable to get alphabet tokenizer",e);
840        }
841        
842        Set<Note> notes = rs.getNoteSet();
843        String accession = rs.getAccession();
844        StringBuffer accessions = new StringBuffer();
845        accessions.append(accession);
846        accessions.append(";");
847        String cdat = null;
848        String udat = null;
849        String crel = null;
850        String urel = null;
851        String urecv = null;
852        String organelle = null;
853        String versionLine = null;
854        String dataClass = "STD";
855        boolean genomic = false;
856        String moltype = rs.getAlphabet().getName();
857        for (Iterator<Note> i = notes.iterator(); i.hasNext(); ) {
858            Note n = i.next();
859            if (n.getTerm().equals(Terms.getDateCreatedTerm())) cdat=n.getValue();
860            else if (n.getTerm().equals(Terms.getDateUpdatedTerm())) udat=n.getValue();
861            else if (n.getTerm().equals(Terms.getRelCreatedTerm())) crel=n.getValue();
862            else if (n.getTerm().equals(Terms.getRelUpdatedTerm())) urel=n.getValue();
863            else if (n.getTerm().equals(Terms.getRelUpdatedRecordVersionTerm())) urecv=n.getValue();
864            else if (n.getTerm().equals(Terms.getMolTypeTerm())) moltype=n.getValue();
865            else if (n.getTerm().equals(Terms.getVersionLineTerm())) versionLine=n.getValue();
866            else if (n.getTerm().equals(Terms.getGenomicTerm())) genomic = true;
867            else if (n.getTerm().equals(Terms.getDataClassTerm())) dataClass = n.getValue();
868            else if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) {
869                accessions.append(" ");
870                accessions.append(n.getValue());
871                accessions.append(";");
872            } else if (n.getTerm().equals(Terms.getOrganelleTerm())) organelle=n.getValue();
873        }
874        
875        StringBuffer locusLine = new StringBuffer();
876        // Division cannot be null
877        String div = rs.getDivision();
878        if(div==null || div.length()==0 || div.length()>3)
879            div = "UNC"; //Unclassified
880            
881        if (format.equals(EMBL_FORMAT)) {
882            // accession; SV version; circular/linear; moltype; dataclass; division; length BP.
883            locusLine.append(rs.getAccession());
884            locusLine.append("; SV ");
885            locusLine.append(rs.getVersion());
886            locusLine.append("; ");
887            locusLine.append(rs.getCircular()?"circular":"linear");
888            locusLine.append("; ");
889            locusLine.append(moltype);
890            locusLine.append("; ");
891            locusLine.append(dataClass);
892            locusLine.append("; ");
893            locusLine.append(div);
894            locusLine.append("; ");
895            locusLine.append(rs.length());
896            locusLine.append(" BP.");
897        } else if (format.equals(EMBL_PRE87_FORMAT)) {
898            // entryname  dataclass; [circular] molecule; division; sequencelength BP.
899            locusLine.append(StringTools.rightPad(rs.getName(),9));
900            locusLine.append(" standard; ");
901            locusLine.append(rs.getCircular()?"circular ":"");
902            // if it is Ensembl genomic, add that in too
903            if (genomic==true) locusLine.append("genomic ");
904            locusLine.append(moltype);
905            locusLine.append("; ");
906            locusLine.append(div);
907            locusLine.append("; ");
908            locusLine.append(rs.length());
909            locusLine.append(" BP.");
910        }
911        StringTools.writeKeyValueLine(LOCUS_TAG, locusLine.toString(), 5, this.getLineWidth(), null, LOCUS_TAG, this.getPrintStream());
912        this.getPrintStream().println(DELIMITER_TAG+"   ");
913        
914        // accession line
915        StringTools.writeKeyValueLine(ACCESSION_TAG, accessions.toString(), 5, this.getLineWidth(), null, ACCESSION_TAG, this.getPrintStream());
916        this.getPrintStream().println(DELIMITER_TAG+"   ");
917        
918        // version line
919        if (format.equals(EMBL_PRE87_FORMAT)) {
920            if (versionLine!=null) StringTools.writeKeyValueLine(VERSION_TAG, versionLine, 5, this.getLineWidth(), null, VERSION_TAG, this.getPrintStream());
921            else StringTools.writeKeyValueLine(VERSION_TAG, accession+"."+rs.getVersion(), 5, this.getLineWidth(), null, VERSION_TAG, this.getPrintStream());
922            this.getPrintStream().println(DELIMITER_TAG+"   ");
923        }
924        
925        // date line
926        StringTools.writeKeyValueLine(DATE_TAG, (cdat==null?udat:cdat)+" (Rel. "+(crel==null?"0":crel)+", Created)", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream());
927        StringTools.writeKeyValueLine(DATE_TAG, udat+" (Rel. "+(urel==null?"0":urel)+", Last updated, Version "+(urecv==null?"0":urecv)+")", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream());
928        this.getPrintStream().println(DELIMITER_TAG+"   ");
929        
930        // definition line
931        StringTools.writeKeyValueLine(DEFINITION_TAG, rs.getDescription(), 5, this.getLineWidth(), null, DEFINITION_TAG, this.getPrintStream());
932        this.getPrintStream().println(DELIMITER_TAG+"   ");
933        
934        // keywords line
935        StringBuffer keywords = new StringBuffer();
936        for (Iterator<Note> n = notes.iterator(); n.hasNext(); ) {
937            Note nt = n.next();
938            if (nt.getTerm().equals(Terms.getKeywordTerm())) {
939                if (keywords.length()>0) keywords.append("; ");
940                keywords.append(nt.getValue());
941            }
942        }
943        if (keywords.length()>0) {
944            keywords.append(".");
945            StringTools.writeKeyValueLine(KEYWORDS_TAG, keywords.toString(), 5, this.getLineWidth(), null, KEYWORDS_TAG, this.getPrintStream());
946            this.getPrintStream().println(DELIMITER_TAG+"   ");
947        } else {
948            this.getPrintStream().println(KEYWORDS_TAG+"   .");
949            this.getPrintStream().println(DELIMITER_TAG+"   ");
950        }
951        
952        // source line (from taxon)
953        //   organism line
954        NCBITaxon tax = rs.getTaxon();
955        if (tax!=null) {
956            StringTools.writeKeyValueLine(SOURCE_TAG, tax.getDisplayName(), 5, this.getLineWidth(), null, SOURCE_TAG, this.getPrintStream());
957            StringTools.writeKeyValueLine(ORGANISM_TAG, tax.getNameHierarchy(), 5, this.getLineWidth(), null, SOURCE_TAG, this.getPrintStream());
958            if (organelle!=null) StringTools.writeKeyValueLine(ORGANELLE_TAG, organelle, 5, this.getLineWidth(), null, ORGANELLE_TAG, this.getPrintStream());
959            this.getPrintStream().println(DELIMITER_TAG+"   ");
960        }
961        
962        // references - rank (bases x to y)
963        for (Iterator<RankedDocRef> r = rs.getRankedDocRefs().iterator(); r.hasNext(); ) {
964            RankedDocRef rdr = r.next();
965            DocRef d = rdr.getDocumentReference();
966            // RN, RC, RP, RX, RG, RA, RT, RL
967            StringTools.writeKeyValueLine(REFERENCE_TAG, "["+rdr.getRank()+"]", 5, this.getLineWidth(), null, REFERENCE_TAG, this.getPrintStream());
968            StringTools.writeKeyValueLine(REMARK_TAG, d.getRemark(), 5, this.getLineWidth(), null, REMARK_TAG, this.getPrintStream());
969            Integer rstart = rdr.getStart();
970            if (rstart==null) rstart = new Integer(1);
971            Integer rend = rdr.getEnd();
972            if (rend==null) rend = new Integer(rs.length());
973            StringTools.writeKeyValueLine(REFERENCE_POSITION_TAG, rstart+"-"+rend, 5, this.getLineWidth(), null, REFERENCE_POSITION_TAG, this.getPrintStream());
974            CrossRef c = d.getCrossref();
975            if (c!=null) StringTools.writeKeyValueLine(REFERENCE_XREF_TAG, c.getDbname()+"; "+c.getAccession()+".", 5, this.getLineWidth(), null, REFERENCE_XREF_TAG, this.getPrintStream());
976            List<DocRefAuthor> auths = d.getAuthorList();
977            for (Iterator<DocRefAuthor> j = auths.iterator(); j.hasNext(); ) {
978                DocRefAuthor a = j.next();
979                if (a.isConsortium()) {
980                    StringTools.writeKeyValueLine(CONSORTIUM_TAG, a+";", 5, this.getLineWidth(), null, CONSORTIUM_TAG, this.getPrintStream());
981                    j.remove();
982                }
983            }
984            if (!auths.isEmpty()) StringTools.writeKeyValueLine(AUTHORS_TAG, DocRefAuthor.Tools.generateAuthorString(auths, true)+";", 5, this.getLineWidth(), null, AUTHORS_TAG, this.getPrintStream());
985            else StringTools.writeKeyValueLine(AUTHORS_TAG, ";", 5, this.getLineWidth(), null, AUTHORS_TAG, this.getPrintStream());
986            if (d.getTitle()!=null && d.getTitle().length()!=0) StringTools.writeKeyValueLine(TITLE_TAG, "\""+d.getTitle()+"\";", 5, this.getLineWidth(), null, TITLE_TAG, this.getPrintStream());
987            else StringTools.writeKeyValueLine(TITLE_TAG, ";", 5, this.getLineWidth(), null, TITLE_TAG, this.getPrintStream());
988            StringTools.writeKeyValueLine(LOCATOR_TAG, d.getLocation()+".", 5, this.getLineWidth(), null, LOCATOR_TAG, this.getPrintStream());
989            this.getPrintStream().println(DELIMITER_TAG+"   ");
990        }
991        
992        // db references - ranked
993        for (Iterator<RankedCrossRef> r = rs.getRankedCrossRefs().iterator(); r.hasNext(); ) {
994            RankedCrossRef rcr = r.next();
995            CrossRef c = rcr.getCrossRef();
996            Set<Note> noteset = c.getNoteSet();
997            StringBuffer sb = new StringBuffer();
998            sb.append(c.getDbname());
999            sb.append("; ");
1000            sb.append(c.getAccession());
1001            boolean hasSecondary = false;
1002            for (Iterator<Note> i = noteset.iterator(); i.hasNext(); ) {
1003                Note n = i.next();
1004                if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) {
1005                    sb.append("; ");
1006                    sb.append(n.getValue());
1007                    hasSecondary = true;
1008                }
1009            }
1010            //if (!hasSecondary) sb.append("; -"); 
1011            //sb.append(".");
1012            if (!hasSecondary) sb.append(";");
1013            else sb.append(".");
1014            StringTools.writeKeyValueLine(DATABASE_XREF_TAG, sb.toString(), 5, this.getLineWidth(), null, DATABASE_XREF_TAG, this.getPrintStream());
1015        }
1016        if (!rs.getRankedCrossRefs().isEmpty())
1017            this.getPrintStream().println(DELIMITER_TAG+"   ");
1018        
1019        // comments - if any
1020        if (!rs.getComments().isEmpty()) {
1021            StringBuffer sb = new StringBuffer();
1022            for (Iterator<Comment> i = rs.getComments().iterator(); i.hasNext(); ) {
1023                Comment c = i.next();
1024                sb.append(c.getComment());
1025                if (i.hasNext()) sb.append("\n");
1026            }
1027            StringTools.writeKeyValueLine(COMMENT_TAG, sb.toString(), 5, this.getLineWidth(), null, COMMENT_TAG, this.getPrintStream());
1028            this.getPrintStream().println(DELIMITER_TAG+"   ");
1029        }
1030        
1031        this.getPrintStream().println(FEATURE_HEADER_TAG+"   Key             Location/Qualifiers");
1032        this.getPrintStream().println(FEATURE_HEADER_TAG+"   ");
1033        // feature_type     location
1034        for (Iterator i = rs.getFeatureSet().iterator(); i.hasNext(); ) {
1035            RichFeature f = (RichFeature)i.next();
1036            StringTools.writeKeyValueLine(FEATURE_TAG+"   "+f.getTypeTerm().getName(), GenbankLocationParser.writeLocation((RichLocation)f.getLocation()), 21, this.getLineWidth(), ",", FEATURE_TAG, this.getPrintStream());
1037            for (Iterator<Note> j = f.getNoteSet().iterator(); j.hasNext(); ) {
1038                Note n = j.next();
1039                // /key="val" or just /key if val==""
1040                if (n.getValue()==null || n.getValue().length()==0) StringTools.writeKeyValueLine(FEATURE_TAG, "/"+n.getTerm().getName(), 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream());
1041                else StringTools.writeKeyValueLine(FEATURE_TAG, "/"+n.getTerm().getName()+"=\""+n.getValue()+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream());
1042            }
1043            // add-in to source feature only organism and db_xref="taxon:xyz" where present
1044            if (f.getType().equals("source") && tax!=null) {
1045                String displayName = tax.getDisplayName();
1046                if (displayName.indexOf('(')>-1) displayName = displayName.substring(0, displayName.indexOf('(')).trim();
1047                StringTools.writeKeyValueLine(FEATURE_TAG, "/organism=\""+displayName+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream());
1048                StringTools.writeKeyValueLine(FEATURE_TAG, "/db_xref=\"taxon:"+tax.getNCBITaxID()+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream());
1049            }
1050            // add-in other dbxrefs where present
1051            for (Iterator<RankedCrossRef> j = f.getRankedCrossRefs().iterator(); j.hasNext(); ) {
1052                RankedCrossRef rcr = j.next();
1053                CrossRef cr = rcr.getCrossRef();
1054                StringTools.writeKeyValueLine(FEATURE_TAG, "/db_xref=\""+cr.getDbname()+":"+cr.getAccession()+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream());
1055            }
1056        }
1057        this.getPrintStream().println(DELIMITER_TAG+"   ");
1058        
1059        // SQ   Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
1060        int aCount = 0;
1061        int cCount = 0;
1062        int gCount = 0;
1063        int tCount = 0;
1064        int oCount = 0;
1065        for (int i = 1; i <= rs.length(); i++) {
1066            char c;
1067            try {
1068                c = tok.tokenizeSymbol(rs.symbolAt(i)).charAt(0);
1069            } catch (Exception e) {
1070                throw new RuntimeException("Unable to get symbol at position "+i,e);
1071            }
1072            switch (c) {
1073                case 'a': case 'A':
1074                    aCount++;
1075                    break;
1076                case 'c': case 'C':
1077                    cCount++;
1078                    break;
1079                case 'g': case 'G':
1080                    gCount++;
1081                    break;
1082                case 't': case 'T':
1083                    tCount++;
1084                    break;
1085                default:
1086                    oCount++;
1087            }
1088        }
1089        this.getPrintStream().print(START_SEQUENCE_TAG+"   Sequence "+rs.length()+" BP; ");
1090        this.getPrintStream().print(aCount + " A; ");
1091        this.getPrintStream().print(cCount + " C; ");
1092        this.getPrintStream().print(gCount + " G; ");
1093        this.getPrintStream().print(tCount + " T; ");
1094        this.getPrintStream().println(oCount + " other;");
1095        
1096        // sequence stuff
1097        Symbol[] syms = (Symbol[])rs.toList().toArray(new Symbol[0]);
1098        int lineLen = 0;
1099        int symCount = 0;
1100        this.getPrintStream().print("    ");
1101        for (int i = 0; i < syms.length; i++) {
1102            if (symCount % 60 == 0 && symCount>0) {
1103                this.getPrintStream().print(StringTools.leftPad(""+symCount,10));
1104                this.getPrintStream().print("\n    ");
1105                lineLen = 0;
1106            }
1107            if (symCount % 10 == 0) {
1108                this.getPrintStream().print(" ");
1109                lineLen++;
1110            }
1111            try {
1112                this.getPrintStream().print(tok.tokenizeSymbol(syms[i]));
1113            } catch (IllegalSymbolException e) {
1114                throw new RuntimeException("Found illegal symbol: "+syms[i]);
1115            }
1116            symCount++;
1117            lineLen++;
1118        }
1119        this.getPrintStream().print(StringTools.leftPad(""+symCount,(66-lineLen)+10));
1120        this.getPrintStream().print("\n");
1121        this.getPrintStream().println(END_SEQUENCE_TAG);
1122    }
1123    
1124    /**
1125     * {@inheritDoc}
1126     */
1127    public String getDefaultFormat() {
1128        return EMBL_FORMAT;
1129    }
1130    
1131    
1132    /**
1133     * Converts the current parse section to a String. Useful for debugging.
1134     */
1135    String sectionToString(List section){
1136        StringBuffer parseBlock = new StringBuffer();
1137        for(Iterator i = section.listIterator(); i.hasNext();){
1138            String[] part = (String[])i.next();
1139            for(int x = 0; x < part.length; x++){
1140                parseBlock.append(part[x]);
1141                if(x == 0){
1142                    parseBlock.append("   "); //the gap will have been trimmed
1143                }
1144            }
1145        }
1146        return parseBlock.toString();
1147    }
1148}
1149