001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojavax.bio.seq.io;
023
024import java.io.BufferedInputStream;
025import java.io.BufferedReader;
026import java.io.File;
027import java.io.FileReader;
028import java.io.IOException;
029import java.io.InputStreamReader;
030import java.io.PrintStream;
031import java.util.ArrayList;
032import java.util.HashSet;
033import java.util.Iterator;
034import java.util.List;
035import java.util.Set;
036import java.util.TreeSet;
037import java.util.regex.Matcher;
038import java.util.regex.Pattern;
039
040import org.biojava.bio.seq.Sequence;
041import org.biojava.bio.seq.io.ParseException;
042import org.biojava.bio.seq.io.SeqIOListener;
043import org.biojava.bio.seq.io.SymbolTokenization;
044import org.biojava.bio.symbol.IllegalAlphabetException;
045import org.biojava.bio.symbol.IllegalSymbolException;
046import org.biojava.bio.symbol.SimpleSymbolList;
047import org.biojava.bio.symbol.Symbol;
048import org.biojava.bio.symbol.SymbolList;
049import org.biojava.utils.ChangeVetoException;
050import org.biojavax.Comment;
051import org.biojavax.CrossRef;
052import org.biojavax.DocRef;
053import org.biojavax.DocRefAuthor;
054import org.biojavax.Namespace;
055import org.biojavax.Note;
056import org.biojavax.RankedCrossRef;
057import org.biojavax.RankedDocRef;
058import org.biojavax.RichObjectFactory;
059import org.biojavax.SimpleComment;
060import org.biojavax.SimpleCrossRef;
061import org.biojavax.SimpleDocRef;
062import org.biojavax.SimpleRankedCrossRef;
063import org.biojavax.SimpleRankedDocRef;
064import org.biojavax.SimpleRichAnnotation;
065import org.biojavax.bio.seq.CompoundRichLocation;
066import org.biojavax.bio.seq.RichFeature;
067import org.biojavax.bio.seq.RichLocation;
068import org.biojavax.bio.seq.RichSequence;
069import org.biojavax.bio.seq.SimplePosition;
070import org.biojavax.bio.seq.SimpleRichLocation;
071import org.biojavax.bio.taxa.NCBITaxon;
072import org.biojavax.bio.taxa.SimpleNCBITaxon;
073import org.biojavax.ontology.ComparableTerm;
074import org.biojavax.utils.StringTools;
075
076/**
077 * Format reader for GenBank files. This version of Genbank format will generate
078 * and write RichSequence objects. Loosely Based on code from the old, deprecated,
079 * org.biojava.bio.seq.io.GenbankFormat object.
080 *
081 * @author Richard Holland
082 * @author Mark Schreiber
083 * @author David Scott
084 * @author Bubba Puryear
085 * @author George Waldon
086 * @author Deepak Sheoran
087 * @since 1.5
088 */
089public class GenbankFormat extends RichSequenceFormat.HeaderlessFormat {
090    
091    // Register this format with the format auto-guesser.
       // This is a static initialiser, so the registration happens once, when the class is initialised.
092    static {
093        RichSequence.IOTools.registerFormat(GenbankFormat.class);
094    }
095    
096    /**
097     * The name of this format
098     */
099    public static final String GENBANK_FORMAT = "GENBANK";
100    
       // Keywords that start the sections and sub-sections of a record.
       // readRichSequence compares the key of each tuple returned by readSection
       // against these constants to decide how the tuple's value is interpreted.
101    protected static final String LOCUS_TAG =           "LOCUS";
102    protected static final String DEFINITION_TAG =      "DEFINITION";
103    protected static final String ACCESSION_TAG =       "ACCESSION";
104    protected static final String VERSION_TAG =         "VERSION";
105    protected static final String KEYWORDS_TAG =        "KEYWORDS";
       // No constant is declared for the keyword below; it is only listed as a placeholder.
106    //                                                  "SEGMENT"
107    protected static final String SOURCE_TAG =          "SOURCE";
108    protected static final String ORGANISM_TAG =        "ORGANISM";
109    protected static final String REFERENCE_TAG =       "REFERENCE";
       // Sub-keys looked for inside a REFERENCE section.
110    protected static final String AUTHORS_TAG =         "AUTHORS";
111    protected static final String CONSORTIUM_TAG =      "CONSRTM";
112    protected static final String TITLE_TAG =           "TITLE";
113    protected static final String JOURNAL_TAG =         "JOURNAL";
114    protected static final String PUBMED_TAG =          "PUBMED";
115    protected static final String MEDLINE_TAG =         "MEDLINE"; //deprecated
116    protected static final String REMARK_TAG =          "REMARK";
117    protected static final String COMMENT_TAG =         "COMMENT";
118    protected static final String FEATURE_TAG =         "FEATURES";
119    protected static final String BASE_COUNT_TAG_FULL = "BASE COUNT"; //deprecated
       // Section keys are single whitespace-delimited words (see the sectp pattern), so
       // readRichSequence recognises the base count section by its first word only.
120    protected static final String BASE_COUNT_TAG =      "BASE";
       // No constant is declared for the keyword below; it is only listed as a placeholder.
121    //                                                  "CONTIG"
       // ORIGIN opens the sequence data; "//" ends the record and stops the section loop in readRichSequence.
122    protected static final String START_SEQUENCE_TAG =  "ORIGIN";
123    protected static final String END_SEQUENCE_TAG =    "//";
124    
125    // locus line
       // Applied to the value that follows the LOCUS keyword. Groups as read by readRichSequence:
       //   1 = locus name (also the default accession), 3 = strandedness prefix (ds-, ss- or ms-),
       //   4 = molecule type, 5 = circular|linear,
       //   6 and 7 = the trailing tokens: division and update date when both are present,
       //             otherwise group 6 alone is taken as the update date.
126    protected static final Pattern lp = Pattern.compile("^(\\S+)\\s+\\d+\\s+(bp|aa)\\s{1,4}([dms]s-)?(\\S+)?\\s+(circular|linear)?\\s*(\\S+)?\\s*(\\S+)?$");
127    // version line
       // Groups as read by readRichSequence: 1 = accession, 3 = version number, 5 = GI identifier.
128    protected static final Pattern vp = Pattern.compile("^(\\S*?)(\\.(\\d+))?(\\s+GI:(\\S+))?$");
129    // reference line
       // refRange matches a single "N to M" range; groups 1 and 2 are the two numbers.
130    protected static final Pattern refRange = Pattern.compile("^\\s*(\\d+)\\s+to\\s+(\\d+)$");
       // refp matches the first line of a REFERENCE section: group 1 = reference rank,
       // group 3 = the "N to M; N to M; ..." range list (absent for "(sites)" or when no range is given).
131    protected static final Pattern refp = Pattern.compile("^(\\d+)\\s*(?:(\\((?:bases|residues)\\s+(\\d+\\s+to\\s+\\d+(\\s*;\\s*\\d+\\s+to\\s+\\d+)*)\\))|\\(sites\\))?");
132    // dbxref line
       // Splits a db_xref value at the first colon: group 1 = database name, group 2 = accession within it.
133    protected static final Pattern dbxp = Pattern.compile("^([^:]+):(\\S+)$");
134    //sections start at a line and continue till the first line afterwards with a
135    //non-whitespace first character
136    //we want to match any of the following as a new section within a section
137    //  \s{0,8} word \s{0,7} value
138    //  \s{21} /word = value
139    //  \s{21} /word
       // Groups as read by readSection: 2/3 = key and value of a plain keyword line,
       // 4/5 = key and value of a "/key=value" qualifier, 6 = key of a value-less "/key" qualifier.
140    protected static final Pattern sectp = Pattern.compile("^(\\s{0,8}(\\S+)\\s{0,7}(.*)|\\s{21}(/\\S+?)=(.*)|\\s{21}(/\\S+))$");
141    
       // Tested against the bare file name in canRead(File). The Java literal \\u002e reaches the
       // regex engine as \u002e, i.e. a literal '.', so the pattern accepts names that end in
       // "gb" or "gp" optionally followed by any number of 'k', or that contain ".gb" or ".gp".
142    protected static final Pattern readableFiles = Pattern.compile(".*(g[bp]k*$|\\u002eg[bp].*)");
       // A first line that starts with the word LOCUS marks the input as GenBank.
143    protected static final Pattern headerLine = Pattern.compile("^LOCUS.*");
144    
145    private final static HashSet isNotQuoted = new HashSet();
146    static {
147        isNotQuoted.add("anticodon");
148        isNotQuoted.add("citation");
149        isNotQuoted.add("codon");
150        isNotQuoted.add("codon_start");
151        isNotQuoted.add("compare");
152        isNotQuoted.add("cons_splice");
153        isNotQuoted.add("direction");
154        isNotQuoted.add("estimated_length");
155        isNotQuoted.add("label");
156        isNotQuoted.add("mod_base");
157        isNotQuoted.add("number");
158        isNotQuoted.add("rpt_type");
159        isNotQuoted.add("rpt_unit_range");
160        isNotQuoted.add("transl_except");
161        isNotQuoted.add("transl_table");
162    }
163    
164    /**
165     * Implements some GenBank-specific terms.
166     */
167    public static class Terms extends RichSequence.Terms {        
168        /**
169         * Getter for the Genbank term
170         * @return The genbank Term
171         */
172        public static ComparableTerm getGenBankTerm() {
173            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("GenBank");
174        }
175    }
176    
177    /**
178     * {@inheritDoc}
179     * A file is in GenBank format if the name ends with gbk, contains the letters egb, or the first line of
180     * the file starts with the word LOCUS
181     */
182    public boolean canRead(File file) throws IOException {
183        if (readableFiles.matcher(file.getName()).matches()) return true;
184        BufferedReader br = new BufferedReader(new FileReader(file));
185        final String firstLine = br.readLine();
186        boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches();
187        br.close();
188        return readable;
189    }
190    
191    /**
192     * {@inheritDoc}
193     * Returns an dna parser if the letters DNA or RNA appear in the first line of the file.
194     * Otherwise returns a DNA tokenizer.
195     */
196    public SymbolTokenization guessSymbolTokenization(File file) throws IOException {
197        BufferedReader br = new BufferedReader(new FileReader(file));
198        String firstLine = br.readLine();
199        boolean dna = (firstLine.indexOf("DNA") >0 || firstLine.indexOf("RNA") > 0);
200        br.close();
201        if (dna) return RichSequence.IOTools.getDNAParser();
202        else return RichSequence.IOTools.getProteinParser();
203    }
204    
205    /**
206     * {@inheritDoc}
207     * A stream is in GenBank format if the first line of the stream starts with the word LOCUS
208     */
209    public boolean canRead(BufferedInputStream stream) throws IOException {
210        stream.mark(2000); // some streams may not support this
211        BufferedReader br = new BufferedReader(new InputStreamReader(stream));
212        final String firstLine = br.readLine();
213        boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches();
214        // don't close the reader as it'll close the stream too.
215        // br.close();
216        stream.reset();
217        return readable;
218    }
219    
220    /**
221     * {@inheritDoc}
222     * Returns an dna parser if the letters DNA or RNA appear in the first line of the stream.
223     * Otherwise returns a DNA tokenizer.
224     */
225    public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException {
226        stream.mark(2000); // some streams may not support this
227        BufferedReader br = new BufferedReader(new InputStreamReader(stream));
228        String firstLine = br.readLine();
229        boolean dna = (firstLine.indexOf("DNA") >0 || firstLine.indexOf("RNA") > 0);
230        // don't close the reader as it'll close the stream too.
231        // br.close();
232        stream.reset();
233        if (dna) return RichSequence.IOTools.getDNAParser();
234        else return RichSequence.IOTools.getProteinParser();
235    }
236    
237    /**
238     * {@inheritDoc}
239     */
240    public boolean readSequence(BufferedReader reader,
241            SymbolTokenization symParser,
242            SeqIOListener listener)
243            throws IllegalSymbolException, IOException, ParseException {
244        if (!(listener instanceof RichSeqIOListener)) throw new IllegalArgumentException("Only accepting RichSeqIOListeners today");
245        return this.readRichSequence(reader,symParser,(RichSeqIOListener)listener,null);
246    }
247    
248    private String sectionKey = null; // key of the section readRichSequence is currently processing
249    private NCBITaxon tax = null; // taxon built from a db_xref qualifier whose database name is "taxon"
250    private String organism = null; // value of the most recent organism qualifier
251    private String accession = null; // LOCUS name at first, later replaced by the ACCESSION / VERSION value
252    private String identifier = null; // GI identifier taken from the VERSION line
       // All five fields are per-record scratch state: readRichSequence sets them back to null at the
       // start of every call, and accession/identifier are passed to ParseException.newMessage.
       // NOTE(review): being instance fields, they would presumably be shared by concurrent parses
       // running on the same GenbankFormat instance - confirm how instances are used.
253    /**
254     * {@inheritDoc}
255     */
256    public boolean readRichSequence(BufferedReader reader,
257            SymbolTokenization symParser,
258            RichSeqIOListener rlistener,
259            Namespace ns)
260            throws IllegalSymbolException, IOException, ParseException {
261        
262        sectionKey = null;
263        tax = null;
264        organism = null;
265        accession = null;
266        identifier = null;
267        boolean hasAnotherSequence = true;
268        //boolean hasInternalWhitespace = false;
269        
270        rlistener.startSequence();
271        
272        if (ns==null) ns=RichObjectFactory.getDefaultNamespace();
273        rlistener.setNamespace(ns);
274        
275        // Get an ordered list of key->value pairs in array-tuples
276        List section = null;
277        try{
278            do {
279                section = this.readSection(reader);
280                sectionKey = ((String[])section.get(0))[0];
281                if(sectionKey == null){
282                    String message = ParseException.newMessage(this.getClass(), accession, identifier, "Section key was null", sectionToString(section));
283                    throw new ParseException(message);
284                }
285                // process section-by-section
286                if (sectionKey.equals(LOCUS_TAG)) {
287                    String loc = ((String[])section.get(0))[1];
288                    Matcher m = lp.matcher(loc);
289                    if (m.matches()) {
290                        rlistener.setName(m.group(1));
291                        accession = m.group(1); // default if no accession found
292                        rlistener.setAccession(accession);
293                        if (m.group(4)!=null)
294                            rlistener.addSequenceProperty(Terms.getMolTypeTerm(),m.group(4));
295                        // Optional extras
296                        String stranded = m.group(3);
297                        if(stranded!=null && stranded.equals("ss-"))
298                            stranded = "single";
299                        else if(stranded!=null && stranded.equals("ms-"))
300                            stranded = "mixed";
301                        else if(stranded!=null && stranded.equals("ds-"))
302                            stranded = "double";
303                        String circular = m.group(5);
304                        String fifth = m.group(6);
305                        String sixth = m.group(7);
306                        if (stranded!=null) rlistener.addSequenceProperty(Terms.getStrandedTerm(),stranded);
307                        if (circular!=null && circular.equalsIgnoreCase("circular")) rlistener.setCircular(true);
308                        if (sixth != null) {
309                            rlistener.setDivision(fifth);
310                            rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(),sixth);
311                        } else if (fifth!=null) {
312                            rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(),fifth);
313                        }
314                    } else {
315                        String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad locus line", sectionToString(section));
316                        throw new ParseException(message);
317                    }
318                } else if (sectionKey.equals(DEFINITION_TAG)) {
319                    rlistener.setDescription(((String[])section.get(0))[1]);
320                } else if (sectionKey.equals(ACCESSION_TAG)) {
321                    // if multiple accessions, store only first as accession,
322                    // and store rest in annotation
323                    String[] accs = ((String[])section.get(0))[1].split("\\s+");
324                    accession = accs[0].trim();
325                    rlistener.setAccession(accession);
326                    for (int i = 1; i < accs.length; i++) {
327                        rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accs[i].trim());
328                    }
329                } else if (sectionKey.equals(VERSION_TAG)) {
330                    String ver = ((String[])section.get(0))[1];
331                    Matcher m = vp.matcher(ver);
332                    if (m.matches()) {
333                        String verAcc = m.group(1);
334                        if (!accession.equals(verAcc)) {
335                            // the version refers to a different accession!
336                            // believe the version line, and store the original
337                            // accession away in the additional accession set
338                            rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accession);
339                            accession = verAcc;
340                            rlistener.setAccession(accession);
341                        }
342                        if (m.group(3)!=null) rlistener.setVersion(Integer.parseInt(m.group(3)));
343                        if (m.group(5)!=null) {
344                            identifier = m.group(5);
345                            rlistener.setIdentifier(identifier);
346                        }
347                    } else {
348                        String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad version line", sectionToString(section));
349                        throw new ParseException(message);
350                    }
351                } else if (sectionKey.equals(KEYWORDS_TAG)) {
352                    String val = ((String[])section.get(0))[1];
353                    if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot
354                    val = val.replace('\n',' '); //remove newline
355                    String[] kws = val.split(";");
356                    
357                    for (int i = 0; i < kws.length; i++) {
358                        String kw = kws[i].trim();
359                        if (kw.length()==0) continue;
360                        rlistener.addSequenceProperty(Terms.getKeywordTerm(), kw);
361                    }
362                } else if (sectionKey.equals(SOURCE_TAG)) {
363                    // ignore - can get all this from the first feature
364                } else if (sectionKey.equals(REFERENCE_TAG) && !this.getElideReferences()) {
365                    // first line of section has rank and location
366                    int ref_rank;
367                    List baseRangeList=null;
368                    String ref = ((String[])section.get(0))[1];
369                    Matcher m = refp.matcher(ref);
370                    if (m.matches()) {
371                        ref_rank = Integer.parseInt(m.group(1));
372                        if (m.group(3) != null) baseRangeList=buildBaseRanges(m.group(3));
373                    } else {
374                        String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad reference line", sectionToString(section));
375                        throw new ParseException(message);
376                    }
377                    // rest can be in any order
378                    String authors = null;
379                    String consortium = null;
380                    String title = null;
381                    String journal = null;
382                    String medline = null;
383                    String pubmed = null;
384                    String remark = null;
385                    for (int i = 1; i < section.size(); i++) {
386                        String key = ((String[])section.get(i))[0];
387                        String val = ((String[])section.get(i))[1];
388                        if (key.equals(AUTHORS_TAG)) authors = val.replace('\n',' '); //see #2276
389                        else if (key.equals(CONSORTIUM_TAG)) consortium = val.replace('\n',' '); //see #2276
390                        else if (key.equals(TITLE_TAG)) title = val.replace('\n',' '); //see #2276
391                        else if (key.equals(JOURNAL_TAG)) journal = val.replace('\n',' '); //see #2276
392                        else if (key.equals(MEDLINE_TAG)) medline = val;
393                        else if (key.equals(PUBMED_TAG)) pubmed = val;
394                        else if (key.equals(REMARK_TAG)) remark = val.replace('\n',' '); //see #2276
395                    }
396                    
397                    // create the docref object
398                    try {
399                        // Use consortium as well if present.
400                        if (authors==null) authors = consortium + " (consortium)";
401                        else if (consortium!=null) authors = authors + ", " + consortium + " (consortium)";
402                        // Create docref.
403                        DocRef dr = null;
404                        // assign either the pubmed or medline to the docref - medline gets priority
405                        if (medline != null) {
406                            dr = (DocRef) RichObjectFactory.getObject(SimpleDocRef.class, new Object[]{DocRefAuthor.Tools.parseAuthorString(authors), journal, title, Terms.MEDLINE_KEY, medline, new Integer(0)});
407                            if (dr.getCrossref() == null) {
408                                dr.setCrossref((CrossRef) RichObjectFactory.getObject(SimpleCrossRef.class, new Object[]{Terms.MEDLINE_KEY, medline, new Integer(0)}));
409                            }
410                        } else if (pubmed != null) {
411                            dr = (DocRef) RichObjectFactory.getObject(SimpleDocRef.class, new Object[]{DocRefAuthor.Tools.parseAuthorString(authors), journal, title, Terms.PUBMED_KEY, pubmed, new Integer(0)});
412                            if (dr.getCrossref() == null) {
413                                dr.setCrossref((CrossRef) RichObjectFactory.getObject(SimpleCrossRef.class, new Object[]{Terms.PUBMED_KEY, pubmed, new Integer(0)}));
414                            }
415                        } else {
416                            dr = (DocRef) RichObjectFactory.getObject(SimpleDocRef.class, new Object[]{DocRefAuthor.Tools.parseAuthorString(authors), journal, title});
417                        }                        
418                        // assign the remarks
419                        if (!this.getElideComments()) dr.setRemark(remark);
420                        // assign the docref to the bioentry: null if no base ranges, Integers if 1 base range - the normal case, joined RichLocation if more than 1
421                        RankedDocRef rdr = baseRangeList == null?new SimpleRankedDocRef(dr, null, null, ref_rank):(baseRangeList.size()==1?new SimpleRankedDocRef(dr, new Integer(((RichLocation)baseRangeList.get(0)).getMin()), new Integer(((RichLocation)baseRangeList.get(0)).getMax()), ref_rank):new SimpleRankedDocRef(dr, new CompoundRichLocation(baseRangeList), ref_rank));
422                        rlistener.setRankedDocRef(rdr);
423                    } catch (ChangeVetoException e) {
424                        throw new ParseException(e+", accession:"+accession);
425                    }
426                } else if (sectionKey.equals(COMMENT_TAG) && !this.getElideComments()) {
427                    // Set up some comments
428                    rlistener.setComment(((String[])section.get(0))[1]);
429                } else if (sectionKey.equals(FEATURE_TAG) && !this.getElideFeatures()) {
430                    // starting from second line of input, start a new feature whenever we come across
431                    // a key that does not start with /
432                    boolean seenAFeature = false;
433                    int rcrossrefCount = 0;
434                    boolean skippingBond = false;
435                    for (int i = 1 ; i < section.size(); i++) {
436                        String key = ((String[])section.get(i))[0];
437                        String val = ((String[])section.get(i))[1];
438                        if (key.startsWith("/")) {
439                                  if(!skippingBond)
440                                  {
441                                    key = key.substring(1); // strip leading slash
442                                    val = val.replaceAll("\\s*[\\n\\r]+\\s*"," ").trim();
443                                    if (val.endsWith("\"")) val = val.substring(1,val.length()-1); // strip quotes
444                                    // parameter on old feature
445                                    if (key.equals("db_xref")) {
446                                        Matcher m = dbxp.matcher(val);
447                                        if (m.matches()) {
448                                            String dbname = m.group(1);
449                                            String raccession = m.group(2);
450                                            if (dbname.equalsIgnoreCase("taxon")) {
451                                                // Set the Taxon instead of a dbxref
452                                                tax = (NCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class, new Object[]{Integer.valueOf(raccession)});
453                                                rlistener.setTaxon(tax);
454                                                try {
455                                                    if (organism!=null) tax.addName(NCBITaxon.SCIENTIFIC,organism.replace('\n', ' '));// readSection can embed new lines
456                                                } catch (ChangeVetoException e) {
457                                                    throw new ParseException(e+", accession:"+accession);
458                                                }
459                                            } else {
460                                                try {
461                                                    CrossRef cr = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{dbname, raccession, new Integer(0)});
462                                                    RankedCrossRef rcr = new SimpleRankedCrossRef(cr, ++rcrossrefCount);
463                                                    rlistener.getCurrentFeature().addRankedCrossRef(rcr);
464                                                } catch (ChangeVetoException e) {
465                                                    throw new ParseException(e+", accession:"+accession);
466                                                }
467                                            }
468                                        } else {
469                                            String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad dbxref", sectionToString(section));
470                                            throw new ParseException(message);
471                                        }
472                                    } else if (key.equalsIgnoreCase("organism")) {
473                                        try {
474                                            organism = val;
475                                            if (tax!=null) tax.addName(NCBITaxon.SCIENTIFIC,organism.replace('\n', ' '));// readSection can embed new lines
476                                        } catch (ChangeVetoException e) {
477                                            throw new ParseException(e+", accession:"+accession);
478                                        }
479                                    } else {
480                                        if (key.equalsIgnoreCase("translation")) {
481                                            // strip spaces from sequence
482                                            val = val.replaceAll("\\s+","");
483                                        }
484                                        rlistener.addFeatureProperty(RichObjectFactory.getDefaultOntology().getOrCreateTerm(key),val);
485                                    }
486                                }
487                        } else {
488                            // new feature!
489                            // end previous feature
490                            if(key.equalsIgnoreCase("bond"))
491                            {
492                                skippingBond = true;
493                            }
494                            else
495                            {
496                                skippingBond = false;
497                                if (seenAFeature) {
498                                        rlistener.endFeature();
499                                }
500                                    // start next one, with lots of lovely info in it
501                                    RichFeature.Template templ = new RichFeature.Template();
502                                    templ.annotation = new SimpleRichAnnotation();
503                                    templ.sourceTerm = Terms.getGenBankTerm();
504                                    templ.typeTerm = RichObjectFactory.getDefaultOntology().getOrCreateTerm(key);
505                                    templ.featureRelationshipSet = new TreeSet();
506                                    templ.rankedCrossRefs = new TreeSet();
507                                    String tidyLocStr = val.replaceAll("\\s+","");
508                                    templ.location = GenbankLocationParser.parseLocation(ns, accession, tidyLocStr);
509                                    rlistener.startFeature(templ);
510                                    seenAFeature = true;
511                                    rcrossrefCount = 0;
512                            }
513                            
514                        }
515                    }
516                    
517                    if (seenAFeature) {
518                        rlistener.endFeature();
519                    }
520                } else if (sectionKey.equals(BASE_COUNT_TAG)) {
521                    // ignore - can calculate from sequence content later if needed
522                } else if (sectionKey.equals(START_SEQUENCE_TAG) && !this.getElideSymbols()) {
523                    // our first line is ignorable as it is the ORIGIN tag
524                    // the second line onwards conveniently have the number as
525                    // the [0] tuple, and sequence string as [1] so all we have
526                    // to do is concat the [1] parts and then strip out spaces,
527                    // and replace '.' and '~' with '-' for our parser.
528                    StringBuffer seq = new StringBuffer();
529                    for (int i = 1 ; i < section.size(); i++) seq.append(((String[])section.get(i))[1]);
530                    try {
531                        SymbolList sl = new SimpleSymbolList(symParser,
532                                seq.toString().replaceAll("\\s+","").replaceAll("[\\.|~]","-"));
533                        rlistener.addSymbols(symParser.getAlphabet(),
534                                (Symbol[])(sl.toList().toArray(new Symbol[0])),
535                                0, sl.length());
536                    } catch (IllegalAlphabetException e) {
537                        String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad sequence section", sectionToString(section));
538                        throw new ParseException(e, message);
539                    }
540                }
541            } while (!sectionKey.equals(END_SEQUENCE_TAG));
542        }catch(RuntimeException e){
543            String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad sequence section", sectionToString(section));
544            throw new ParseException(e, message);
545        }
546        
547        // Allows us to tolerate trailing whitespace without
548        // thinking that there is another Sequence to follow
549        while (true) {
550            reader.mark(1);
551            int c = reader.read();
552            if (c == -1) {
553                hasAnotherSequence = false;
554                break;
555            }
556            if (Character.isWhitespace((char) c)) {
557                //hasInternalWhitespace = true;
558                continue;
559            }
560            //if (hasInternalWhitespace)
561            //    System.err.println("Warning: whitespace found between sequence entries");
562            reader.reset();
563            break;
564        }
565        
566        // Finish up.
567        rlistener.endSequence();
568        return hasAnotherSequence;
569    }
570    
571    // reads an indented section, combining split lines and creating a list of key->value tuples
572    private List readSection(BufferedReader br) throws ParseException {
573        List section = new ArrayList();
574        String line = "";
575        String currKey = null;
576        StringBuffer currVal = new StringBuffer();
577        boolean done = false;
578        int linecount = 0;
579        
580        try {
581            while (!done) {
582                br.mark(320);
583                line = br.readLine();
584                String firstSecKey = section.isEmpty() ? "" : ((String[])section.get(0))[0];
585                if (line != null && line.matches("\\p{Space}*")) {
586                   // regular expression \p{Space}* will match line 
587                   // having only white space characters
588                   continue;
589                }
590                if (line==null || (!line.startsWith(" ") && linecount++>0 && ( !firstSecKey.equals(START_SEQUENCE_TAG)  || line.startsWith(END_SEQUENCE_TAG)))) {
591                    // dump out last part of section
592                    section.add(new String[]{currKey,currVal.toString()});
593                    br.reset();
594                    done = true;
595                } else {
596                    Matcher m = sectp.matcher(line);
597                    if (m.matches()) {
598                        // new key
599                        if (currKey!=null) section.add(new String[]{currKey,currVal.toString()});
600                        // key = group(2) or group(4) or group(6) - whichever is not null
601                        currKey = m.group(2)==null?(m.group(4)==null?m.group(6):m.group(4)):m.group(2);
602                        currVal = new StringBuffer();
603                        // val = group(3) if group(2) not null, group(5) if group(4) not null, "" otherwise, trimmed
604                        currVal.append((m.group(2)==null?(m.group(4)==null?"":m.group(5)):m.group(3)).trim());
605                    } else {
606                        // concatted line or SEQ START/END line?
607                        if (line.startsWith(START_SEQUENCE_TAG) || line.startsWith(END_SEQUENCE_TAG)) currKey = line;
608                        else {
609                            currVal.append("\n"); // newline in between lines - can be removed later
610                            currVal.append(currKey.charAt(0)=='/'?line.substring(21):line.substring(12));
611                        }
612                    }
613                }
614            }
615        } catch (IOException e) {
616            String message = ParseException.newMessage(this.getClass(), accession, identifier, "", sectionToString(section));
617            throw new ParseException(e, message);
618        } catch (RuntimeException e){
619            String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad line", line);
620            throw new ParseException(e, message);
621        }
622        return section;
623    }
624    
625    private final List buildBaseRanges(final String theBaseRangeList) throws ParseException {
626        if (theBaseRangeList == null) return null;
627        final List baseRangeList = new ArrayList();
628        final String[] baseRange = theBaseRangeList.split(";");
629        try{
630        for (int r=0; r<baseRange.length; r++) {
631            final Matcher rangeMatch = refRange.matcher(baseRange[r]);
632            if (rangeMatch.matches()) {
633                final int rangeStart = Integer.parseInt(rangeMatch.group(1));
634                final int rangeEnd = Integer.parseInt(rangeMatch.group(2));
635                baseRangeList.add(new SimpleRichLocation(new SimplePosition(rangeStart), new SimplePosition(rangeEnd), r));
636            } else {
637                String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad reference range found", theBaseRangeList);
638                throw new ParseException(message);
639            }
640        }
641        return baseRangeList;
642        }catch(RuntimeException e){
643            String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad base range", theBaseRangeList);
644            throw new ParseException(e, message);
645        }
646    }
647    
648    /**
649     * {@inheritDoc}
650     */
651    public void writeSequence(Sequence seq, PrintStream os) throws IOException {
652        if (this.getPrintStream()==null) this.setPrintStream(os);
653        this.writeSequence(seq, RichObjectFactory.getDefaultNamespace());
654    }
655    
656    /**
657     * {@inheritDoc}
658     */
659    public void writeSequence(Sequence seq, String format, PrintStream os) throws IOException {
660        if (this.getPrintStream()==null) this.setPrintStream(os);
661        if (!format.equals(this.getDefaultFormat())) throw new IllegalArgumentException("Unknown format: "+format);
662        this.writeSequence(seq, RichObjectFactory.getDefaultNamespace());
663    }
664    
665    /**
666     * {@inheritDoc}
667     * Namespace is ignored as Genbank has no concept of it.
668     */
669    public void writeSequence(Sequence seq, Namespace ns) throws IOException {
670        RichSequence rs;
671        try {
672            if (seq instanceof RichSequence) rs = (RichSequence)seq;
673            else rs = RichSequence.Tools.enrich(seq);
674        } catch (ChangeVetoException e) {
675            IOException e2 = new IOException("Unable to enrich sequence");
676            e2.initCause(e);
677            throw e2;
678        }
679        
680        SymbolTokenization tok;
681        try {
682            tok = rs.getAlphabet().getTokenization("token");
683        } catch (Exception e) {
684            throw new RuntimeException("Unable to get alphabet tokenizer",e);
685        }
686        Set<Note> notes = rs.getNoteSet();
687        String accession = rs.getAccession();
688        StringBuffer accessions = new StringBuffer();
689        accessions.append(accession);
690        String stranded = "";
691        String udat = "";
692        String moltype = rs.getAlphabet().getName();
693        if ("PROTEIN-TERM".equals(moltype) || "PROTEIN".equals(moltype)) moltype = null; //a genpept curiosity
694        StringBuffer keywords = new StringBuffer();
695        for (Iterator<Note> i = notes.iterator(); i.hasNext(); ) {
696            Note n = i.next();
697            if (n.getTerm().equals(Terms.getStrandedTerm())) {
698                String value = n.getValue();
699                if(value != null && value.equals("single"))
700                    stranded= "ss-";
701                else if(value != null && value.equals("mixed"))
702                    stranded= "ms-";
703            }
704            else if (n.getTerm().equals(Terms.getDateUpdatedTerm())) udat=n.getValue();
705            else if (n.getTerm().equals(Terms.getMolTypeTerm())) moltype=n.getValue();
706            else if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) {
707                accessions.append(" ");
708                accessions.append(n.getValue());
709            } else if (n.getTerm().equals(Terms.getKeywordTerm())) {
710                if (n.getValue() != null) {
711                    if (keywords.length()>0) keywords.append("; ");
712                    keywords.append(n.getValue());
713                }
714            }
715        }
716        
717        //adjust molecule type during format conversion
718        if(moltype!=null && moltype.length()>6) {
719            if(moltype.indexOf("DNA")!=-1) moltype = "DNA";
720            else if(moltype.indexOf("RNA")!=-1) moltype = "RNA";
721            else moltype = "NA";
722        }
723        
724        // locus(name) + length + alpha + div + date line
725        StringBuffer locusLine = new StringBuffer();
726        locusLine.append(StringTools.rightPad(rs.getName(),16));//13->28=15+1=16
727        locusLine.append(" ");//29
728        locusLine.append(StringTools.leftPad(""+rs.length(),11));//30->40=10+1=11
729        locusLine.append(" "+ (moltype==null? "aa":"bp") +" ");//41->44
730        locusLine.append(StringTools.leftPad(stranded,3));//45->47=2+1=3
731        locusLine.append(StringTools.rightPad(moltype==null?"":moltype,6));//48->53=5+1=6
732        locusLine.append("  ");//54->55
733        locusLine.append(StringTools.rightPad(rs.getCircular()?"circular":"linear",8));//56->63=7+1=8
734        locusLine.append(" ");//64->64
735        String div = rs.getDivision()==null?"":rs.getDivision();
736        if(div.length()>3) div = ""; // Not a GenBank division, maybe UniProt, etc.
737        locusLine.append(StringTools.rightPad(div,3));//65->67=2+1=3
738        locusLine.append(" ");//68->68
739        locusLine.append(StringTools.rightPad(udat,11));//69->79=10+1=11
740        StringTools.writeKeyValueLine(LOCUS_TAG, locusLine.toString(), 12, this.getLineWidth(), this.getPrintStream());
741        
742        // definition line
743        StringTools.writeKeyValueLine(DEFINITION_TAG, rs.getDescription(), 12, this.getLineWidth(), this.getPrintStream());
744        
745        // accession line
746        StringTools.writeKeyValueLine(ACCESSION_TAG, accessions.toString(), 12, this.getLineWidth(), this.getPrintStream());
747        
748        // version + gi line
749        String version = accession+"."+rs.getVersion();
750        if (rs.getIdentifier()!=null) version = version + "  GI:"+rs.getIdentifier();
751        StringTools.writeKeyValueLine(VERSION_TAG, version, 12, this.getLineWidth(), this.getPrintStream());
752        
753        // keywords line
754        keywords.append(".");
755        StringTools.writeKeyValueLine(KEYWORDS_TAG, keywords.toString(), 12, this.getLineWidth()-1, this.getPrintStream());
756        
757        // source line (from taxon)
758        //   organism line
759        NCBITaxon tax = rs.getTaxon();
760        if (tax!=null) {
761            StringTools.writeKeyValueLine(SOURCE_TAG, (isMitochondrial(rs)?"mitochondrion ":"")+tax.getDisplayName(), 12, this.getLineWidth(), this.getPrintStream());
762            StringTools.writeKeyValueLine("  "+ORGANISM_TAG, tax.getDisplayName().split("\\s+\\(")[0]+"\n"+tax.getNameHierarchy(), 12, this.getLineWidth()-1, this.getPrintStream());
763        }
764        
765        // references - rank (bases x to y)
766        for (Iterator<RankedDocRef> r = rs.getRankedDocRefs().iterator(); r.hasNext(); ) {
767            RankedDocRef rdr = r.next();
768            DocRef d = rdr.getDocumentReference();
769            StringTools.writeKeyValueLine(REFERENCE_TAG, rdr.getRank()+((rdr.getLocation()==null || rdr.getLocation() ==RichLocation.EMPTY_LOCATION)?"": (moltype==null? "  (residues ":"  (bases ")+makeBaseRange(rdr)+")"), 12, this.getLineWidth(), this.getPrintStream());
770            // Any authors that were in the input as CONSRTM tags will
771            // be merged into the AUTHORS tag on output.
772            StringTools.writeKeyValueLine("  "+AUTHORS_TAG, d.getAuthors(), 12, this.getLineWidth()-1, this.getPrintStream());
773            StringTools.writeKeyValueLine("  "+TITLE_TAG, d.getTitle(), 12, this.getLineWidth(), this.getPrintStream());
774            StringTools.writeKeyValueLine("  "+JOURNAL_TAG, d.getLocation(), 12, this.getLineWidth(), this.getPrintStream());
775            CrossRef c = d.getCrossref();
776            if (c!=null) StringTools.writeKeyValueLine(StringTools.leftPad(c.getDbname(),9), c.getAccession(), 12, this.getLineWidth(), this.getPrintStream());
777            StringTools.writeKeyValueLine("  "+REMARK_TAG, d.getRemark(), 12, this.getLineWidth(), this.getPrintStream());
778        }
779        
780        // comments - if any
781        Set<Comment> comments = rs.getComments();
782        if (!comments.isEmpty()) {
783            StringBuffer sb = new StringBuffer();
784            for (Iterator<Comment> i = comments.iterator(); i.hasNext(); ) {
785                Comment c = i.next();
786                sb.append(c.getComment());
787                if (i.hasNext()) sb.append("\n");
788            }
789            StringTools.writeKeyValueLine(COMMENT_TAG, sb.toString(), 12, this.getLineWidth(), this.getPrintStream());
790        }
791        
792        this.getPrintStream().println(FEATURE_TAG+"             Location/Qualifiers");
793        // feature_type     location
794        for (Iterator i = rs.getFeatureSet().iterator(); i.hasNext(); ) {
795            RichFeature f = (RichFeature)i.next();
796            StringTools.writeKeyValueLine("     "+f.getTypeTerm().getName(), GenbankLocationParser.writeLocation((RichLocation)f.getLocation()), 21, this.getLineWidth()-1, ",", this.getPrintStream());
797            for (Iterator<Note> j = f.getNoteSet().iterator(); j.hasNext(); ) {
798                Note n = j.next();
799                // /key="val" or just /key if val==""
800                if (n.getValue()==null || n.getValue().length()==0) StringTools.writeKeyValueLine("", "/"+n.getTerm().getName(), 21, this.getLineWidth(), this.getPrintStream());
801                else if (isNotQuoted(n)) {// doesn't have the value enclosed in quotes
802                    StringTools.writeKeyValueLine("", "/"+n.getTerm().getName()+"="+n.getValue(), 21, this.getLineWidth(), this.getPrintStream());
803                } else if (n.getTerm().getName().equals("translation")) {
804                    StringTools.writeKeyValueLine("", "/"+n.getTerm().getName()+"=\""+n.getValue()+"\"", 21, this.getLineWidth()-1, this.getPrintStream());
805                } else {
806                    StringTools.writeKeyValueLine("", "/"+n.getTerm().getName()+"=\""+n.getValue()+"\"", 21, this.getLineWidth(), this.getPrintStream());
807                }
808            }
809            // add-in to source feature only organism and db_xref="taxon:xyz" where present
810            if (f.getType().equals("source") && tax!=null) {
811                String displayName = tax.getDisplayName();
812                if (displayName.indexOf('(')>-1) displayName = displayName.substring(0, displayName.indexOf('(')).trim();
813                StringTools.writeKeyValueLine("", "/organism=\""+displayName+"\"", 21, this.getLineWidth()-1, this.getPrintStream());// AF252370 fits in exactly 80 - but is wrapped
814                for (Iterator<RankedCrossRef> j = f.getRankedCrossRefs().iterator(); j.hasNext(); ) {
815                    RankedCrossRef rcr = j.next();
816                    CrossRef cr = rcr.getCrossRef();
817                    StringTools.writeKeyValueLine("", "/db_xref=\""+cr.getDbname()+":"+cr.getAccession()+"\"", 21, this.getLineWidth(), this.getPrintStream());
818                }
819                StringTools.writeKeyValueLine("", "/db_xref=\"taxon:"+tax.getNCBITaxID()+"\"", 21, this.getLineWidth(), this.getPrintStream());
820            } else {
821                // add-in other dbxrefs where present
822                for (Iterator<RankedCrossRef> j = f.getRankedCrossRefs().iterator(); j.hasNext(); ) {
823                    RankedCrossRef rcr = j.next();
824                    CrossRef cr = rcr.getCrossRef();
825                    StringTools.writeKeyValueLine("", "/db_xref=\""+cr.getDbname()+":"+cr.getAccession()+"\"", 21, this.getLineWidth(), this.getPrintStream());
826                }
827            }
828        }
829        
830        //BASE COUNT obsolete in Genbank flatfile format since October 2003
831        //if (rs.getAlphabet()==AlphabetManager.alphabetForName("DNA")) {
832        //    // BASE COUNT     1510 a   1074 c    835 g   1609 t
833        //    int aCount = 0;
834        //    int cCount = 0;
835        //    int gCount = 0;
836        //    int tCount = 0;
837        //    int oCount = 0;
838        //    for (int i = 1; i <= rs.length(); i++) {
839        //        char c;
840        //        try {
841        //            c = tok.tokenizeSymbol(rs.symbolAt(i)).charAt(0);
842        //        } catch (Exception e) {
843        //            throw new RuntimeException("Unable to get symbol at position "+i,e);
844        //        }
845        //        switch (c) {
846        //            case 'a': case 'A':
847        //                aCount++;
848        //                break;
849        //            case 'c': case 'C':
850        //                cCount++;
851        //                break;
852        //            case 'g': case 'G':
853        //                gCount++;
854        //                break;
855        //            case 't': case 'T':
856        //                tCount++;
857        //                break;
858        //            default:
859        //                oCount++;
860        //        }
861        //    }
862        //
863        //    this.getPrintStream().print(BASE_COUNT_TAG_FULL+"    ");
864        //    this.getPrintStream().print(aCount + " a   ");
865        //    this.getPrintStream().print(cCount + " c   ");
866        //    this.getPrintStream().print(gCount + " g   ");
867        //    this.getPrintStream().print(tCount + " t    ");
868        //    this.getPrintStream().println(oCount + " others");
869        //}
870        
871        this.getPrintStream().println(START_SEQUENCE_TAG);
872        // sequence stuff
873        Symbol[] syms = (Symbol[])rs.toList().toArray(new Symbol[0]);
874        int lines = 0;
875        int symCount = 0;
876        for (int i = 0; i < syms.length; i++) {
877            if (symCount % 60 == 0) {
878                if (lines > 0) this.getPrintStream().print("\n"); // newline from previous line
879                int lineNum = (lines*60) + 1;
880                this.getPrintStream().print(StringTools.leftPad(""+lineNum,9));
881                lines++;
882            }
883            if (symCount % 10 == 0) this.getPrintStream().print(" ");
884            try {
885                this.getPrintStream().print(tok.tokenizeSymbol(syms[i]));
886            } catch (IllegalSymbolException e) {
887                throw new RuntimeException("Found illegal symbol: "+syms[i]);
888            }
889            symCount++;
890        }
891        if(syms.length>0) //do not create an empty line
892            this.getPrintStream().print("\n");
893        this.getPrintStream().println(END_SEQUENCE_TAG);
894    }
895    
896    /**
897     * {@inheritDoc}
898     */
899    public String getDefaultFormat() {
900        return GENBANK_FORMAT;
901    }
902    
903    private final static boolean isMitochondrial(final RichSequence theSequence) {
904        final Set featureSet = theSequence.getFeatureSet();
905        final Iterator i = featureSet.iterator();
906        while (i.hasNext()) {
907            final RichFeature feature = (RichFeature) i.next();
908            if (feature.getType().equals("source")) {
909                final Set noteSet = feature.getNoteSet();
910                final Iterator<Note> n = noteSet.iterator();
911                while(n.hasNext()) {
912                    final Note note = n.next();
913                    if (note.getTerm().getName().equals("organelle")) return note.getValue().equals("mitochondrion");
914                }
915            }
916        }
917        return false;
918    }
919    
920    private final static boolean isNotQuoted(final Note theNote) {
921        return isNotQuoted(theNote.getTerm().getName(), theNote.getValue());
922    }
923    
924    private final static boolean isNotQuoted(final String theName, final String theValue) {
925        return isNotQuoted.contains(theName);
926    }
927    
928    private final static String makeBaseRange(final RankedDocRef theReference) {
929        return theReference.getLocation()==null?theReference.getStart()+" to "+theReference.getEnd():toString(theReference.getLocation());
930    }
931    
932    private final static String toString(final RichLocation theLocation) {
933        final StringBuffer list = new StringBuffer();
934        final Iterator b = theLocation.blockIterator();
935        while (b.hasNext()) {
936            final RichLocation location = (RichLocation) b.next();
937            list.append(location.getMin()+" to "+location.getMax());
938            if (b.hasNext()) list.append("; ");
939        }
940        return list.toString();
941    }
942    
943    /**
944     * Converts the current parse section to a String. Useful for debugging.
945     */
946    String sectionToString(List section){
947        StringBuffer parseBlock = new StringBuffer();
948        for(Iterator i = section.listIterator(); i.hasNext();){
949            String[] part = (String[])i.next();
950            for(int x = 0; x < part.length; x++){
951                parseBlock.append(part[x]);
952                if(x == 0){
953                    parseBlock.append("   "); //the gap will have been trimmed
954                }
955            }
956        }
957        return parseBlock.toString();
958    }
959}