001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojavax.bio.seq.io;
023
024import java.io.BufferedInputStream;
025import java.io.BufferedReader;
026import java.io.File;
027import java.io.FileReader;
028import java.io.IOException;
029import java.io.InputStreamReader;
030import java.io.PrintStream;
031import java.util.ArrayList;
032import java.util.HashSet;
033import java.util.Iterator;
034import java.util.List;
035import java.util.Set;
036import java.util.TreeSet;
037import java.util.regex.Matcher;
038import java.util.regex.Pattern;
039
040import org.biojava.bio.seq.Sequence;
041import org.biojava.bio.seq.io.ParseException;
042import org.biojava.bio.seq.io.SeqIOListener;
043import org.biojava.bio.seq.io.SymbolTokenization;
044import org.biojava.bio.symbol.IllegalAlphabetException;
045import org.biojava.bio.symbol.IllegalSymbolException;
046import org.biojava.bio.symbol.SimpleSymbolList;
047import org.biojava.bio.symbol.Symbol;
048import org.biojava.bio.symbol.SymbolList;
049import org.biojava.utils.ChangeVetoException;
050import org.biojavax.Comment;
051import org.biojavax.CrossRef;
052import org.biojavax.DocRef;
053import org.biojavax.DocRefAuthor;
054import org.biojavax.Namespace;
055import org.biojavax.Note;
056import org.biojavax.RankedCrossRef;
057import org.biojavax.RankedDocRef;
058import org.biojavax.RichObjectFactory;
059import org.biojavax.SimpleComment;
060import org.biojavax.SimpleCrossRef;
061import org.biojavax.SimpleDocRef;
062import org.biojavax.SimpleRankedCrossRef;
063import org.biojavax.SimpleRankedDocRef;
064import org.biojavax.SimpleRichAnnotation;
065import org.biojavax.bio.seq.CompoundRichLocation;
066import org.biojavax.bio.seq.RichFeature;
067import org.biojavax.bio.seq.RichLocation;
068import org.biojavax.bio.seq.RichSequence;
069import org.biojavax.bio.seq.SimplePosition;
070import org.biojavax.bio.seq.SimpleRichLocation;
071import org.biojavax.bio.taxa.NCBITaxon;
072import org.biojavax.bio.taxa.SimpleNCBITaxon;
073import org.biojavax.ontology.ComparableTerm;
074import org.biojavax.utils.StringTools;
075
076/**
077 * Format reader for GenBank files. This version of Genbank format will generate
078 * and write RichSequence objects. Loosely Based on code from the old, deprecated,
079 * org.biojava.bio.seq.io.GenbankFormat object.
080 *
081 * @author Richard Holland
082 * @author Mark Schreiber
083 * @author David Scott
084 * @author Bubba Puryear
085 * @author George Waldon
086 * @author Deepak Sheoran
087 * @since 1.5
088 */
089public class GenbankFormat extends RichSequenceFormat.HeaderlessFormat {
090    
    // Register this format with the format auto-guesser. This runs once, when
    // the class is initialised, and hands the class itself to
    // RichSequence.IOTools.registerFormat.
    static {
        RichSequence.IOTools.registerFormat(GenbankFormat.class);
    }
095    
    /**
     * The name of this format
     */
    public static final String GENBANK_FORMAT = "GENBANK";
    
    // Keywords that start the sections (and reference sub-sections) of a record.
    // The commented-out names ("SEGMENT", "CONTIG") have no constant of their own.
    protected static final String LOCUS_TAG =           "LOCUS";
    protected static final String DEFINITION_TAG =      "DEFINITION";
    protected static final String ACCESSION_TAG =       "ACCESSION";
    protected static final String VERSION_TAG =         "VERSION";
    protected static final String KEYWORDS_TAG =        "KEYWORDS";
    //                                                  "SEGMENT"
    protected static final String SOURCE_TAG =          "SOURCE";
    protected static final String ORGANISM_TAG =        "ORGANISM";
    protected static final String REFERENCE_TAG =       "REFERENCE";
    protected static final String AUTHORS_TAG =         "AUTHORS";
    protected static final String CONSORTIUM_TAG =      "CONSRTM";
    protected static final String TITLE_TAG =           "TITLE";
    protected static final String JOURNAL_TAG =         "JOURNAL";
    protected static final String PUBMED_TAG =          "PUBMED";
    protected static final String MEDLINE_TAG =         "MEDLINE"; //deprecated
    protected static final String REMARK_TAG =          "REMARK";
    protected static final String COMMENT_TAG =         "COMMENT";
    protected static final String FEATURE_TAG =         "FEATURES";
    protected static final String BASE_COUNT_TAG_FULL = "BASE COUNT"; //deprecated
    // Only the first word is used as the tag: the section pattern below splits
    // a line into key and value at the first whitespace.
    protected static final String BASE_COUNT_TAG =      "BASE";
    //                                                  "CONTIG"
    protected static final String START_SEQUENCE_TAG =  "ORIGIN";
    protected static final String END_SEQUENCE_TAG =    "//";
    
    // locus line (applied to the text after the LOCUS keyword):
    //   group 1 = name, group 2 = length unit (bp or aa),
    //   group 3 = optional strandedness prefix (ds-, ms- or ss-),
    //   group 4 = molecule type, group 5 = topology (circular or linear),
    //   groups 6 and 7 = the remaining tokens; readRichSequence treats them as
    //   division + date when both are present, or as the date alone otherwise
    protected static final Pattern lp = Pattern.compile("^(\\S+)\\s+\\d+\\s+(bp|aa)\\s{1,4}([dms]s-)?(\\S+)?\\s+(circular|linear)?\\s*(\\S+)?\\s*(\\S+)?$");
    // version line: group 1 = accession, group 3 = version number, group 5 = GI identifier
    protected static final Pattern vp = Pattern.compile("^(\\S*?)(\\.(\\d+))?(\\s+GI:(\\S+))?$");
    // reference line
    // refRange matches a single "N to M" range: group 1 = N, group 2 = M
    protected static final Pattern refRange = Pattern.compile("^\\s*(\\d+)\\s+to\\s+(\\d+)$");
    // refp: group 1 = reference rank, group 3 = the ';'-separated list of
    // "N to M" ranges when a (bases ...) or (residues ...) clause is present
    protected static final Pattern refp = Pattern.compile("^(\\d+)\\s*(?:(\\((?:bases|residues)\\s+(\\d+\\s+to\\s+\\d+(\\s*;\\s*\\d+\\s+to\\s+\\d+)*)\\))|\\(sites\\))?");
    // dbxref line: group 1 = database name, group 2 = accession within that database
    protected static final Pattern dbxp = Pattern.compile("^([^:]+):(\\S+)$");
    //sections start at a line and continue till the first line afterwards with a
    //non-whitespace first character
    //we want to match any of the following as a new section within a section
    //  \s{0,8} word \s{0,7} value
    //  \s{21} /word = value
    //  \s{21} /word
    // key is group 2, 4 or 6 (whichever alternative matched); value is group 3 or 5
    protected static final Pattern sectp = Pattern.compile("^(\\s{0,8}(\\S+)\\s{0,7}(.*)|\\s{21}(/\\S+?)=(.*)|\\s{21}(/\\S+))$");
    
    // File names accepted without looking at the content: names ending in "gb"
    // or "gp" followed by any number of 'k' characters (e.g. "gbk"), or names
    // containing ".gb" or ".gp" (\u002e is a literal dot).
    protected static final Pattern readableFiles = Pattern.compile(".*(g[bp]k*$|\\u002eg[bp].*)");
    // A first line that begins with the word LOCUS.
    protected static final Pattern headerLine = Pattern.compile("^LOCUS.*");
144    
    // A fixed set of feature qualifier names, filled once at class
    // initialisation. Nothing in the reading code visible here consults it.
    // NOTE(review): going by the name, these are presumably the qualifiers
    // whose values are written without surrounding quotes - confirm against
    // the code that writes features.
    private final static HashSet isNotQuoted = new HashSet();
    static {
        isNotQuoted.add("anticodon");
        isNotQuoted.add("citation");
        isNotQuoted.add("codon");
        isNotQuoted.add("codon_start");
        isNotQuoted.add("compare");
        isNotQuoted.add("cons_splice");
        isNotQuoted.add("direction");
        isNotQuoted.add("estimated_length");
        isNotQuoted.add("label");
        isNotQuoted.add("mod_base");
        isNotQuoted.add("number");
        isNotQuoted.add("rpt_type");
        isNotQuoted.add("rpt_unit_range");
        isNotQuoted.add("transl_except");
        isNotQuoted.add("transl_table");
    }
163    
164    /**
165     * Implements some GenBank-specific terms.
166     */
167    public static class Terms extends RichSequence.Terms {        
168        /**
169         * Getter for the Genbank term
170         * @return The genbank Term
171         */
172        public static ComparableTerm getGenBankTerm() {
173            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("GenBank");
174        }
175    }
176    
177    /**
178     * {@inheritDoc}
179     * A file is in GenBank format if the name ends with gbk, contains the letters egb, or the first line of
180     * the file starts with the word LOCUS
181     */
182    public boolean canRead(File file) throws IOException {
183        if (readableFiles.matcher(file.getName()).matches()) return true;
184        BufferedReader br = new BufferedReader(new FileReader(file));
185        final String firstLine = br.readLine();
186        boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches();
187        br.close();
188        return readable;
189    }
190    
191    /**
192     * {@inheritDoc}
193     * Returns an dna parser if the letters DNA or RNA appear in the first line of the file.
194     * Otherwise returns a DNA tokenizer.
195     */
196    public SymbolTokenization guessSymbolTokenization(File file) throws IOException {
197        BufferedReader br = new BufferedReader(new FileReader(file));
198        String firstLine = br.readLine();
199        boolean dna = (firstLine.indexOf("DNA") >0 || firstLine.indexOf("RNA") > 0);
200        br.close();
201        if (dna) return RichSequence.IOTools.getDNAParser();
202        else return RichSequence.IOTools.getProteinParser();
203    }
204    
205    /**
206     * {@inheritDoc}
207     * A stream is in GenBank format if the first line of the stream starts with the word LOCUS
208     */
209    public boolean canRead(BufferedInputStream stream) throws IOException {
210        stream.mark(2000); // some streams may not support this
211        BufferedReader br = new BufferedReader(new InputStreamReader(stream));
212        final String firstLine = br.readLine();
213        boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches();
214        // don't close the reader as it'll close the stream too.
215        // br.close();
216        stream.reset();
217        return readable;
218    }
219    
220    /**
221     * {@inheritDoc}
222     * Returns an dna parser if the letters DNA or RNA appear in the first line of the stream.
223     * Otherwise returns a DNA tokenizer.
224     */
225    public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException {
226        stream.mark(2000); // some streams may not support this
227        BufferedReader br = new BufferedReader(new InputStreamReader(stream));
228        String firstLine = br.readLine();
229        boolean dna = (firstLine.indexOf("DNA") >0 || firstLine.indexOf("RNA") > 0);
230        // don't close the reader as it'll close the stream too.
231        // br.close();
232        stream.reset();
233        if (dna) return RichSequence.IOTools.getDNAParser();
234        else return RichSequence.IOTools.getProteinParser();
235    }
236    
237    /**
238     * {@inheritDoc}
239     */
240    public boolean readSequence(BufferedReader reader,
241            SymbolTokenization symParser,
242            SeqIOListener listener)
243            throws IllegalSymbolException, IOException, ParseException {
244        if (!(listener instanceof RichSeqIOListener)) throw new IllegalArgumentException("Only accepting RichSeqIOListeners today");
245        return this.readRichSequence(reader,symParser,(RichSeqIOListener)listener,null);
246    }
247    
248    private String sectionKey = null;
249    private NCBITaxon tax = null;
250    private String organism = null;
251    private String accession = null;
252    private String identifier = null;
253    /**
254     * {@inheritDoc}
255     */
256    public boolean readRichSequence(BufferedReader reader,
257            SymbolTokenization symParser,
258            RichSeqIOListener rlistener,
259            Namespace ns)
260            throws IllegalSymbolException, IOException, ParseException {
261        
262        sectionKey = null;
263        tax = null;
264        organism = null;
265        accession = null;
266        identifier = null;
267        boolean hasAnotherSequence = true;
268        //boolean hasInternalWhitespace = false;
269        
270        rlistener.startSequence();
271        
272        if (ns==null) ns=RichObjectFactory.getDefaultNamespace();
273        rlistener.setNamespace(ns);
274        
275        // Get an ordered list of key->value pairs in array-tuples
276        List section = null;
277        try{
278            do {
279                section = this.readSection(reader);
280                sectionKey = ((String[])section.get(0))[0];
281                if(sectionKey == null){
282                    String message = ParseException.newMessage(this.getClass(), accession, identifier, "Section key was null", sectionToString(section));
283                    throw new ParseException(message);
284                }
285                // process section-by-section
286                if (sectionKey.equals(LOCUS_TAG)) {
287                    String loc = ((String[])section.get(0))[1];
288                    Matcher m = lp.matcher(loc);
289                    if (m.matches()) {
290                        rlistener.setName(m.group(1));
291                        accession = m.group(1); // default if no accession found
292                        rlistener.setAccession(accession);
293                        if (m.group(4)!=null)
294                            rlistener.addSequenceProperty(Terms.getMolTypeTerm(),m.group(4));
295                        // Optional extras
296                        String stranded = m.group(3);
297                        if(stranded!=null && stranded.equals("ss-"))
298                            stranded = "single";
299                        else if(stranded!=null && stranded.equals("ms-"))
300                            stranded = "mixed";
301                        else if(stranded!=null && stranded.equals("ds-"))
302                            stranded = "double";
303                        String circular = m.group(5);
304                        String fifth = m.group(6);
305                        String sixth = m.group(7);
306                        if (stranded!=null) rlistener.addSequenceProperty(Terms.getStrandedTerm(),stranded);
307                        if (circular!=null && circular.equalsIgnoreCase("circular")) rlistener.setCircular(true);
308                        if (sixth != null) {
309                            rlistener.setDivision(fifth);
310                            rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(),sixth);
311                        } else if (fifth!=null) {
312                            rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(),fifth);
313                        }
314                    } else {
315                        String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad locus line", sectionToString(section));
316                        throw new ParseException(message);
317                    }
318                } else if (sectionKey.equals(DEFINITION_TAG)) {
319                    rlistener.setDescription(((String[])section.get(0))[1]);
320                } else if (sectionKey.equals(ACCESSION_TAG)) {
321                    // if multiple accessions, store only first as accession,
322                    // and store rest in annotation
323                    String[] accs = ((String[])section.get(0))[1].split("\\s+");
324                    accession = accs[0].trim();
325                    rlistener.setAccession(accession);
326                    for (int i = 1; i < accs.length; i++) {
327                        rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accs[i].trim());
328                    }
329                } else if (sectionKey.equals(VERSION_TAG)) {
330                    String ver = ((String[])section.get(0))[1];
331                    Matcher m = vp.matcher(ver);
332                    if (m.matches()) {
333                        String verAcc = m.group(1);
334                        if (!accession.equals(verAcc)) {
335                            // the version refers to a different accession!
336                            // believe the version line, and store the original
337                            // accession away in the additional accession set
338                            rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accession);
339                            accession = verAcc;
340                            rlistener.setAccession(accession);
341                        }
342                        if (m.group(3)!=null) rlistener.setVersion(Integer.parseInt(m.group(3)));
343                        if (m.group(5)!=null) {
344                            identifier = m.group(5);
345                            rlistener.setIdentifier(identifier);
346                        }
347                    } else {
348                        String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad version line", sectionToString(section));
349                        throw new ParseException(message);
350                    }
351                } else if (sectionKey.equals(KEYWORDS_TAG)) {
352                    String val = ((String[])section.get(0))[1];
353                    if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot
354                    val = val.replace('\n',' '); //remove newline
355                    String[] kws = val.split(";");
356                    
357                    for (int i = 0; i < kws.length; i++) {
358                        String kw = kws[i].trim();
359                        if (kw.length()==0) continue;
360                        rlistener.addSequenceProperty(Terms.getKeywordTerm(), kw);
361                    }
362                } else if (sectionKey.equals(SOURCE_TAG)) {
363                    // ignore - can get all this from the first feature
364                } else if (sectionKey.equals(REFERENCE_TAG) && !this.getElideReferences()) {
365                    // first line of section has rank and location
366                    int ref_rank;
367                    List baseRangeList=null;
368                    String ref = ((String[])section.get(0))[1];
369                    Matcher m = refp.matcher(ref);
370                    if (m.matches()) {
371                        ref_rank = Integer.parseInt(m.group(1));
372                        if (m.group(3) != null) baseRangeList=buildBaseRanges(m.group(3));
373                    } else {
374                        String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad reference line", sectionToString(section));
375                        throw new ParseException(message);
376                    }
377                    // rest can be in any order
378                    String authors = null;
379                    String consortium = null;
380                    String title = null;
381                    String journal = null;
382                    String medline = null;
383                    String pubmed = null;
384                    String remark = null;
385                    for (int i = 1; i < section.size(); i++) {
386                        String key = ((String[])section.get(i))[0];
387                        String val = ((String[])section.get(i))[1];
388                        if (key.equals(AUTHORS_TAG)) authors = val.replace('\n',' '); //see #2276
389                        else if (key.equals(CONSORTIUM_TAG)) consortium = val.replace('\n',' '); //see #2276
390                        else if (key.equals(TITLE_TAG)) title = val.replace('\n',' '); //see #2276
391                        else if (key.equals(JOURNAL_TAG)) journal = val.replace('\n',' '); //see #2276
392                        else if (key.equals(MEDLINE_TAG)) medline = val;
393                        else if (key.equals(PUBMED_TAG)) pubmed = val;
394                        else if (key.equals(REMARK_TAG)) remark = val.replace('\n',' '); //see #2276
395                    }
396                    
397                    // create the docref object
398                    try {
399                        // Use consortium as well if present.
400                        if (authors==null) authors = consortium + " (consortium)";
401                        else if (consortium!=null) authors = authors + ", " + consortium + " (consortium)";
402                        // Create docref.
403                        DocRef dr = null;
404                        // assign either the pubmed or medline to the docref - medline gets priority
405                        if (medline != null) {
406                            dr = (DocRef) RichObjectFactory.getObject(SimpleDocRef.class, new Object[]{DocRefAuthor.Tools.parseAuthorString(authors), journal, title, Terms.MEDLINE_KEY, medline, new Integer(0)});
407                            if (dr.getCrossref() == null) {
408                                dr.setCrossref((CrossRef) RichObjectFactory.getObject(SimpleCrossRef.class, new Object[]{Terms.MEDLINE_KEY, medline, new Integer(0)}));
409                            }
410                        } else if (pubmed != null) {
411                            dr = (DocRef) RichObjectFactory.getObject(SimpleDocRef.class, new Object[]{DocRefAuthor.Tools.parseAuthorString(authors), journal, title, Terms.PUBMED_KEY, pubmed, new Integer(0)});
412                            if (dr.getCrossref() == null) {
413                                dr.setCrossref((CrossRef) RichObjectFactory.getObject(SimpleCrossRef.class, new Object[]{Terms.PUBMED_KEY, pubmed, new Integer(0)}));
414                            }
415                        } else {
416                            dr = (DocRef) RichObjectFactory.getObject(SimpleDocRef.class, new Object[]{DocRefAuthor.Tools.parseAuthorString(authors), journal, title});
417                        }                        
418                        // assign the remarks
419                        if (!this.getElideComments()) dr.setRemark(remark);
420                        // assign the docref to the bioentry: null if no base ranges, Integers if 1 base range - the normal case, joined RichLocation if more than 1
421                        RankedDocRef rdr = baseRangeList == null?new SimpleRankedDocRef(dr, null, null, ref_rank):(baseRangeList.size()==1?new SimpleRankedDocRef(dr, new Integer(((RichLocation)baseRangeList.get(0)).getMin()), new Integer(((RichLocation)baseRangeList.get(0)).getMax()), ref_rank):new SimpleRankedDocRef(dr, new CompoundRichLocation(baseRangeList), ref_rank));
422                        rlistener.setRankedDocRef(rdr);
423                    } catch (ChangeVetoException e) {
424                        throw new ParseException(e+", accession:"+accession);
425                    }
426                } else if (sectionKey.equals(COMMENT_TAG) && !this.getElideComments()) {
427                    // Set up some comments
428                    rlistener.setComment(((String[])section.get(0))[1]);
429                } else if (sectionKey.equals(FEATURE_TAG) && !this.getElideFeatures()) {
430                    // starting from second line of input, start a new feature whenever we come across
431                    // a key that does not start with /
432                    boolean seenAFeature = false;
433                    int rcrossrefCount = 0;
434                    boolean skippingBond = false;
435                    for (int i = 1 ; i < section.size(); i++) {
436                        String key = ((String[])section.get(i))[0];
437                        String val = ((String[])section.get(i))[1];
438                        if (key.startsWith("/")) {
439                                  if(!skippingBond)
440                                  {
441                                    key = key.substring(1); // strip leading slash
442                                    val = val.replaceAll("\\s*[\\n\\r]+\\s*"," ").trim();
443                                    if (val.endsWith("\"")) val = val.substring(1,val.length()-1); // strip quotes
444                                    // parameter on old feature
445                                    if (key.equals("db_xref")) {
446                                    val = val.replaceAll("\\s+","");
447                                        Matcher m = dbxp.matcher(val);
448                                        if (m.matches()) {
449                                            String dbname = m.group(1);
450                                            String raccession = m.group(2);
451                                            if (dbname.equalsIgnoreCase("taxon")) {
452                                                // Set the Taxon instead of a dbxref
453                                                tax = (NCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class, new Object[]{Integer.valueOf(raccession)});
454                                                rlistener.setTaxon(tax);
455                                                try {
456                                                    if (organism!=null) tax.addName(NCBITaxon.SCIENTIFIC,organism.replace('\n', ' '));// readSection can embed new lines
457                                                } catch (ChangeVetoException e) {
458                                                    throw new ParseException(e+", accession:"+accession);
459                                                }
460                                            } else {
461                                                try {
462                                                    CrossRef cr = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{dbname, raccession, new Integer(0)});
463                                                    RankedCrossRef rcr = new SimpleRankedCrossRef(cr, ++rcrossrefCount);
464                                                    rlistener.getCurrentFeature().addRankedCrossRef(rcr);
465                                                } catch (ChangeVetoException e) {
466                                                    throw new ParseException(e+", accession:"+accession);
467                                                }
468                                            }
469                                        } else {
470                                            String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad dbxref", sectionToString(section));
471                                            throw new ParseException(message);
472                                        }
473                                    } else if (key.equalsIgnoreCase("organism")) {
474                                        try {
475                                            organism = val;
476                                            if (tax!=null) tax.addName(NCBITaxon.SCIENTIFIC,organism.replace('\n', ' '));// readSection can embed new lines
477                                        } catch (ChangeVetoException e) {
478                                            throw new ParseException(e+", accession:"+accession);
479                                        }
480                                    } else {
481                                        if (key.equalsIgnoreCase("translation")) {
482                                            // strip spaces from sequence
483                                            val = val.replaceAll("\\s+","");
484                                        }
485                                        rlistener.addFeatureProperty(RichObjectFactory.getDefaultOntology().getOrCreateTerm(key),val);
486                                    }
487                                }
488                        } else {
489                            // new feature!
490                            // end previous feature
491                            if(key.equalsIgnoreCase("bond"))
492                            {
493                                skippingBond = true;
494                            }
495                            else
496                            {
497                                skippingBond = false;
498                                if (seenAFeature) {
499                                        rlistener.endFeature();
500                                }
501                                    // start next one, with lots of lovely info in it
502                                    RichFeature.Template templ = new RichFeature.Template();
503                                    templ.annotation = new SimpleRichAnnotation();
504                                    templ.sourceTerm = Terms.getGenBankTerm();
505                                    templ.typeTerm = RichObjectFactory.getDefaultOntology().getOrCreateTerm(key);
506                                    templ.featureRelationshipSet = new TreeSet();
507                                    templ.rankedCrossRefs = new TreeSet();
508                                    String tidyLocStr = val.replaceAll("\\s+","");
509                                    templ.location = GenbankLocationParser.parseLocation(ns, accession, tidyLocStr);
510                                    rlistener.startFeature(templ);
511                                    seenAFeature = true;
512                                    rcrossrefCount = 0;
513                            }
514                            
515                        }
516                    }
517                    
518                    if (seenAFeature) {
519                        rlistener.endFeature();
520                    }
521                } else if (sectionKey.equals(BASE_COUNT_TAG)) {
522                    // ignore - can calculate from sequence content later if needed
523                } else if (sectionKey.equals(START_SEQUENCE_TAG) && !this.getElideSymbols()) {
524                    // our first line is ignorable as it is the ORIGIN tag
525                    // the second line onwards conveniently have the number as
526                    // the [0] tuple, and sequence string as [1] so all we have
527                    // to do is concat the [1] parts and then strip out spaces,
528                    // and replace '.' and '~' with '-' for our parser.
529                    StringBuffer seq = new StringBuffer();
530                    for (int i = 1 ; i < section.size(); i++) seq.append(((String[])section.get(i))[1]);
531                    try {
532                        SymbolList sl = new SimpleSymbolList(symParser,
533                                seq.toString().replaceAll("\\s+","").replaceAll("[\\.|~]","-"));
534                        rlistener.addSymbols(symParser.getAlphabet(),
535                                (Symbol[])(sl.toList().toArray(new Symbol[0])),
536                                0, sl.length());
537                    } catch (IllegalAlphabetException e) {
538                        String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad sequence section", sectionToString(section));
539                        throw new ParseException(e, message);
540                    }
541                }
542            } while (!sectionKey.equals(END_SEQUENCE_TAG));
543        }catch(RuntimeException e){
544            String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad sequence section", sectionToString(section));
545            throw new ParseException(e, message);
546        }
547        
548        // Allows us to tolerate trailing whitespace without
549        // thinking that there is another Sequence to follow
550        while (true) {
551            reader.mark(1);
552            int c = reader.read();
553            if (c == -1) {
554                hasAnotherSequence = false;
555                break;
556            }
557            if (Character.isWhitespace((char) c)) {
558                //hasInternalWhitespace = true;
559                continue;
560            }
561            //if (hasInternalWhitespace)
562            //    System.err.println("Warning: whitespace found between sequence entries");
563            reader.reset();
564            break;
565        }
566        
567        // Finish up.
568        rlistener.endSequence();
569        return hasAnotherSequence;
570    }
571    
572    // reads an indented section, combining split lines and creating a list of key->value tuples
573    private List readSection(BufferedReader br) throws ParseException {
574        List section = new ArrayList();
575        String line = "";
576        String currKey = null;
577        StringBuffer currVal = new StringBuffer();
578        boolean done = false;
579        int linecount = 0;
580        
581        try {
582            while (!done) {
583                br.mark(320);
584                line = br.readLine();
585                String firstSecKey = section.isEmpty() ? "" : ((String[])section.get(0))[0];
586                if (line != null && line.matches("\\p{Space}*")) {
587                   // regular expression \p{Space}* will match line 
588                   // having only white space characters
589                   continue;
590                }
591                if (line==null || (!line.startsWith(" ") && linecount++>0 && ( !firstSecKey.equals(START_SEQUENCE_TAG)  || line.startsWith(END_SEQUENCE_TAG)))) {
592                    // dump out last part of section
593                    section.add(new String[]{currKey,currVal.toString()});
594                    br.reset();
595                    done = true;
596                } else {
597                    if (getElideSymbols() && firstSecKey.equals(START_SEQUENCE_TAG) && !line.startsWith(END_SEQUENCE_TAG)) {
598                        continue;
599                    }
600                    Matcher m = sectp.matcher(line);
601                    if (m.matches()) {
602                        // new key
603                        if (currKey!=null) section.add(new String[]{currKey,currVal.toString()});
604                        // key = group(2) or group(4) or group(6) - whichever is not null
605                        currKey = m.group(2)==null?(m.group(4)==null?m.group(6):m.group(4)):m.group(2);
606                        currVal = new StringBuffer();
607                        // val = group(3) if group(2) not null, group(5) if group(4) not null, "" otherwise, trimmed
608                        currVal.append((m.group(2)==null?(m.group(4)==null?"":m.group(5)):m.group(3)).trim());
609                    } else {
610                        // concatted line or SEQ START/END line?
611                        if (line.startsWith(START_SEQUENCE_TAG) || line.startsWith(END_SEQUENCE_TAG)) currKey = line;
612                        else {
613                            currVal.append("\n"); // newline in between lines - can be removed later
614                            currVal.append(currKey.charAt(0)=='/'?line.substring(21):line.substring(12));
615                        }
616                    }
617                }
618            }
619        } catch (IOException e) {
620            String message = ParseException.newMessage(this.getClass(), accession, identifier, "", sectionToString(section));
621            throw new ParseException(e, message);
622        } catch (RuntimeException e){
623            String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad line", line);
624            throw new ParseException(e, message);
625        }
626        return section;
627    }
628    
629    private final List buildBaseRanges(final String theBaseRangeList) throws ParseException {
630        if (theBaseRangeList == null) return null;
631        final List baseRangeList = new ArrayList();
632        final String[] baseRange = theBaseRangeList.split(";");
633        try{
634        for (int r=0; r<baseRange.length; r++) {
635            final Matcher rangeMatch = refRange.matcher(baseRange[r]);
636            if (rangeMatch.matches()) {
637                final int rangeStart = Integer.parseInt(rangeMatch.group(1));
638                final int rangeEnd = Integer.parseInt(rangeMatch.group(2));
639                baseRangeList.add(new SimpleRichLocation(new SimplePosition(rangeStart), new SimplePosition(rangeEnd), r));
640            } else {
641                String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad reference range found", theBaseRangeList);
642                throw new ParseException(message);
643            }
644        }
645        return baseRangeList;
646        }catch(RuntimeException e){
647            String message = ParseException.newMessage(this.getClass(), accession, identifier, "Bad base range", theBaseRangeList);
648            throw new ParseException(e, message);
649        }
650    }
651    
652    /**
653     * {@inheritDoc}
654     */
655    public void writeSequence(Sequence seq, PrintStream os) throws IOException {
656        if (this.getPrintStream()==null) this.setPrintStream(os);
657        this.writeSequence(seq, RichObjectFactory.getDefaultNamespace());
658    }
659    
660    /**
661     * {@inheritDoc}
662     */
663    public void writeSequence(Sequence seq, String format, PrintStream os) throws IOException {
664        if (this.getPrintStream()==null) this.setPrintStream(os);
665        if (!format.equals(this.getDefaultFormat())) throw new IllegalArgumentException("Unknown format: "+format);
666        this.writeSequence(seq, RichObjectFactory.getDefaultNamespace());
667    }
668    
669    /**
670     * {@inheritDoc}
671     * Namespace is ignored as Genbank has no concept of it.
672     */
673    public void writeSequence(Sequence seq, Namespace ns) throws IOException {
674        RichSequence rs;
675        try {
676            if (seq instanceof RichSequence) rs = (RichSequence)seq;
677            else rs = RichSequence.Tools.enrich(seq);
678        } catch (ChangeVetoException e) {
679            IOException e2 = new IOException("Unable to enrich sequence");
680            e2.initCause(e);
681            throw e2;
682        }
683        
684        SymbolTokenization tok;
685        try {
686            tok = rs.getAlphabet().getTokenization("token");
687        } catch (Exception e) {
688            throw new RuntimeException("Unable to get alphabet tokenizer",e);
689        }
690        Set<Note> notes = rs.getNoteSet();
691        String accession = rs.getAccession();
692        StringBuffer accessions = new StringBuffer();
693        accessions.append(accession);
694        String stranded = "";
695        String udat = "";
696        String moltype = rs.getAlphabet().getName();
697        if ("PROTEIN-TERM".equals(moltype) || "PROTEIN".equals(moltype)) moltype = null; //a genpept curiosity
698        StringBuffer keywords = new StringBuffer();
699        for (Iterator<Note> i = notes.iterator(); i.hasNext(); ) {
700            Note n = i.next();
701            if (n.getTerm().equals(Terms.getStrandedTerm())) {
702                String value = n.getValue();
703                if(value != null && value.equals("single"))
704                    stranded= "ss-";
705                else if(value != null && value.equals("mixed"))
706                    stranded= "ms-";
707            }
708            else if (n.getTerm().equals(Terms.getDateUpdatedTerm())) udat=n.getValue();
709            else if (n.getTerm().equals(Terms.getMolTypeTerm())) moltype=n.getValue();
710            else if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) {
711                accessions.append(" ");
712                accessions.append(n.getValue());
713            } else if (n.getTerm().equals(Terms.getKeywordTerm())) {
714                if (n.getValue() != null) {
715                    if (keywords.length()>0) keywords.append("; ");
716                    keywords.append(n.getValue());
717                }
718            }
719        }
720        
721        //adjust molecule type during format conversion
722        if(moltype!=null && moltype.length()>6) {
723            if(moltype.indexOf("DNA")!=-1) moltype = "DNA";
724            else if(moltype.indexOf("RNA")!=-1) moltype = "RNA";
725            else moltype = "NA";
726        }
727        
728        // locus(name) + length + alpha + div + date line
729        StringBuffer locusLine = new StringBuffer();
730        locusLine.append(StringTools.rightPad(rs.getName(),16));//13->28=15+1=16
731        locusLine.append(" ");//29
732        locusLine.append(StringTools.leftPad(""+rs.length(),11));//30->40=10+1=11
733        locusLine.append(" "+ (moltype==null? "aa":"bp") +" ");//41->44
734        locusLine.append(StringTools.leftPad(stranded,3));//45->47=2+1=3
735        locusLine.append(StringTools.rightPad(moltype==null?"":moltype,6));//48->53=5+1=6
736        locusLine.append("  ");//54->55
737        locusLine.append(StringTools.rightPad(rs.getCircular()?"circular":"linear",8));//56->63=7+1=8
738        locusLine.append(" ");//64->64
739        String div = rs.getDivision()==null?"":rs.getDivision();
740        if(div.length()>3) div = ""; // Not a GenBank division, maybe UniProt, etc.
741        locusLine.append(StringTools.rightPad(div,3));//65->67=2+1=3
742        locusLine.append(" ");//68->68
743        locusLine.append(StringTools.rightPad(udat,11));//69->79=10+1=11
744        StringTools.writeKeyValueLine(LOCUS_TAG, locusLine.toString(), 12, this.getLineWidth(), this.getPrintStream());
745        
746        // definition line
747        StringTools.writeKeyValueLine(DEFINITION_TAG, rs.getDescription(), 12, this.getLineWidth(), this.getPrintStream());
748        
749        // accession line
750        StringTools.writeKeyValueLine(ACCESSION_TAG, accessions.toString(), 12, this.getLineWidth(), this.getPrintStream());
751        
752        // version + gi line
753        String version = accession+"."+rs.getVersion();
754        if (rs.getIdentifier()!=null) version = version + "  GI:"+rs.getIdentifier();
755        StringTools.writeKeyValueLine(VERSION_TAG, version, 12, this.getLineWidth(), this.getPrintStream());
756        
757        // keywords line
758        keywords.append(".");
759        StringTools.writeKeyValueLine(KEYWORDS_TAG, keywords.toString(), 12, this.getLineWidth()-1, this.getPrintStream());
760        
761        // source line (from taxon)
762        //   organism line
763        NCBITaxon tax = rs.getTaxon();
764        if (tax!=null) {
765            StringTools.writeKeyValueLine(SOURCE_TAG, (isMitochondrial(rs)?"mitochondrion ":"")+tax.getDisplayName(), 12, this.getLineWidth(), this.getPrintStream());
766            StringTools.writeKeyValueLine("  "+ORGANISM_TAG, tax.getDisplayName().split("\\s+\\(")[0]+"\n"+tax.getNameHierarchy(), 12, this.getLineWidth()-1, this.getPrintStream());
767        }
768        
769        // references - rank (bases x to y)
770        for (Iterator<RankedDocRef> r = rs.getRankedDocRefs().iterator(); r.hasNext(); ) {
771            RankedDocRef rdr = r.next();
772            DocRef d = rdr.getDocumentReference();
773            StringTools.writeKeyValueLine(REFERENCE_TAG, rdr.getRank()+((rdr.getLocation()==null || rdr.getLocation() ==RichLocation.EMPTY_LOCATION)?"": (moltype==null? "  (residues ":"  (bases ")+makeBaseRange(rdr)+")"), 12, this.getLineWidth(), this.getPrintStream());
774            // Any authors that were in the input as CONSRTM tags will
775            // be merged into the AUTHORS tag on output.
776            StringTools.writeKeyValueLine("  "+AUTHORS_TAG, d.getAuthors(), 12, this.getLineWidth()-1, this.getPrintStream());
777            StringTools.writeKeyValueLine("  "+TITLE_TAG, d.getTitle(), 12, this.getLineWidth(), this.getPrintStream());
778            StringTools.writeKeyValueLine("  "+JOURNAL_TAG, d.getLocation(), 12, this.getLineWidth(), this.getPrintStream());
779            CrossRef c = d.getCrossref();
780            if (c!=null) StringTools.writeKeyValueLine(StringTools.leftPad(c.getDbname(),9), c.getAccession(), 12, this.getLineWidth(), this.getPrintStream());
781            StringTools.writeKeyValueLine("  "+REMARK_TAG, d.getRemark(), 12, this.getLineWidth(), this.getPrintStream());
782        }
783        
784        // comments - if any
785        Set<Comment> comments = rs.getComments();
786        if (!comments.isEmpty()) {
787            StringBuffer sb = new StringBuffer();
788            for (Iterator<Comment> i = comments.iterator(); i.hasNext(); ) {
789                Comment c = i.next();
790                sb.append(c.getComment());
791                if (i.hasNext()) sb.append("\n");
792            }
793            StringTools.writeKeyValueLine(COMMENT_TAG, sb.toString(), 12, this.getLineWidth(), this.getPrintStream());
794        }
795        
796        this.getPrintStream().println(FEATURE_TAG+"             Location/Qualifiers");
797        // feature_type     location
798        for (Iterator i = rs.getFeatureSet().iterator(); i.hasNext(); ) {
799            RichFeature f = (RichFeature)i.next();
800            StringTools.writeKeyValueLine("     "+f.getTypeTerm().getName(), GenbankLocationParser.writeLocation((RichLocation)f.getLocation()), 21, this.getLineWidth()-1, ",", this.getPrintStream());
801            for (Iterator<Note> j = f.getNoteSet().iterator(); j.hasNext(); ) {
802                Note n = j.next();
803                // /key="val" or just /key if val==""
804                if (n.getValue()==null || n.getValue().length()==0) StringTools.writeKeyValueLine("", "/"+n.getTerm().getName(), 21, this.getLineWidth(), this.getPrintStream());
805                else if (isNotQuoted(n)) {// doesn't have the value enclosed in quotes
806                    StringTools.writeKeyValueLine("", "/"+n.getTerm().getName()+"="+n.getValue(), 21, this.getLineWidth(), this.getPrintStream());
807                } else if (n.getTerm().getName().equals("translation")) {
808                    StringTools.writeKeyValueLine("", "/"+n.getTerm().getName()+"=\""+n.getValue()+"\"", 21, this.getLineWidth()-1, this.getPrintStream());
809                } else {
810                    StringTools.writeKeyValueLine("", "/"+n.getTerm().getName()+"=\""+n.getValue()+"\"", 21, this.getLineWidth(), this.getPrintStream());
811                }
812            }
813            // add-in to source feature only organism and db_xref="taxon:xyz" where present
814            if (f.getType().equals("source") && tax!=null) {
815                String displayName = tax.getDisplayName();
816                if (displayName.indexOf('(')>-1) displayName = displayName.substring(0, displayName.indexOf('(')).trim();
817                StringTools.writeKeyValueLine("", "/organism=\""+displayName+"\"", 21, this.getLineWidth()-1, this.getPrintStream());// AF252370 fits in exactly 80 - but is wrapped
818                for (Iterator<RankedCrossRef> j = f.getRankedCrossRefs().iterator(); j.hasNext(); ) {
819                    RankedCrossRef rcr = j.next();
820                    CrossRef cr = rcr.getCrossRef();
821                    StringTools.writeKeyValueLine("", "/db_xref=\""+cr.getDbname()+":"+cr.getAccession()+"\"", 21, this.getLineWidth(), this.getPrintStream());
822                }
823                StringTools.writeKeyValueLine("", "/db_xref=\"taxon:"+tax.getNCBITaxID()+"\"", 21, this.getLineWidth(), this.getPrintStream());
824            } else {
825                // add-in other dbxrefs where present
826                for (Iterator<RankedCrossRef> j = f.getRankedCrossRefs().iterator(); j.hasNext(); ) {
827                    RankedCrossRef rcr = j.next();
828                    CrossRef cr = rcr.getCrossRef();
829                    StringTools.writeKeyValueLine("", "/db_xref=\""+cr.getDbname()+":"+cr.getAccession()+"\"", 21, this.getLineWidth(), this.getPrintStream());
830                }
831            }
832        }
833        
834        //BASE COUNT obsolete in Genbank flatfile format since October 2003
835        //if (rs.getAlphabet()==AlphabetManager.alphabetForName("DNA")) {
836        //    // BASE COUNT     1510 a   1074 c    835 g   1609 t
837        //    int aCount = 0;
838        //    int cCount = 0;
839        //    int gCount = 0;
840        //    int tCount = 0;
841        //    int oCount = 0;
842        //    for (int i = 1; i <= rs.length(); i++) {
843        //        char c;
844        //        try {
845        //            c = tok.tokenizeSymbol(rs.symbolAt(i)).charAt(0);
846        //        } catch (Exception e) {
847        //            throw new RuntimeException("Unable to get symbol at position "+i,e);
848        //        }
849        //        switch (c) {
850        //            case 'a': case 'A':
851        //                aCount++;
852        //                break;
853        //            case 'c': case 'C':
854        //                cCount++;
855        //                break;
856        //            case 'g': case 'G':
857        //                gCount++;
858        //                break;
859        //            case 't': case 'T':
860        //                tCount++;
861        //                break;
862        //            default:
863        //                oCount++;
864        //        }
865        //    }
866        //
867        //    this.getPrintStream().print(BASE_COUNT_TAG_FULL+"    ");
868        //    this.getPrintStream().print(aCount + " a   ");
869        //    this.getPrintStream().print(cCount + " c   ");
870        //    this.getPrintStream().print(gCount + " g   ");
871        //    this.getPrintStream().print(tCount + " t    ");
872        //    this.getPrintStream().println(oCount + " others");
873        //}
874        
875        this.getPrintStream().println(START_SEQUENCE_TAG);
876        // sequence stuff
877        Symbol[] syms = (Symbol[])rs.toList().toArray(new Symbol[0]);
878        int lines = 0;
879        int symCount = 0;
880        for (int i = 0; i < syms.length; i++) {
881            if (symCount % 60 == 0) {
882                if (lines > 0) this.getPrintStream().print("\n"); // newline from previous line
883                int lineNum = (lines*60) + 1;
884                this.getPrintStream().print(StringTools.leftPad(""+lineNum,9));
885                lines++;
886            }
887            if (symCount % 10 == 0) this.getPrintStream().print(" ");
888            try {
889                this.getPrintStream().print(tok.tokenizeSymbol(syms[i]));
890            } catch (IllegalSymbolException e) {
891                throw new RuntimeException("Found illegal symbol: "+syms[i]);
892            }
893            symCount++;
894        }
895        if(syms.length>0) //do not create an empty line
896            this.getPrintStream().print("\n");
897        this.getPrintStream().println(END_SEQUENCE_TAG);
898    }
899    
900    /**
901     * {@inheritDoc}
902     */
903    public String getDefaultFormat() {
904        return GENBANK_FORMAT;
905    }
906    
907    private final static boolean isMitochondrial(final RichSequence theSequence) {
908        final Set featureSet = theSequence.getFeatureSet();
909        final Iterator i = featureSet.iterator();
910        while (i.hasNext()) {
911            final RichFeature feature = (RichFeature) i.next();
912            if (feature.getType().equals("source")) {
913                final Set noteSet = feature.getNoteSet();
914                final Iterator<Note> n = noteSet.iterator();
915                while(n.hasNext()) {
916                    final Note note = n.next();
917                    if (note.getTerm().getName().equals("organelle")) return note.getValue().equals("mitochondrion");
918                }
919            }
920        }
921        return false;
922    }
923    
924    private final static boolean isNotQuoted(final Note theNote) {
925        return isNotQuoted(theNote.getTerm().getName(), theNote.getValue());
926    }
927    
928    private final static boolean isNotQuoted(final String theName, final String theValue) {
929        return isNotQuoted.contains(theName);
930    }
931    
932    private final static String makeBaseRange(final RankedDocRef theReference) {
933        return theReference.getLocation()==null?theReference.getStart()+" to "+theReference.getEnd():toString(theReference.getLocation());
934    }
935    
936    private final static String toString(final RichLocation theLocation) {
937        final StringBuffer list = new StringBuffer();
938        final Iterator b = theLocation.blockIterator();
939        while (b.hasNext()) {
940            final RichLocation location = (RichLocation) b.next();
941            list.append(location.getMin()+" to "+location.getMax());
942            if (b.hasNext()) list.append("; ");
943        }
944        return list.toString();
945    }
946    
947    /**
948     * Converts the current parse section to a String. Useful for debugging.
949     */
950    String sectionToString(List section){
951        StringBuffer parseBlock = new StringBuffer();
952        for(Iterator i = section.listIterator(); i.hasNext();){
953            String[] part = (String[])i.next();
954            for(int x = 0; x < part.length; x++){
955                parseBlock.append(part[x]);
956                if(x == 0){
957                    parseBlock.append("   "); //the gap will have been trimmed
958                }
959            }
960        }
961        return parseBlock.toString();
962    }
963}