001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojavax.bio.seq.io;
023
024import java.io.BufferedInputStream;
025import java.io.BufferedReader;
026import java.io.File;
027import java.io.FileReader;
028import java.io.IOException;
029import java.io.InputStreamReader;
030import java.io.PrintStream;
031import java.io.PrintWriter;
032import java.util.ArrayList;
033import java.util.Collection;
034import java.util.Iterator;
035import java.util.LinkedHashMap;
036import java.util.List;
037import java.util.Map;
038import java.util.Set;
039import java.util.TreeMap;
040import java.util.TreeSet;
041import java.util.regex.Pattern;
042
043import javax.xml.parsers.ParserConfigurationException;
044
045import org.biojava.bio.seq.Sequence;
046import org.biojava.bio.seq.io.ParseException;
047import org.biojava.bio.seq.io.SeqIOListener;
048import org.biojava.bio.seq.io.SymbolTokenization;
049import org.biojava.bio.symbol.IllegalSymbolException;
050import org.biojava.bio.symbol.SimpleSymbolList;
051import org.biojava.bio.symbol.Symbol;
052import org.biojava.bio.symbol.SymbolList;
053import org.biojava.utils.ChangeVetoException;
054import org.biojava.utils.xml.PrettyXMLWriter;
055import org.biojava.utils.xml.XMLWriter;
056import org.biojavax.Comment;
057import org.biojavax.CrossRef;
058import org.biojavax.DocRef;
059import org.biojavax.DocRefAuthor;
060import org.biojavax.Namespace;
061import org.biojavax.Note;
062import org.biojavax.RankedCrossRef;
063import org.biojavax.RankedDocRef;
064import org.biojavax.RichAnnotation;
065import org.biojavax.RichObjectFactory;
066import org.biojavax.SimpleCrossRef;
067import org.biojavax.SimpleDocRef;
068import org.biojavax.SimpleDocRefAuthor;
069import org.biojavax.SimpleNote;
070import org.biojavax.SimpleRankedCrossRef;
071import org.biojavax.SimpleRankedDocRef;
072import org.biojavax.SimpleRichAnnotation;
073import org.biojavax.bio.seq.Position;
074import org.biojavax.bio.seq.RichFeature;
075import org.biojavax.bio.seq.RichLocation;
076import org.biojavax.bio.seq.RichSequence;
077import org.biojavax.bio.seq.RichLocation.Strand;
078import org.biojavax.bio.taxa.NCBITaxon;
079import org.biojavax.bio.taxa.SimpleNCBITaxon;
080import org.biojavax.ontology.ComparableTerm;
081import org.biojavax.utils.StringTools;
082import org.biojavax.utils.XMLTools;
083import org.xml.sax.Attributes;
084import org.xml.sax.SAXException;
085import org.xml.sax.helpers.DefaultHandler;
086
087/**
088 * Format reader for EMBLxml files. This version of EMBLxml format will generate
089 * and write RichSequence objects. Loosely Based on code from the old, deprecated,
090 * org.biojava.bio.seq.io.GenbankXmlFormat object.
091 *
092 * Understands http://www.ebi.ac.uk/embl/dtd/EMBL_Services_V1.1.dtd
093 *
094 * @author Alan Li (code based on his work)
095 * @author Richard Holland
096 * @author Mark Schreiber
097 * @since 1.5
098 */
099public class EMBLxmlFormat extends RichSequenceFormat.BasicFormat {
100    
101    // Register this format with the format auto-guesser.
102    static {
103        RichSequence.IOTools.registerFormat(EMBLxmlFormat.class);
104    }
105    
106    /**
107     * The name of this format
108     */
109    public static final String EMBLXML_FORMAT = "EMBLxml";
110    
111    protected static final String ENTRY_GROUP_TAG = "EMBL_Services";
112    protected static final String ENTRY_TAG = "entry";
113    protected static final String ENTRY_ACCESSION_ATTR = "accession";
114    protected static final String ENTRY_TAX_DIVISION_ATTR = "taxonomicDivision";
115    protected static final String ENTRY_DATACLASS_ATTR = "dataClass";
116    protected static final String ENTRY_CREATED_ATTR = "created";
117    protected static final String ENTRY_RELCREATED_ATTR = "releaseCreated";
118    protected static final String ENTRY_UPDATED_ATTR = "lastUpdated";
119    protected static final String ENTRY_RELUPDATED_ATTR = "releaseLastUpdated";
120    protected static final String ENTRY_VER_ATTR = "version";
121    protected static final String ENTRY_SUBACC_ATTR = "submitterAccession";
122    protected static final String ENTRY_SUBVER_ATTR = "submitterVersion";
123    protected static final String ENTRY_SUBWGSVER_ATTR = "submitterWgsVersion";
124    protected static final String ENTRY_STATUS_ATTR = "status";
125    protected static final String ENTRY_STATUS_DATE_ATTR = "statusDate";
126    
127    protected static final String SEC_ACC_TAG = "secondaryAccession";
128    protected static final String PROJ_ACC_TAG = "projectAccession";
129    protected static final String DESC_TAG = "description";
130    protected static final String KEYWORD_TAG = "keyword";
131    protected static final String REFERENCE_TAG = "reference";
132    
133    protected static final String CITATION_TAG = "citation";
134    protected static final String CITATION_ID_ATTR = "id";
135    protected static final String CITATION_TYPE_ATTR = "type";
136    protected static final String CITATION_DATE_ATTR = "date";
137    protected static final String CITATION_NAME_ATTR = "name";
138    protected static final String CITATION_VOL_ATTR = "volume";
139    protected static final String CITATION_ISSUE_ATTR = "issue";
140    protected static final String CITATION_FIRST_ATTR = "first";
141    protected static final String CITATION_LAST_ATTR = "last";
142    protected static final String CITATION_PUB_ATTR = "publisher";
143    protected static final String CITATION_PATENT_ATTR = "patentNumber";
144    protected static final String CITATION_INSTITUTE_ATTR = "institute";
145    protected static final String CITATION_YEAR_ATTR = "year";
146    
147    protected static final String DBREFERENCE_TAG = "dbreference";
148    protected static final String DBREF_DB_ATTR = "db";
149    protected static final String DBREF_PRIMARY_ATTR = "primary";
150    protected static final String DBREF_SEC_ATTR = "secondary";
151    
152    protected static final String CONSORTIUM_TAG = "consortium";
153    protected static final String TITLE_TAG = "title";
154    protected static final String EDITOR_TAG = "editor";
155    protected static final String AUTHOR_TAG = "author";
156    protected static final String PATENT_TAG = "patentApplicant";
157    protected static final String LOCATOR_TAG = "locator";
158    
159    protected static final String CITATION_LOCATION_TAG = "citationLocation";
160    protected static final String REF_POS_BEGIN_ATTR = "begin";
161    protected static final String REF_POS_END_ATTR = "end";
162    
163    protected static final String COMMENT_TAG = "comment";
164    
165    protected static final String FEATURE_TAG = "feature";
166    protected static final String FEATURE_NAME_ATTR = "name";
167    
168    protected static final String ORGANISM_TAG = "organism";
169    protected static final String SCINAME_TAG = "scientificName";
170    protected static final String COMNAME_TAG = "preferredCommonName";
171    protected static final String TAXID_TAG = "taxId";
172    protected static final String LINEAGE_TAG = "lineage";
173    protected static final String TAXON_TAG = "taxon";
174    protected static final String ORGANELLE_TAG = "organelle";
175    
176    protected static final String QUALIFIER_TAG = "qualifier";
177    protected static final String QUALIFIER_NAME_ATTR = "name";
178    
179    protected static final String LOCATION_TAG = "location";
180    protected static final String LOCATION_TYPE_ATTR = "type";
181    protected static final String LOCATION_COMPL_ATTR = "complement";
182    
183    protected static final String LOCATION_ELEMENT_TAG = "locationElement";
184    protected static final String LOC_ELEMENT_TYPE_ATTR = "type";
185    protected static final String LOC_ELEMENT_ACC_ATTR = "accession";
186    protected static final String LOC_ELEMENT_VER_ATTR = "version";
187    protected static final String LOC_ELEMENT_COMPL_ATTR = "complement";
188    
189    protected static final String BASEPOSITION_TAG = "basePosition";
190    protected static final String BASEPOSITION_TYPE_ATTR = "type";
191    
192    protected static final String CONTIG_TAG = "contig";
193    protected static final String SEQUENCE_TAG = "sequence";
194    protected static final String SEQUENCE_TYPE_ATTR = "type";
195    protected static final String SEQUENCE_LENGTH_ATTR = "length";
196    protected static final String SEQUENCE_TOPOLOGY_ATTR = "topology";
197    protected static final String SEQUENCE_VER_ATTR = "version";
198    
199    protected static final Pattern xmlSchema = Pattern.compile(".*http://www\\.ebi\\.ac\\.uk/schema/EMBL_schema\\.xsd.*");
200    
201    /**
202     * Implements some EMBLxml-specific terms.
203     */
204    public static class Terms extends RichSequence.Terms {        
205        /**
206         * Getter for the SubmitterAccession term
207         * @return The SubmitterAccession Term
208         */
209        public static ComparableTerm getSubmitterAccessionTerm() {
210            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("SubmitterAccession");
211        }
212        
213        /**
214         * Getter for the SubmitterVersion term
215         * @return The SubmitterVersion Term
216         */
217        public static ComparableTerm getSubmitterVersionTerm() {
218            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("SubmitterVersion");
219        }
220        
221        /**
222         * Getter for the SubmitterWgsVersion term
223         * @return The SubmitterWgsVersion Term
224         */
225        public static ComparableTerm getSubmitterWgsVersionTerm() {
226            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("SubmitterWgsVersion");
227        }
228        
229        /**
230         * Getter for the Status term
231         * @return The Status Term
232         */
233        public static ComparableTerm getStatusTerm() {
234            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("Status");
235        }
236        
237        /**
238         * Getter for the StatusDate term
239         * @return The StatusDate Term
240         */
241        public static ComparableTerm getStatusDateTerm() {
242            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("StatusDate");
243        }
244        
245        /**
246         * Getter for the ProjectAccession term
247         * @return The ProjectAccession Term
248         */
249        public static ComparableTerm getProjectAccessionTerm() {
250            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("ProjectAccession");
251        }
252        
253        /**
254         * Getter for the EMBLxml term
255         * @return The EMBLxml Term
256         */
257        public static ComparableTerm getEMBLxmlTerm() {
258            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("EMBLxml");
259        }
260        
261        /**
262         * Getter for the Ensembl-specific 'dataClass' term
263         * @return The data class Term
264         */
265        public static ComparableTerm getDataClassTerm() {
266            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("dataClass");
267        }
268    }
269    
270    /**
271     * {@inheritDoc}
272     * A file is in EMBLxml format if the second XML line contains the phrase "http://www.ebi.ac.uk/schema/EMBL_schema.xsd".
273     */
274    @Override
275    public boolean canRead(File file) throws IOException {
276        BufferedReader br = new BufferedReader(new FileReader(file));
277        br.readLine(); // skip first line
278        String secondLine = br.readLine();
279        boolean readable = secondLine!=null && xmlSchema.matcher(secondLine).matches(); // check on second line
280        br.close();
281        return readable;
282    }
283    
284    /**
285     * {@inheritDoc}
286     * Always returns a DNA tokenizer.
287     */
288    @Override
289    public SymbolTokenization guessSymbolTokenization(File file) throws IOException {
290        return RichSequence.IOTools.getDNAParser();
291    }
292    
293    /**
294     * {@inheritDoc}
295     * A stream is in EMBLxml format if the second XML line contains the phrase "http://www.ebi.ac.uk/schema/EMBL_schema.xsd".
296     */
297    public boolean canRead(BufferedInputStream stream) throws IOException {
298        stream.mark(2000); // some streams may not support this
299        BufferedReader br = new BufferedReader(new InputStreamReader(stream));
300        br.readLine(); // skip first line
301        String secondLine = br.readLine();
302        boolean readable = secondLine!=null && xmlSchema.matcher(secondLine).matches(); // check on second line
303        // don't close the reader as it'll close the stream too.
304        // br.close();
305        stream.reset();
306        return readable;
307    }
308    
309    /**
310     * {@inheritDoc}
311     * Always returns a DNA tokenizer.
312     */
313    public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException {
314        return RichSequence.IOTools.getDNAParser();
315    }
316    
317    /**
318     * {@inheritDoc}
319     */
320    public boolean readSequence(BufferedReader reader,
321            SymbolTokenization symParser,
322            SeqIOListener listener)
323            throws IllegalSymbolException, IOException, ParseException {
324        if (!(listener instanceof RichSeqIOListener)) throw new IllegalArgumentException("Only accepting RichSeqIOListeners today");
325        return this.readRichSequence(reader,symParser,(RichSeqIOListener)listener,null);
326    }
327    
328    /**
329     * {@inheritDoc}
330     */
331    public boolean readRichSequence(BufferedReader reader,
332            SymbolTokenization symParser,
333            RichSeqIOListener rlistener,
334            Namespace ns)
335            throws IllegalSymbolException, IOException, ParseException {
336        
337        try {
338            DefaultHandler m_handler = new EMBLxmlHandler(this,symParser,rlistener,ns);
339            return XMLTools.readXMLChunk(reader, m_handler, ENTRY_TAG);
340        } catch (ParserConfigurationException e) {
341            throw new ParseException(e);
342        } catch (SAXException e) {
343            throw new ParseException(e);
344        }
345    }
346    
347    private PrintWriter pw;
348    private XMLWriter xml;
349    
350    /**
351     * {@inheritDoc}
352     */
353    public void beginWriting() throws IOException {
354        // make an XML writer
355        pw = new PrintWriter(this.getPrintStream());
356        xml = new PrettyXMLWriter(pw);
357        xml.printRaw("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>");
358        xml.openTag(ENTRY_GROUP_TAG);
359        xml.attribute("xmlns:ebi", "http://www.ebi.ac.uk/embl/schema");
360        xml.attribute("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance");
361        xml.attribute("xsi:noNamespaceSchemaLocation","http://www.ebi.ac.uk/embl/schema/EMBL_Services_V1.1.xsd");
362    }
363    
364    /**
365     * {@inheritDoc}
366     */
367    public void finishWriting() throws IOException {
368        xml.closeTag(ENTRY_GROUP_TAG);
369        pw.flush();
370    }
371    
372    /**
373     * {@inheritDoc}
374     */
375    public void writeSequence(Sequence seq, PrintStream os) throws IOException {
376        if (this.getPrintStream()==null) this.setPrintStream(this.getPrintStream());
377        this.writeSequence(seq, RichObjectFactory.getDefaultNamespace());
378    }
379    
380    /**
381     * {@inheritDoc}
382     */
383    public void writeSequence(Sequence seq, String format, PrintStream os) throws IOException {
384        if (this.getPrintStream()==null) this.setPrintStream(this.getPrintStream());
385        if (!format.equals(this.getDefaultFormat())) throw new IllegalArgumentException("Unknown format: "+format);
386        this.writeSequence(seq, RichObjectFactory.getDefaultNamespace());
387    }
388    
389    /**
390     * {@inheritDoc}
391     * Namespace is ignored as EMBLxml has no concept of it.
392     */
393    public void writeSequence(Sequence seq, Namespace ns) throws IOException {
394        RichSequence rs;
395        try {
396            if (seq instanceof RichSequence) rs = (RichSequence)seq;
397            else rs = RichSequence.Tools.enrich(seq);
398        } catch (ChangeVetoException e) {
399            IOException e2 = new IOException("Unable to enrich sequence");
400            e2.initCause(e);
401            throw e2;
402        }
403                
404        Set<Note> notes = rs.getNoteSet();
405        List accessions = new ArrayList();
406        List projAccessions = new ArrayList();
407        List kws = new ArrayList();
408        List organelles = new ArrayList();
409        String cdat = null;
410        String udat = null;
411        String crel = null;
412        String urel = null;
413        String dataClass = null;
414        String moltype = rs.getAlphabet().getName();
415        String subWgsVer = null;
416        String subAcc = null;
417        String subVer = null;
418        String status = null;
419        String statusDate = null;
420        for (Iterator<Note> i = notes.iterator(); i.hasNext();) {
421            Note n = i.next();
422            if (n.getTerm().equals(Terms.getDateCreatedTerm())) cdat=n.getValue();
423            else if (n.getTerm().equals(Terms.getDateUpdatedTerm())) udat=n.getValue();
424            else if (n.getTerm().equals(Terms.getRelCreatedTerm())) crel=n.getValue();
425            else if (n.getTerm().equals(Terms.getRelUpdatedTerm())) urel=n.getValue();
426            else if (n.getTerm().equals(Terms.getMolTypeTerm())) moltype=n.getValue();
427            else if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) accessions.add(n.getValue());
428            else if (n.getTerm().equals(Terms.getProjectAccessionTerm())) projAccessions.add(n.getValue());
429            else if (n.getTerm().equals(Terms.getOrganelleTerm())) organelles.add(n.getValue());
430            else if (n.getTerm().equals(Terms.getKeywordTerm())) kws.add(n.getValue());
431            else if (n.getTerm().equals(Terms.getDataClassTerm())) dataClass = n.getValue();
432            else if (n.getTerm().equals(Terms.getSubmitterAccessionTerm())) subAcc = n.getValue();
433            else if (n.getTerm().equals(Terms.getSubmitterVersionTerm())) subVer = n.getValue();
434            else if (n.getTerm().equals(Terms.getSubmitterWgsVersionTerm())) subWgsVer = n.getValue();
435            else if (n.getTerm().equals(Terms.getStatusTerm())) status = n.getValue();
436            else if (n.getTerm().equals(Terms.getStatusDateTerm())) statusDate = n.getValue();
437        }
438        
439        xml.openTag(ENTRY_TAG);
440        xml.attribute(ENTRY_ACCESSION_ATTR,rs.getAccession());
441        xml.attribute(ENTRY_TAX_DIVISION_ATTR,rs.getDivision());
442        xml.attribute(ENTRY_DATACLASS_ATTR,dataClass);
443        xml.attribute(ENTRY_CREATED_ATTR,cdat==null?udat:cdat);
444        xml.attribute(ENTRY_RELCREATED_ATTR,crel==null?"0":crel);
445        xml.attribute(ENTRY_UPDATED_ATTR,udat);
446        xml.attribute(ENTRY_RELUPDATED_ATTR,urel==null?"0":urel);
447        xml.attribute(ENTRY_VER_ATTR,""+rs.getVersion());
448        if (subAcc!=null)
449            xml.attribute(ENTRY_SUBACC_ATTR,subAcc);
450        if (subVer!=null)
451            xml.attribute(ENTRY_SUBVER_ATTR,subVer);
452        if (subWgsVer!=null)
453            xml.attribute(ENTRY_SUBWGSVER_ATTR,subWgsVer);
454        if (status!=null)
455            xml.attribute(ENTRY_STATUS_ATTR,status);
456        if (statusDate!=null)
457            xml.attribute(ENTRY_STATUS_DATE_ATTR,statusDate);
458        
459        for (Iterator i = accessions.iterator(); i.hasNext(); ) {
460            xml.openTag(SEC_ACC_TAG);
461            xml.print((String)i.next());
462            xml.closeTag(SEC_ACC_TAG);
463        }
464        
465        for (Iterator i = projAccessions.iterator(); i.hasNext(); ) {
466            xml.openTag(PROJ_ACC_TAG);
467            xml.print((String)i.next());
468            xml.closeTag(PROJ_ACC_TAG);
469        }
470        
471        xml.openTag(DESC_TAG);
472        xml.print(rs.getDescription());
473        xml.closeTag(DESC_TAG);
474        
475        for (Iterator i = kws.iterator(); i.hasNext(); ) {
476            xml.openTag(KEYWORD_TAG);
477            xml.print((String)i.next());
478            xml.closeTag(KEYWORD_TAG);
479        }
480        
481        for (Iterator i = rs.getRankedDocRefs().iterator(); i.hasNext(); ) {
482            RankedDocRef rdr = (RankedDocRef)i.next();
483            DocRef dr = rdr.getDocumentReference();
484            
485            xml.openTag(REFERENCE_TAG);
486            
487            xml.openTag(CITATION_TAG);
488            xml.attribute(CITATION_ID_ATTR,""+rdr.getRank());
489            xml.attribute(CITATION_TYPE_ATTR,"journal article");
490            
491            CrossRef cr = dr.getCrossref();
492            if (cr!=null) {
493                xml.openTag(DBREFERENCE_TAG);
494                xml.attribute(DBREF_DB_ATTR,cr.getDbname());
495                xml.attribute(DBREF_PRIMARY_ATTR,cr.getAccession());
496                if (!cr.getNoteSet().isEmpty()) {
497                    for (Iterator<Note> j = cr.getNoteSet().iterator(); j.hasNext(); ) {
498                        Note n = j.next();
499                        if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) {
500                            xml.attribute(DBREF_SEC_ATTR,n.getValue());
501                            break;
502                        }
503                    }
504                }
505                xml.closeTag(DBREFERENCE_TAG);
506            }
507            
508            List<DocRefAuthor> auths = dr.getAuthorList();
509            
510            for (Iterator<DocRefAuthor> j = auths.iterator(); j.hasNext(); ) {
511                DocRefAuthor a = j.next();
512                if (a.isConsortium()) {
513                    xml.openTag(CONSORTIUM_TAG);
514                    xml.print(a.getName());
515                    xml.closeTag(CONSORTIUM_TAG);
516                    j.remove();
517                }
518            }
519            
520            if (dr.getTitle()!=null) {
521                xml.openTag(TITLE_TAG);
522                xml.print(dr.getTitle());
523                xml.closeTag(TITLE_TAG);
524            }
525            
526            for (Iterator<DocRefAuthor> j = auths.iterator(); j.hasNext(); ) {
527                DocRefAuthor a = j.next();
528                if (a.isEditor()) {
529                    xml.openTag(EDITOR_TAG);
530                    xml.print(a.getName());
531                    xml.closeTag(EDITOR_TAG);
532                } else {
533                    xml.openTag(AUTHOR_TAG);
534                    xml.print(a.getName());
535                    xml.closeTag(AUTHOR_TAG);
536                }
537            }
538            
539            xml.openTag(LOCATOR_TAG);
540            xml.print(dr.getLocation());
541            xml.closeTag(LOCATOR_TAG);
542            xml.closeTag(CITATION_TAG);
543            
544            xml.openTag(CITATION_LOCATION_TAG);     
545            Integer rstart = rdr.getStart();
546            if (rstart==null) rstart = new Integer(1);
547            Integer rend = rdr.getEnd();
548            if (rend==null) rend = new Integer(rs.length());
549            xml.attribute(REF_POS_BEGIN_ATTR,""+rstart);
550            xml.attribute(REF_POS_END_ATTR,""+rend);
551            if (dr.getRemark()!=null) {
552                xml.openTag(COMMENT_TAG);
553                xml.print(dr.getRemark());
554                xml.closeTag(COMMENT_TAG);
555            }
556            xml.closeTag(CITATION_LOCATION_TAG);
557            
558            xml.closeTag(REFERENCE_TAG);
559        }
560        
561        for (Iterator<RankedCrossRef> i = rs.getRankedCrossRefs().iterator(); i.hasNext(); ) {
562            RankedCrossRef rcr = i.next();
563            CrossRef cr = rcr.getCrossRef();
564            
565            xml.openTag(DBREFERENCE_TAG);
566            xml.attribute(DBREF_DB_ATTR,cr.getDbname());
567            xml.attribute(DBREF_PRIMARY_ATTR,cr.getAccession());
568            
569            if (!cr.getNoteSet().isEmpty()) {
570                for (Iterator<Note> j = cr.getNoteSet().iterator(); j.hasNext(); ) {
571                    Note n = j.next();
572                    if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) {
573                        xml.attribute(DBREF_SEC_ATTR,n.getValue());
574                        break;
575                    }
576                }
577            }
578            
579            xml.closeTag(DBREFERENCE_TAG);
580        }
581        
582        for (Iterator<Comment> i = rs.getComments().iterator(); i.hasNext(); ) {
583            xml.openTag(COMMENT_TAG);
584            xml.println(i.next().getComment());
585            xml.closeTag(COMMENT_TAG);
586        }
587        
588        NCBITaxon tax = rs.getTaxon();
589        for (Iterator i = rs.getFeatureSet().iterator(); i.hasNext(); ) {
590            RichFeature f = (RichFeature)i.next();
591            xml.openTag(FEATURE_TAG);
592            xml.attribute(FEATURE_NAME_ATTR,f.getTypeTerm().getName());
593            
594            // display organism on source feature only
595            if (f.getTypeTerm().getName().equals("source") && tax!=null) {
596                xml.openTag(ORGANISM_TAG);
597                
598                String[] parts = tax.getDisplayName().split("(\\(|\\))");
599                xml.openTag(SCINAME_TAG);
600                xml.print(parts[0].trim());
601                xml.closeTag(SCINAME_TAG);
602                if (parts.length>1) {
603                    xml.openTag(COMNAME_TAG);
604                    xml.print(parts[1].trim());
605                    xml.closeTag(COMNAME_TAG);
606                }
607                
608                xml.openTag(TAXID_TAG);
609                xml.print(""+tax.getNCBITaxID());
610                xml.closeTag(TAXID_TAG);
611                
612                String hierarchy = tax.getNameHierarchy();
613                hierarchy = hierarchy.substring(0,hierarchy.length()-1); // chomp "."
614                if (hierarchy.length()>0) {
615                    parts = hierarchy.split(";");
616                    xml.openTag(LINEAGE_TAG);
617                    for (int j = 0; j < parts.length; j++) {
618                        xml.openTag(TAXON_TAG);
619                        xml.print(parts[j].trim());
620                        xml.closeTag(TAXON_TAG);
621                    }
622                    xml.closeTag(LINEAGE_TAG);
623                }
624                
625                for (final Iterator j = organelles.iterator(); j.hasNext(); ) {
626                    final String organelle = (String)j.next();
627                    xml.openTag(ORGANELLE_TAG);
628                    xml.print(organelle);
629                    xml.closeTag(ORGANELLE_TAG);
630                }
631                
632                xml.closeTag(ORGANISM_TAG);
633            }
634            
635            for (Iterator<RankedCrossRef> j = f.getRankedCrossRefs().iterator(); j.hasNext(); ) {
636                RankedCrossRef rcr = j.next();
637                CrossRef cr = rcr.getCrossRef();
638                
639                xml.openTag(DBREFERENCE_TAG);
640                xml.attribute(DBREF_DB_ATTR,cr.getDbname());
641                xml.attribute(DBREF_PRIMARY_ATTR,cr.getAccession());
642                
643                if (!cr.getNoteSet().isEmpty()) {
644                    for (Iterator<Note> k = cr.getNoteSet().iterator(); k.hasNext(); ) {
645                        Note n = k.next();
646                        if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) {
647                            xml.attribute(DBREF_SEC_ATTR,n.getValue());
648                            break;
649                        }
650                    }
651                }
652                
653                xml.closeTag(DBREFERENCE_TAG);
654            }
655            
656            for (Iterator<Note> j = f.getNoteSet().iterator(); j.hasNext();) {
657                Note n = j.next();
658                xml.openTag(QUALIFIER_TAG);
659                xml.attribute(QUALIFIER_NAME_ATTR,n.getTerm().getName());
660                if (n.getValue()!=null && !n.getValue().equals("")) {
661                        if (n.getTerm().getName().equalsIgnoreCase("translation")) {
662                                String[] lines = StringTools.wordWrap(n.getValue(), "\\s+", this.getLineWidth());
663                                for (int k = 0; k < lines.length; k++) xml.println(lines[k]);
664                        } else {        
665                                xml.print(n.getValue());
666                        }
667                }       
668                xml.closeTag(QUALIFIER_TAG);
669            }
670            
671            // make it easy for ourselves by flattening into a single compound location
672            RichLocation rle = (RichLocation)f.getLocation();
673            Collection locElements = RichLocation.Tools.flatten(rle);
674            xml.openTag(LOCATION_TAG);
675            xml.attribute(LOCATION_TYPE_ATTR,(locElements.size()>1?rle.getTerm().getName():"single"));
676            xml.attribute(LOCATION_COMPL_ATTR,"false");
677            for (Iterator j = locElements.iterator(); j.hasNext(); ) {
678                RichLocation rl = (RichLocation)j.next();
679                xml.openTag(LOCATION_ELEMENT_TAG);
680                
681                if (rl.getStrand().equals(Strand.NEGATIVE_STRAND)) {
682                    xml.attribute(LOC_ELEMENT_COMPL_ATTR,"true");
683                } else {
684                    xml.attribute(LOC_ELEMENT_COMPL_ATTR,"false");
685                }
686                
687                if (rl.getCrossRef()!=null) {
688                    xml.attribute(LOC_ELEMENT_ACC_ATTR,rl.getCrossRef().getAccession());
689                    xml.attribute(LOC_ELEMENT_VER_ATTR,""+rl.getCrossRef().getVersion());
690                }
691                
692                Position start = rl.getMinPosition();
693                // EMBLxml does not support fuzzy locations so we only ever
694                // use the start coordinate.
695                
696                // output first base only
697                xml.attribute(LOC_ELEMENT_TYPE_ATTR,"site");
698                    
699                xml.openTag(BASEPOSITION_TAG);
700                if (start.getFuzzyStart()) xml.attribute(BASEPOSITION_TYPE_ATTR,"<");
701                else if (start.getFuzzyEnd()) xml.attribute(BASEPOSITION_TYPE_ATTR,"<");
702                else xml.attribute(BASEPOSITION_TYPE_ATTR,"simple");
703                xml.print(""+start.getStart());
704                xml.closeTag(BASEPOSITION_TAG);
705                
706                xml.closeTag(LOCATION_ELEMENT_TAG);
707            }
708
709            xml.closeTag(LOCATION_TAG);
710            
711            xml.closeTag(FEATURE_TAG);
712        }
713        
714        xml.openTag(SEQUENCE_TAG);
715        xml.attribute(SEQUENCE_TYPE_ATTR,moltype);
716        xml.attribute(SEQUENCE_LENGTH_ATTR,""+rs.length());
717        xml.attribute(SEQUENCE_TOPOLOGY_ATTR,rs.getCircular()?"circular":"linear");
718        xml.attribute(SEQUENCE_VER_ATTR,""+rs.getSeqVersion().intValue());
719        String[] lines = StringTools.wordWrap(rs.seqString(), "\\s+", this.getLineWidth());
720        for (int i = 0; i < lines.length; i ++) xml.println(lines[i]);
721        xml.closeTag(SEQUENCE_TAG);
722        
723        xml.closeTag(ENTRY_TAG);
724        
725        pw.flush();
726    }
727    
728    /**
729     * {@inheritDoc}
730     */
731    public String getDefaultFormat() {
732        return EMBLXML_FORMAT;
733    }
734    
735    // SAX event handler for parsing http://www.ebi.ac.uk/embl/Documentation/DTD/EMBL_dtd.txt
736    private class EMBLxmlHandler extends DefaultHandler {
737        
738        private RichSequenceFormat parent;
739        private SymbolTokenization symParser;
740        private RichSeqIOListener rlistener;
741        private Namespace ns;
742        private StringBuffer m_currentString;
743        
744        private NCBITaxon tax;
745        private String accession;
746        private RichFeature.Template templ;
747        private String currFeatQual;
748        private String currRefLocation;
749        private List currRefAuthors;
750        private String currRefTitle;
751        private Map currNames = new TreeMap();
752        private int currRefStart;
753        private int currRefEnd;
754        private int currRefRank;
755        private int currLocBrackets;
756        private int currLocElemBrackets;
757        private StringBuffer currLocStr;
758        private String currBaseType;
759        private boolean firstBase; // oooh err!
760        private boolean firstLocationElement;
761        private List currDBXrefs = new ArrayList();
762        private List currComments = new ArrayList();
763        private Map currQuals = new LinkedHashMap();
764        
765        // construct a new handler that will populate the given list of sequences
766        private EMBLxmlHandler(RichSequenceFormat parent,
767                SymbolTokenization symParser,
768                RichSeqIOListener rlistener,
769                Namespace ns) {
770            this.parent = parent;
771            this.symParser = symParser;
772            this.rlistener = rlistener;
773            this.ns = ns;
774            this.m_currentString = new StringBuffer();
775        }
776        
777        // process an opening tag
778        @Override
779        public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
780            if (qName.equals(ENTRY_TAG)) {
781                try {
782                    rlistener.startSequence();
783                    if (ns==null) ns=RichObjectFactory.getDefaultNamespace();
784                    rlistener.setNamespace(ns);
785                    for (int i = 0; i < attributes.getLength(); i++) {
786                        String name = attributes.getQName(i);
787                        String val = attributes.getValue(i);
788                        if (name.equals(ENTRY_ACCESSION_ATTR)) {
789                            accession = val;
790                            rlistener.setAccession(accession);
791                            rlistener.setName(accession);
792                        } else if (name.equals(ENTRY_TAX_DIVISION_ATTR)) rlistener.setDivision(val);
793                        else if (name.equals(ENTRY_DATACLASS_ATTR)) rlistener.addSequenceProperty(Terms.getDataClassTerm(),val);
794                        else if (name.equals(ENTRY_CREATED_ATTR)) rlistener.addSequenceProperty(Terms.getDateCreatedTerm(),val);
795                        else if (name.equals(ENTRY_UPDATED_ATTR)) rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(),val);
796                        else if (name.equals(ENTRY_RELCREATED_ATTR)) rlistener.addSequenceProperty(Terms.getRelCreatedTerm(),val);
797                        else if (name.equals(ENTRY_RELUPDATED_ATTR)) rlistener.addSequenceProperty(Terms.getRelUpdatedTerm(),val);
798                        else if (name.equals(ENTRY_VER_ATTR)) rlistener.setVersion(Integer.parseInt(val));
799                        else if (name.equals(ENTRY_SUBACC_ATTR)) rlistener.addSequenceProperty(Terms.getSubmitterAccessionTerm(),val);
800                        else if (name.equals(ENTRY_SUBVER_ATTR)) rlistener.addSequenceProperty(Terms.getSubmitterVersionTerm(),val);
801                        else if (name.equals(ENTRY_SUBWGSVER_ATTR)) rlistener.addSequenceProperty(Terms.getSubmitterWgsVersionTerm(),val);
802                        else if (name.equals(ENTRY_STATUS_ATTR)) rlistener.addSequenceProperty(Terms.getStatusTerm(),val);
803                        else if (name.equals(ENTRY_STATUS_DATE_ATTR)) rlistener.addSequenceProperty(Terms.getStatusDateTerm(),val);
804                    }
805                    currNames.clear();
806                    currComments.clear();
807                    currDBXrefs.clear();
808                } catch (ParseException e) {
809                    throw new SAXException(e);
810                }
811            }
812            
813            else if (qName.equals(REFERENCE_TAG) && !this.parent.getElideReferences()) {
814                currRefLocation = null;
815                currRefAuthors = new ArrayList();
816                currRefTitle = null;
817                currRefStart = -999;
818                currRefEnd = -999;
819                currRefRank = 0;
820                currDBXrefs.clear();
821                currComments.clear();
822            } else if (qName.equals(CITATION_LOCATION_TAG) && !this.parent.getElideReferences()) {
823                for (int i = 0; i < attributes.getLength(); i++) {
824                    String name = attributes.getQName(i);
825                    String val = attributes.getValue(i);
826                    if (name.equals(REF_POS_BEGIN_ATTR)) currRefStart = Integer.parseInt(val);
827                    else if (name.equals(REF_POS_END_ATTR)) currRefEnd = Integer.parseInt(val);
828                }
829            } else if (qName.equals(CITATION_TAG) && !this.parent.getElideReferences()) {
830                StringBuffer currRef = new StringBuffer();
831                for (int i = 0; i < attributes.getLength(); i++) {
832                    String name = attributes.getQName(i);
833                    String val = attributes.getValue(i);
834                    if (name.equals(CITATION_ID_ATTR)) currRefRank = Integer.parseInt(val);
835                    // combine everything else into a fake reference to use if locator is a no-show
836                    else if (!name.equals(CITATION_TYPE_ATTR)) {
837                        if (currRef.length()>0) currRef.append(" ");
838                        currRef.append(val);
839                    }
840                }
841                currRefLocation = currRef.toString();
842            }
843            
844            else if (qName.equals(DBREFERENCE_TAG)) {
845                String db = null;
846                String primary = null;
847                String secondary = null;
848                for (int i = 0; i < attributes.getLength(); i++) {
849                    String name = attributes.getQName(i);
850                    String val = attributes.getValue(i);
851                    if (name.equals(DBREF_DB_ATTR)) db = val;
852                    else if (name.equals(DBREF_PRIMARY_ATTR)) primary = val;
853                    else if (name.equals(DBREF_SEC_ATTR)) secondary = val;
854                }
855                CrossRef dbx = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{db, primary, new Integer(0)});
856                if (secondary!=null) {
857                    Note note = new SimpleNote(Terms.getAdditionalAccessionTerm(),secondary,0);
858                    try {
859                        dbx.getRichAnnotation().addNote(note);
860                    } catch (ChangeVetoException ce) {
861                        SAXException pe = new SAXException("Could not annotate identifier terms");
862                        pe.initCause(ce);
863                        throw pe;
864                    }
865                }
866                currDBXrefs.add(dbx);
867            }
868            
869            else if (qName.equals(FEATURE_TAG) && !this.parent.getElideFeatures()) {
870                templ = new RichFeature.Template();
871                templ.annotation = new SimpleRichAnnotation();
872                templ.sourceTerm = Terms.getEMBLxmlTerm();
873                templ.featureRelationshipSet = new TreeSet();
874                templ.rankedCrossRefs = new TreeSet();
875                for (int i = 0; i < attributes.getLength(); i++) {
876                    String name = attributes.getQName(i);
877                    String val = attributes.getValue(i);
878                    if (name.equals(FEATURE_NAME_ATTR)) templ.typeTerm = RichObjectFactory.getDefaultOntology().getOrCreateTerm(val);
879                }
880                currLocStr = new StringBuffer();
881                currDBXrefs.clear();
882                currQuals.clear();
883            } else if (qName.equals(QUALIFIER_TAG) && !this.parent.getElideFeatures()) {
884                for (int i = 0; i < attributes.getLength(); i++) {
885                    String name = attributes.getQName(i);
886                    String val = attributes.getValue(i);
887                    if (name.equals(QUALIFIER_NAME_ATTR)) currFeatQual = val;
888                }
889            } else if (qName.equals(LOCATION_TAG) && !this.parent.getElideFeatures()) {
890                currLocBrackets = 0;
891                for (int i = 0; i < attributes.getLength(); i++) {
892                    String name = attributes.getQName(i);
893                    String val = attributes.getValue(i);
894                    if (name.equals(LOCATION_TYPE_ATTR) && !val.equalsIgnoreCase("single")) {
895                        // open a bracket just in case
896                        currLocStr.append(val);
897                        currLocStr.append("(");
898                        currLocBrackets++;
899                    } else if (name.equals(LOCATION_COMPL_ATTR) && val.equalsIgnoreCase("true")) {
900                        currLocStr.append("complement");
901                        currLocStr.append("(");
902                        currLocBrackets++;
903                    }
904                }
905                firstLocationElement = true;
906            } else if (qName.equals(LOCATION_ELEMENT_TAG) && !this.parent.getElideFeatures()) {
907                String currAcc = null;
908                String currVer = null;
909                if (!firstLocationElement) currLocStr.append(",");
910                for (int i = 0; i < attributes.getLength(); i++) {
911                    String name = attributes.getQName(i);
912                    String val = attributes.getValue(i);
913                    if (name.equals(LOCATION_COMPL_ATTR) && val.equalsIgnoreCase("true")) {
914                        currLocStr.append("complement");
915                        currLocStr.append("(");
916                        currLocElemBrackets++;
917                    } else if (name.equals(LOC_ELEMENT_ACC_ATTR)) currAcc = val;
918                    else if (name.equals(LOC_ELEMENT_VER_ATTR)) currVer = val;
919                }
920                if (currAcc!=null) {
921                    currLocStr.append(currAcc);
922                    if (currVer!=null) {
923                        currLocStr.append(".");
924                        currLocStr.append(currVer);
925                    }
926                    currLocStr.append(":");
927                }
928                firstBase = true;
929            } else if (qName.equals(BASEPOSITION_TAG) && !this.parent.getElideFeatures()) {
930                for (int i = 0; i < attributes.getLength(); i++) {
931                    String name = attributes.getQName(i);
932                    String val = attributes.getValue(i);
933                    if (name.equals(BASEPOSITION_TYPE_ATTR)) currBaseType = val;
934                }
935            }
936            
937            else if (qName.equals(CONTIG_TAG))  {
938                String message = ParseException.newMessage(this.getClass(),accession,"not set", "Unable to handle contig assemblies just yet", qName);
939                ParseException e = new ParseException(message);
940                SAXException pe = new SAXException("Could not set contig properties");
941                pe.initCause(e);
942                throw pe;
943            }
944            
945            else if (qName.equals(SEQUENCE_TAG)) {
946                try {
947                    for (int i = 0; i < attributes.getLength(); i++) {
948                        String name = attributes.getQName(i);
949                        String val = attributes.getValue(i);
950                        if (name.equals(SEQUENCE_TYPE_ATTR)) rlistener.addSequenceProperty(Terms.getMolTypeTerm(),val);
951                        else if (name.equals(SEQUENCE_VER_ATTR)) rlistener.setSeqVersion(val);
952                        else if (name.equals(SEQUENCE_TOPOLOGY_ATTR) && val.equalsIgnoreCase("circular")) rlistener.setCircular(true);
953                    }
954                } catch (ParseException e) {
955                    SAXException pe = new SAXException("Could not set sequence properties");
956                    pe.initCause(e);
957                    throw pe;
958                }
959            }
960        }
961        
962        // process a closing tag - we will have read the text already
963        @Override
964        public void endElement(String uri, String localName, String qName) throws SAXException {
965            String val = this.m_currentString.toString().trim();
966            
967            try {
968                if (qName.equals(SEC_ACC_TAG)) {
969                    rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),val);
970                } else if (qName.equals(PROJ_ACC_TAG)) {
971                    rlistener.addSequenceProperty(Terms.getProjectAccessionTerm(),val);
972                } else if (qName.equals(ORGANELLE_TAG)) {
973                    rlistener.addSequenceProperty(Terms.getOrganelleTerm(),val);
974                } else if (qName.equals(DESC_TAG)) {
975                    rlistener.setDescription(val);
976                } else if (qName.equals(KEYWORD_TAG)) {
977                    rlistener.addSequenceProperty(Terms.getKeywordTerm(), val);
978                } else if (qName.equals(COMMENT_TAG)) {
979                    currComments.add(val);
980                }
981                
982                else if (qName.equals(TITLE_TAG)) {
983                    currRefTitle = val;
984                } else if (qName.equals(AUTHOR_TAG)) {
985                    currRefAuthors.add(new SimpleDocRefAuthor(val,false,false));
986                } else if (qName.equals(EDITOR_TAG)) {
987                    currRefAuthors.add(new SimpleDocRefAuthor(val,false,true));
988                } else if (qName.equals(CONSORTIUM_TAG)) {
989                    currRefAuthors.add(new SimpleDocRefAuthor(val,true,false));
990                } else if (qName.equals(LOCATOR_TAG)) {
991                    currRefLocation = val;
992                } else if (qName.equals(REFERENCE_TAG) && !this.parent.getElideReferences()) {
993                    // do the crossrefs
994                    CrossRef useForDocRef = null;
995                    for (Iterator j = currDBXrefs.iterator(); j.hasNext();) {
996                        CrossRef dbx = (CrossRef)j.next();
997                        RankedCrossRef rdbx = new SimpleRankedCrossRef(dbx,0);
998                        rlistener.setRankedCrossRef(rdbx);
999                        if (useForDocRef==null) useForDocRef = dbx;
1000                        else {
1001                            // medline gets priority, then pubmed - if multiple, use last
1002                            if (dbx.getDbname().equalsIgnoreCase(Terms.MEDLINE_KEY) ||
1003                                    (dbx.getDbname().equalsIgnoreCase(Terms.PUBMED_KEY) &&
1004                                    !useForDocRef.getDbname().equalsIgnoreCase(Terms.MEDLINE_KEY))) {
1005                                useForDocRef = dbx;
1006                            }
1007                        }
1008                    }
1009                    // do the comment - will only be one, if any
1010                    String currRefRemark = null;
1011                    if (currComments.size()>0) currRefRemark = (String)currComments.iterator().next();
1012                    // create the docref object
1013                    try {
1014                        DocRef dr = (DocRef)RichObjectFactory.getObject(SimpleDocRef.class,new Object[]{currRefAuthors,currRefLocation,currRefTitle});
1015                        // assign the pubmed or medline to the docref - medline gets priority
1016                        if (useForDocRef!=null) dr.setCrossref(useForDocRef);
1017                        // assign the remarks
1018                        dr.setRemark(currRefRemark);
1019                        // assign the docref to the bioentry
1020                        RankedDocRef rdr = new SimpleRankedDocRef(dr,
1021                                (currRefStart != -999 ? new Integer(currRefStart) : null),
1022                                (currRefEnd != -999 ? new Integer(currRefEnd) : null),
1023                                currRefRank);
1024                        rlistener.setRankedDocRef(rdr);
1025                    } catch (ChangeVetoException e) {
1026                        throw new ParseException(e);
1027                    }
1028                    currDBXrefs.clear();
1029                    currComments.clear();
1030                }
1031                
1032                else if (qName.equals(LOCATION_TAG) && !this.parent.getElideFeatures()) {
1033                    while (currLocBrackets-->0) currLocStr.append(")"); // close the location groups
1034                    String tidyLocStr = currLocStr.toString().replaceAll("\\s+","");
1035                    templ.location = GenbankLocationParser.parseLocation(ns, accession, tidyLocStr);
1036                } else if (qName.equals(LOCATION_ELEMENT_TAG) && !this.parent.getElideFeatures()) {
1037                    while (currLocElemBrackets-->0) currLocStr.append(")"); // close the location groups
1038                    firstLocationElement = false;
1039                } else if (qName.equals(BASEPOSITION_TAG) && !this.parent.getElideFeatures()) {
1040                    if (!firstBase) currLocStr.append("..");
1041                    // left angle bracket, right angle bracket, simple, fuzzy
1042                    if (currBaseType.equals("<")) {
1043                        currLocStr.append("<");
1044                        currLocStr.append(val);
1045                    } else if (currBaseType.equals(">")) {
1046                        currLocStr.append(val);
1047                        currLocStr.append(">");
1048                    } else if (currBaseType.equalsIgnoreCase("simple")) {
1049                        currLocStr.append(val);
1050                    }
1051                    firstBase = false;
1052                } else if (qName.equals(QUALIFIER_TAG) && !this.parent.getElideFeatures()) {
1053                    currQuals.put(currFeatQual,val);
1054                } else if (qName.equals(FEATURE_TAG) && !this.parent.getElideFeatures()) {
1055                    // start the feature
1056                    rlistener.startFeature(templ);
1057                    // assign qualifiers
1058                    for (Iterator j = currQuals.keySet().iterator(); j.hasNext(); ) {
1059                        String qualName = (String)j.next();
1060                        String qualVal = (String)currQuals.get(qualName);
1061                        if (qualName.equalsIgnoreCase("translation")) {
1062                            // strip spaces from sequence
1063                            qualVal = qualVal.replaceAll("\\s+","");
1064                        }
1065                        rlistener.addFeatureProperty(RichObjectFactory.getDefaultOntology().getOrCreateTerm(qualName),qualVal);
1066                    }
1067                    // do the crossrefs
1068                    int rcrossrefCount = 0;
1069                    for (Iterator j = currDBXrefs.iterator(); j.hasNext();) {
1070                        CrossRef dbx = (CrossRef)j.next();
1071                        RankedCrossRef rdbx = new SimpleRankedCrossRef(dbx, ++rcrossrefCount);
1072                        try {
1073                            rlistener.getCurrentFeature().addRankedCrossRef(rdbx);
1074                        } catch (ChangeVetoException ce) {
1075                            throw new ParseException(ce);
1076                        }
1077                    }
1078                    // end the feature
1079                    rlistener.endFeature();
1080                    currDBXrefs.clear();
1081                }
1082                
1083                else if (qName.equals(TAXID_TAG)) {
1084                    tax = (NCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class, new Object[]{Integer.valueOf(val)});
1085                    rlistener.setTaxon(tax);
1086                    for (Iterator j = currNames.keySet().iterator(); j.hasNext(); ) {
1087                        String nameClass = (String)j.next();
1088                        Set nameSet = (Set)currNames.get(nameClass);
1089                        try {
1090                            for (Iterator k = nameSet.iterator(); k.hasNext(); ) {
1091                                String name = (String)k.next();
1092                                tax.addName(nameClass,name);
1093                            }
1094                        } catch (ChangeVetoException ce) {
1095                            throw new ParseException(ce);
1096                        }
1097                    }
1098                    currNames.clear();
1099                } else if (qName.equals(SCINAME_TAG)) {
1100                    try {
1101                        if (tax==null) {
1102                            if (!currNames.containsKey(NCBITaxon.SCIENTIFIC)) currNames.put(NCBITaxon.SCIENTIFIC,new TreeSet());
1103                            ((Set)currNames.get(NCBITaxon.SCIENTIFIC)).add(val);
1104                        } else {
1105                            tax.addName(NCBITaxon.SCIENTIFIC,val);
1106                        }
1107                    } catch (ChangeVetoException ce) {
1108                        throw new ParseException(ce);
1109                    }
1110                } else if (qName.equals(COMNAME_TAG)) {
1111                    try {
1112                        if (tax==null) {
1113                            if (!currNames.containsKey(NCBITaxon.COMMON)) currNames.put(NCBITaxon.COMMON,new TreeSet());
1114                            ((Set)currNames.get(NCBITaxon.COMMON)).add(val);
1115                        } else {
1116                            tax.addName(NCBITaxon.COMMON,val);
1117                        }
1118                    } catch (ChangeVetoException ce) {
1119                        throw new ParseException(ce);
1120                    }
1121                }
1122                
1123                else if (qName.equals(SEQUENCE_TAG) && !this.parent.getElideSymbols()) {
1124                    try {
1125                        SymbolList sl = new SimpleSymbolList(symParser,
1126                                val.replaceAll("\\s+","").replaceAll("[\\.|~]","-"));
1127                        rlistener.addSymbols(symParser.getAlphabet(),
1128                                (Symbol[])(sl.toList().toArray(new Symbol[0])),
1129                                0, sl.length());
1130                    } catch (Exception e) {
1131                        throw new ParseException(e);
1132                    }
1133                }
1134                
1135                else if (qName.equals(ENTRY_TAG)) {
1136                    // do the comments
1137                    for (Iterator j = currComments.iterator(); j.hasNext();) {
1138                        rlistener.setComment((String)j.next());
1139                    }
1140                    // do the crossrefs
1141                    for (Iterator j = currDBXrefs.iterator(); j.hasNext();) {
1142                        CrossRef dbx = (CrossRef)j.next();
1143                        RankedCrossRef rdbx = new SimpleRankedCrossRef(dbx, 0);
1144                        rlistener.setRankedCrossRef(rdbx);
1145                    }
1146                    // end the sequence
1147                    rlistener.endSequence();
1148                    currComments.clear();
1149                    currDBXrefs.clear();
1150                }
1151                
1152            } catch (ParseException e) {
1153                throw new SAXException(e);
1154            }
1155            
1156            // drop old string
1157            this.m_currentString.setLength(0);
1158        }
1159        
1160        // process text inside tags
1161        @Override
1162        public void characters(char[] ch, int start, int length) {
1163            this.m_currentString.append(ch, start, length);
1164        }
1165    }
1166}
1167