001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojavax.bio.seq.io;
023
024import java.io.BufferedInputStream;
025import java.io.BufferedReader;
026import java.io.File;
027import java.io.FileReader;
028import java.io.IOException;
029import java.io.InputStreamReader;
030import java.io.PrintStream;
031import java.io.PrintWriter;
032import java.util.ArrayList;
033import java.util.Iterator;
034import java.util.List;
035import java.util.Map;
036import java.util.Set;
037import java.util.TreeMap;
038import java.util.TreeSet;
039import java.util.regex.Matcher;
040import java.util.regex.Pattern;
041
042import javax.xml.parsers.ParserConfigurationException;
043
044import org.biojava.bio.proteomics.MassCalc;
045import org.biojava.bio.seq.Sequence;
046import org.biojava.bio.seq.io.ParseException;
047import org.biojava.bio.seq.io.SeqIOListener;
048import org.biojava.bio.seq.io.SymbolTokenization;
049import org.biojava.bio.symbol.IllegalSymbolException;
050import org.biojava.bio.symbol.Location;
051import org.biojava.bio.symbol.SimpleSymbolList;
052import org.biojava.bio.symbol.Symbol;
053import org.biojava.bio.symbol.SymbolList;
054import org.biojava.utils.ChangeVetoException;
055import org.biojava.utils.xml.PrettyXMLWriter;
056import org.biojava.utils.xml.XMLWriter;
057import org.biojavax.Comment;
058import org.biojavax.CrossRef;
059import org.biojavax.DocRef;
060import org.biojavax.DocRefAuthor;
061import org.biojavax.Namespace;
062import org.biojavax.Note;
063import org.biojavax.RankedCrossRef;
064import org.biojavax.RankedDocRef;
065import org.biojavax.RichAnnotation;
066import org.biojavax.RichObjectFactory;
067import org.biojavax.SimpleCrossRef;
068import org.biojavax.SimpleDocRef;
069import org.biojavax.SimpleDocRefAuthor;
070import org.biojavax.SimpleNamespace;
071import org.biojavax.SimpleNote;
072import org.biojavax.SimpleRankedCrossRef;
073import org.biojavax.SimpleRankedDocRef;
074import org.biojavax.SimpleRichAnnotation;
075import org.biojavax.bio.seq.Position;
076import org.biojavax.bio.seq.RichFeature;
077import org.biojavax.bio.seq.RichLocation;
078import org.biojavax.bio.seq.RichSequence;
079import org.biojavax.bio.seq.io.UniProtCommentParser.Event;
080import org.biojavax.bio.seq.io.UniProtCommentParser.Interaction;
081import org.biojavax.bio.seq.io.UniProtCommentParser.Isoform;
082import org.biojavax.bio.taxa.NCBITaxon;
083import org.biojavax.bio.taxa.SimpleNCBITaxon;
084import org.biojavax.ontology.ComparableOntology;
085import org.biojavax.ontology.ComparableTerm;
086import org.biojavax.ontology.SimpleComparableOntology;
087import org.biojavax.utils.CRC64Checksum;
088import org.biojavax.utils.StringTools;
089import org.biojavax.utils.XMLTools;
090import org.xml.sax.Attributes;
091import org.xml.sax.SAXException;
092import org.xml.sax.helpers.DefaultHandler;
093
094/**
095 * Format reader for UniProtXML files. This version of UniProtXML format will generate
096 * and write RichSequence objects. Loosely Based on code from the old, deprecated,
097 * org.biojava.bio.seq.io.GenbankXmlFormat object.
098 *
099 * Understands http://www.ebi.uniprot.org/support/docs/uniprot.xsd
100 *
101 * @author Alan Li (code based on his work)
102 * @author Richard Holland
103 * @since 1.5
104 */
105public class UniProtXMLFormat extends RichSequenceFormat.BasicFormat {
106                        
    // Register this format with the format auto-guesser.
    // This static initializer runs when the class is initialised and hands the
    // class object to RichSequence.IOTools.registerFormat().
    static {
        RichSequence.IOTools.registerFormat(UniProtXMLFormat.class);
    }
111    
    /**
     * The name of this format
     */
    public static final String UNIPROTXML_FORMAT = "UniProtXML";
    
    // Naming convention for the constants below: *_TAG values are passed to
    // XMLWriter.openTag()/closeTag() as element names, *_ATTR values are passed
    // to XMLWriter.attribute() as attribute names.
    
    // Root element: opened in beginWriting() and closed in finishWriting().
    protected static final String ENTRY_GROUP_TAG = "uniprot";
    // One element per sequence; also the chunk name given to
    // XMLTools.readXMLChunk() when reading.
    protected static final String ENTRY_TAG = "entry";
    // Attributes written on the entry element by writeSequence().
    protected static final String ENTRY_VERSION_ATTR = "version";
    protected static final String ENTRY_NAMESPACE_ATTR = "dataset";
    protected static final String ENTRY_CREATED_ATTR = "created";
    protected static final String ENTRY_UPDATED_ATTR = "modified";
    // Optional chunk that readRichSequence() looks for after an entry.
    protected static final String COPYRIGHT_TAG = "copyright";
    
    // Simple child elements; accession and name are written directly below the
    // entry element by writeSequence().
    protected static final String ACCESSION_TAG = "accession";
    protected static final String NAME_TAG = "name";
    protected static final String TEXT_TAG = "text";
128    
129    protected static final String REF_ATTR = "ref";
130    protected static final String TYPE_ATTR = "type";
131    protected static final String KEY_ATTR = "key";
132    protected static final String ID_ATTR = "id";
133    protected static final String EVIDENCE_ATTR = "evidence";
134    protected static final String VALUE_ATTR = "value";
135    protected static final String STATUS_ATTR = "value";
136    protected static final String NAME_ATTR = "name";
137    
    // Protein description element. Note that writeSequence() writes the protein
    // type using TYPE_ATTR; PROTEIN_TYPE_ATTR holds the same literal ("type").
    protected static final String PROTEIN_TAG = "protein";
    protected static final String PROTEIN_TYPE_ATTR = "type";
    
    // DOMAIN_TAG is written for each part of a "Contains:" section of the
    // description and COMPONENT_TAG for each part of an "Includes:" section
    // (see writeSequence()). The remaining names in this group are used
    // outside the visible part of this file.
    protected static final String DOMAIN_TAG = "domain";
    protected static final String COMPONENT_TAG = "component";
    protected static final String GENE_TAG = "gene";
    protected static final String ORGANISM_TAG = "organism";
    protected static final String DBXREF_TAG = "dbReference";
    protected static final String PROPERTY_TAG = "property";
    protected static final String LINEAGE_TAG = "lineage";
    protected static final String TAXON_TAG = "taxon";
    protected static final String GENELOCATION_TAG = "geneLocation";
    // Same literal as NAME_TAG ("name"); kept as a separate constant.
    protected static final String GENELOCATION_NAME_TAG = "name";
    
    // Literature reference names. The RP_LINE/RC_* constant names presumably
    // refer to the RP and RC lines of the UniProt flat-file format, whose
    // content these elements carry -- confirm against the handler code.
    protected static final String REFERENCE_TAG = "reference";
    protected static final String CITATION_TAG = "citation";
    protected static final String TITLE_TAG = "title";
    protected static final String EDITOR_LIST_TAG = "editorList";
    protected static final String AUTHOR_LIST_TAG = "authorList";
    protected static final String PERSON_TAG = "person";
    protected static final String CONSORTIUM_TAG = "consortium";
    protected static final String LOCATOR_TAG = "locator";
    protected static final String RP_LINE_TAG = "scope";
    protected static final String RC_LINE_TAG = "source";
    protected static final String RC_SPECIES_TAG = "species";
    protected static final String RC_TISSUE_TAG = "tissue";
    protected static final String RC_TRANSP_TAG = "transposon";
    protected static final String RC_STRAIN_TAG = "strain";
    protected static final String RC_PLASMID_TAG = "plasmid";
    
    // Comment element and its attributes.
    protected static final String COMMENT_TAG = "comment";
    protected static final String COMMENT_MASS_ATTR = "mass";
    protected static final String COMMENT_ERROR_ATTR = "error";
    protected static final String COMMENT_METHOD_ATTR = "method";
    protected static final String COMMENT_LOCTYPE_ATTR = "locationType";
    
    // Names used inside comment elements (their nesting is defined by the
    // code that uses them, outside the visible part of this file).
    protected static final String COMMENT_ABSORPTION_TAG = "absorption";
    protected static final String COMMENT_ABS_MAX_TAG = "max";
    protected static final String COMMENT_KINETICS_TAG = "kinetics";
    protected static final String COMMENT_KIN_KM_TAG = "KM";
    protected static final String COMMENT_KIN_VMAX_TAG = "VMax";
    protected static final String COMMENT_PH_TAG = "phDependence";
    protected static final String COMMENT_REDOX_TAG = "redoxPotential";
    protected static final String COMMENT_TEMPERATURE_TAG = "temperatureDependence";
    protected static final String COMMENT_LINK_TAG = "link";
    protected static final String COMMENT_LINK_URI_ATTR = "uri";
    protected static final String COMMENT_EVENT_TAG = "event";
    protected static final String COMMENT_ISOFORM_TAG = "isoform";
    protected static final String COMMENT_INTERACTANT_TAG = "interactant";
    protected static final String COMMENT_INTERACT_INTACT_ATTR = "intactId";
    protected static final String COMMENT_INTERACT_LABEL_TAG = "label";
    protected static final String COMMENT_ORGANISMS_TAG = "organismsDiffer";
    protected static final String COMMENT_EXPERIMENTS_TAG = "experiments";
    
    // Miscellaneous elements. ID_TAG holds the same literal as ID_ATTR ("id").
    protected static final String NOTE_TAG = "note";
    protected static final String KEYWORD_TAG = "keyword";
    protected static final String PROTEIN_EXISTS_TAG = "proteinExistence";
    protected static final String ID_TAG = "id";
    
    // Feature element, its description attribute and its child elements.
    protected static final String FEATURE_TAG = "feature";
    protected static final String FEATURE_DESC_ATTR = "description";
    protected static final String FEATURE_ORIGINAL_TAG = "original";
    protected static final String FEATURE_VARIATION_TAG = "variation";
    
    // Evidence element and its attributes. EVIDENCE_TAG holds the same literal
    // as EVIDENCE_ATTR ("evidence").
    protected static final String EVIDENCE_TAG = "evidence";
    protected static final String EVIDENCE_CATEGORY_ATTR = "category";
    protected static final String EVIDENCE_ATTRIBUTE_ATTR = "attribute";
    protected static final String EVIDENCE_DATE_ATTR = "date";
    
    // Location names. "position" exists both as an attribute name and as an
    // element name, hence the two constants with the same value.
    protected static final String LOCATION_TAG = "location";
    protected static final String LOCATION_SEQ_ATTR = "sequence";
    protected static final String LOCATION_BEGIN_TAG = "begin";
    protected static final String LOCATION_END_TAG = "end";
    protected static final String LOCATION_POSITION_ATTR = "position";
    protected static final String LOCATION_POSITION_TAG = "position";
    
    // Sequence element and its attributes. SEQUENCE_TAG holds the same literal
    // as LOCATION_SEQ_ATTR, and SEQUENCE_MASS_ATTR the same as COMMENT_MASS_ATTR.
    protected static final String SEQUENCE_TAG = "sequence";
    protected static final String SEQUENCE_VERSION_ATTR = "version";
    protected static final String SEQUENCE_LENGTH_ATTR = "length";
    protected static final String SEQUENCE_MASS_ATTR = "mass";
    protected static final String SEQUENCE_CHECKSUM_ATTR = "checksum";
    protected static final String SEQUENCE_MODIFIED_ATTR = "modified";
220    
    // RP line parser: matches the text "SEQUENCE OF <start>-<end>" in full and
    // captures the two numbers as groups 1 and 2.
    protected static final Pattern rppat = Pattern.compile("SEQUENCE OF (\\d+)-(\\d+)");
    
    // Format sniffer used by both canRead() methods: a line matches when the
    // UniProt schema URL occurs anywhere within it.
    protected static final Pattern xmlSchema = Pattern.compile(".*http://www\\.uniprot\\.org/support/docs/uniprot\\.xsd.*");
225    
226    /**
227     * Implements some UniProtXML-specific terms.
228     */
229    public static class Terms extends RichSequence.Terms {        
230        public static final String CONTAINS_PREFIX = "Contains:";
231        public static final String INCLUDES_PREFIX = "Includes:";
232        
233        public static final String GENENAME_KEY = "primary";
234        public static final String GENESYNONYM_KEY = "synonym";
235        public static final String ORDLOCNAME_KEY = "ordered locus";
236        public static final String ORFNAME_KEY = "ORF";
237        
238        public static final String NCBI_TAXON_KEY = "NCBI Taxonomy";
239        public static final String COMMON_NAME_KEY = "common";
240        public static final String FULL_NAME_KEY = "full";
241        public static final String SCIENTIFIC_NAME_KEY = "scientific";
242        public static final String SYNONYM_NAME_KEY = "synonym";
243        public static final String ABBREV_NAME_KEY = "abbreviation";
244        
245        public static final String LOC_FUZZY_START_KEY = "less than";
246        public static final String LOC_FUZZY_END_KEY = "greater than";
247        
248        // Ontology for uniprot keywords (because they have identifiers, aaargh...)
249        private static ComparableOntology uniprotKWOnto = null;
250        
251        /**
252         * Getter for the protein exists term
253         * @return The protein exists Term
254         */
255        public static ComparableTerm getProteinExistsTerm() {
256            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("UniProt protein exists");
257        }
258        
259        /**
260         * Getter for the private uniprot ontology.
261         * @return the ontology.
262         */
263        public static ComparableOntology getUniprotKWOnto() {
264            return (ComparableOntology)RichObjectFactory.getObject(SimpleComparableOntology.class, new Object[]{"uniprot_kw"});
265        }
266        
267        /**
268         * Getter for the UniProtXML term
269         * @return The UniProtXML Term
270         */
271        public static ComparableTerm getUniProtXMLTerm() {
272            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("UniProtXML");
273        }
274        
275        /**
276         * Getter for the protein type term
277         * @return The protein type Term
278         */
279        public static ComparableTerm getProteinTypeTerm() {
280            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("protein_type");
281        }
282        
283        /**
284         * Getter for the evidence category term
285         * @return The evidence category Term
286         */
287        public static ComparableTerm getEvidenceCategoryTerm() {
288            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("evidence_category");
289        }
290        
291        /**
292         * Getter for the evidence type term
293         * @return The evidence type Term
294         */
295        public static ComparableTerm getEvidenceTypeTerm() {
296            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("evidence_type");
297        }
298        
299        /**
300         * Getter for the evidence date term
301         * @return The evidence date Term
302         */
303        public static ComparableTerm getEvidenceDateTerm() {
304            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("evidence_date");
305        }
306        
307        /**
308         * Getter for the evidence attr term
309         * @return The evidence attr Term
310         */
311        public static ComparableTerm getEvidenceAttrTerm() {
312            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("evidence_attr");
313        }
314        
315        /**
316         * Getter for the feature ref term
317         * @return The feature ref Term
318         */
319        public static ComparableTerm getFeatureRefTerm() {
320            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("feature_ref");
321        }
322        
323        /**
324         * Getter for the feature status term
325         * @return The feature status Term
326         */
327        public static ComparableTerm getFeatureStatusTerm() {
328            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("feature_status");
329        }
330        
331        /**
332         * Getter for the feature original term
333         * @return The feature original Term
334         */
335        public static ComparableTerm getFeatureOriginalTerm() {
336            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("feature_original");
337        }
338        
339        /**
340         * Getter for the feature variation term
341         * @return The feature variation Term
342         */
343        public static ComparableTerm getFeatureVariationTerm() {
344            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("feature_variation");
345        }
346        
347        /**
348         * Getter for the location seq term
349         * @return The location seq Term
350         */
351        public static ComparableTerm getLocationSequenceTerm() {
352            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("locseq");
353        }
354    }    
355    
356    /**
357     * {@inheritDoc}
358     * A file is in UniProtXML format if the second XML line contains the phrase "http://www.uniprot.org/support/docs/uniprot.xsd".
359     */
360    @Override
361    public boolean canRead(File file) throws IOException {
362        BufferedReader br = new BufferedReader(new FileReader(file));
363        br.readLine(); // skip first line
364        String secondLine = br.readLine();
365        boolean readable = secondLine!=null && xmlSchema.matcher(secondLine).matches(); // check on second line
366        br.close();
367        return readable;
368    }
369    
370    /**
371     * {@inheritDoc}
372     * Always returns a protein tokenizer.
373     */
374    @Override
375    public SymbolTokenization guessSymbolTokenization(File file) throws IOException {
376        return RichSequence.IOTools.getProteinParser();
377    }
378    
379    /**
380     * {@inheritDoc}
381     * A stream is in UniProtXML format if the second XML line contains the phrase "http://www.uniprot.org/support/docs/uniprot.xsd".
382     */
383    public boolean canRead(BufferedInputStream stream) throws IOException {
384        stream.mark(2000); // some streams may not support this
385        BufferedReader br = new BufferedReader(new InputStreamReader(stream));
386        br.readLine(); // skip first line
387        String secondLine = br.readLine();
388        boolean readable = secondLine!=null && xmlSchema.matcher(secondLine).matches(); // check on second line
389        // don't close the reader as it'll close the stream too.
390        // br.close();
391        stream.reset();
392        return readable;
393    }
394    
395    /**
396     * {@inheritDoc}
397     * Always returns a protein tokenizer.
398     */
399    public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException {
400        return RichSequence.IOTools.getProteinParser();
401    }
402    
403    /**
404     * {@inheritDoc}
405     */
406    public boolean readSequence(BufferedReader reader,
407            SymbolTokenization symParser,
408            SeqIOListener listener)
409            throws IllegalSymbolException, IOException, ParseException {
410        if (!(listener instanceof RichSeqIOListener)) throw new IllegalArgumentException("Only accepting RichSeqIOListeners today");
411        return this.readRichSequence(reader,symParser,(RichSeqIOListener)listener,null);
412    }
413    
414    /**
415     * {@inheritDoc}
416     * If namespace is null, then the namespace of the sequence in the fasta is used.
417     * If the namespace is null and so is the namespace of the sequence in the fasta,
418     * then the default namespace is used.
419     */
420    public boolean readRichSequence(BufferedReader reader,
421            SymbolTokenization symParser,
422            RichSeqIOListener rlistener,
423            Namespace ns)
424            throws IllegalSymbolException, IOException, ParseException {
425        
426        Pattern copyright = Pattern.compile(".*<"+COPYRIGHT_TAG+".*");
427        
428        try {
429            rlistener.startSequence();                   
430            DefaultHandler m_handler = new UniProtXMLHandler(this,symParser,rlistener,ns);
431            boolean hasMore=XMLTools.readXMLChunk(reader, m_handler, ENTRY_TAG);
432            // deal with copyright chunk
433            reader.mark(10000);
434            String line = reader.readLine();
435            reader.reset();
436            if (copyright.matcher(line).matches()) XMLTools.readXMLChunk(reader, m_handler, COPYRIGHT_TAG);
437            // all done!
438            rlistener.endSequence();
439            return hasMore;
440        } catch (ParserConfigurationException e) {
441            throw new ParseException(e);
442        } catch (SAXException e) {
443            throw new ParseException(e);
444        }
445    }
446    
447    private PrintWriter pw;
448    private XMLWriter xml;
449    
450    /**
451     * {@inheritDoc}
452     */
453    public void beginWriting() throws IOException {
454        // make an XML writer
455        pw = new PrintWriter(this.getPrintStream());
456        xml = new PrettyXMLWriter(pw);
457        xml.printRaw("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>");
458        xml.openTag(ENTRY_GROUP_TAG);
459        xml.attribute("xmlns","http://uniprot.org/uniprot");
460        xml.attribute("xmlns:xsi","http://www.w3.org/2001/XMLSchema-instance");
461        xml.attribute("xsi:schemaLocation","http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd");
462    }
463    
464    /**
465     * {@inheritDoc}
466     */
467    public void finishWriting() throws IOException {
468        xml.closeTag(ENTRY_GROUP_TAG);
469        pw.flush();
470    }
471    
472    /**
473     * {@inheritDoc}
474     */
475    public void writeSequence(Sequence seq, PrintStream os) throws IOException {
476        if (this.getPrintStream()==null) this.setPrintStream(this.getPrintStream());
477        this.writeSequence(seq, RichObjectFactory.getDefaultNamespace());
478    }
479    
480    /**
481     * {@inheritDoc}
482     */
483    public void writeSequence(Sequence seq, String format, PrintStream os) throws IOException {
484        if (this.getPrintStream()==null) this.setPrintStream(this.getPrintStream());
485        if (!format.equals(this.getDefaultFormat())) throw new IllegalArgumentException("Unknown format: "+format);
486        this.writeSequence(seq, RichObjectFactory.getDefaultNamespace());
487    }
488    
489    /**
490     * {@inheritDoc}
491     * If namespace is null, then the sequence's own namespace is used.
492     */
493    public void writeSequence(Sequence seq, Namespace ns) throws IOException {
494        RichSequence rs;
495        try {
496            if (seq instanceof RichSequence) rs = (RichSequence)seq;
497            else rs = RichSequence.Tools.enrich(seq);
498        } catch (ChangeVetoException e) {
499            IOException e2 = new IOException("Unable to enrich sequence");
500            e2.initCause(e);
501            throw e2;
502        }
503        
504        int key = 1;
505        
506        Set<Note> notes = rs.getNoteSet();
507        List accessions = new ArrayList();
508        List kws = new ArrayList();
509        String cdat = null;
510        String udat = null;
511        String arel = null;
512        String adat = null;
513        String copyright = null;
514        String proteinType = null;
515        String proteinExists = null;
516        Map genenames = new TreeMap();
517        Map genesynonyms = new TreeMap();
518        Map orfnames = new TreeMap();
519        Map ordlocnames = new TreeMap();
520        Set evidenceIDs = new TreeSet();
521        Set organelles = new TreeSet();
522        Map evcats = new TreeMap();
523        Map evtypes = new TreeMap();
524        Map evdates = new TreeMap();
525        Map evattrs = new TreeMap();
526        Map speciesRecs = new TreeMap();
527        Map strainRecs = new TreeMap();
528        Map tissueRecs = new TreeMap();
529        Map transpRecs = new TreeMap();
530        Map plasmidRecs = new TreeMap();
531        for (Iterator<Note> i = notes.iterator(); i.hasNext();) {
532            Note n = i.next();
533            if (n.getTerm().equals(Terms.getDateCreatedTerm())) cdat=n.getValue();
534            else if (n.getTerm().equals(Terms.getDateUpdatedTerm())) udat=n.getValue();
535            else if (n.getTerm().equals(Terms.getRelAnnotatedTerm())) arel=n.getValue();
536            else if (n.getTerm().equals(Terms.getDateAnnotatedTerm())) adat=n.getValue();
537            else if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) accessions.add(n.getValue());
538            else if (n.getTerm().equals(Terms.getOrganelleTerm())) organelles.add(n.getValue());
539            else if (n.getTerm().equals(Terms.getKeywordTerm())) {
540                ComparableTerm t = Terms.getUniprotKWOnto().getOrCreateTerm(n.getValue());
541                try {
542                    if (t.getIdentifier()==null || t.getIdentifier().length()==0) t.setIdentifier("UNKNOWN");
543                } catch (ChangeVetoException ce) {
544                    IOException e = new IOException("Failed to assign keyword identifier");
545                    e.initCause(ce);
546                    throw e;
547                }
548                kws.add(t);
549            } else if (n.getTerm().equals(Terms.getCopyrightTerm())) copyright=n.getValue();
550            else if (n.getTerm().equals(Terms.getProteinTypeTerm())) proteinType=n.getValue();
551            else if (n.getTerm().equals(Terms.getProteinExistsTerm())) proteinExists=n.getValue();
552            // use the nasty hack to split the reference rank away from the actual value in this field
553            else if (n.getTerm().equals(Terms.getGeneNameTerm()))  {
554                String ref = n.getValue();
555                int colon = ref.indexOf(':');
556                Integer refID = new Integer(0);
557                if (colon>=1) refID = new Integer(ref.substring(0,colon));
558                genenames.put(refID, ref.substring(colon+1)); // map of id -> string as only one name per gene
559            } else if (n.getTerm().equals(Terms.getGeneSynonymTerm())) {
560                String ref = n.getValue();
561                int colon = ref.indexOf(':');
562                Integer refID = new Integer(0);
563                if (colon>=1) refID = new Integer(ref.substring(0,colon));
564                if (genesynonyms.get(refID)==null) genesynonyms.put(refID, new ArrayList());
565                ((List)genesynonyms.get(refID)).add(ref.substring(colon+1));
566            } else if (n.getTerm().equals(Terms.getOrderedLocusNameTerm())) {
567                String ref = n.getValue();
568                int colon = ref.indexOf(':');
569                Integer refID = new Integer(0);
570                if (colon>=1) refID = new Integer(ref.substring(0,colon));
571                if (ordlocnames.get(refID)==null) ordlocnames.put(refID, new ArrayList());
572                ((List)ordlocnames.get(refID)).add(ref.substring(colon+1));
573            } else if (n.getTerm().equals(Terms.getORFNameTerm())) {
574                String ref = n.getValue();
575                int colon = ref.indexOf(':');
576                Integer refID = new Integer(0);
577                if (colon>=1) refID = new Integer(ref.substring(0,colon));
578                if (orfnames.get(refID)==null) orfnames.put(refID, new ArrayList());
579                ((List)orfnames.get(refID)).add(ref.substring(colon+1));
580            }
581            // use the nasty hack to split the reference rank away from the actual value in this field
582            else if (n.getTerm().equals(Terms.getEvidenceCategoryTerm()))  {
583                String ref = n.getValue();
584                int colon = ref.indexOf(':');
585                Integer refID = new Integer(0);
586                if (colon>=1) refID = new Integer(ref.substring(0,colon));
587                evcats.put(refID, ref.substring(colon+1)); // map of id -> string as only one name per gene
588                evidenceIDs.add(refID);
589            } else if (n.getTerm().equals(Terms.getEvidenceTypeTerm()))  {
590                String ref = n.getValue();
591                int colon = ref.indexOf(':');
592                Integer refID = new Integer(0);
593                if (colon>=1) refID = new Integer(ref.substring(0,colon));
594                evtypes.put(refID, ref.substring(colon+1)); // map of id -> string as only one name per gene
595                evidenceIDs.add(refID);
596            } else if (n.getTerm().equals(Terms.getEvidenceDateTerm()))  {
597                String ref = n.getValue();
598                int colon = ref.indexOf(':');
599                Integer refID = new Integer(0);
600                if (colon>=1) refID = new Integer(ref.substring(0,colon));
601                evdates.put(refID, ref.substring(colon+1)); // map of id -> string as only one name per gene
602                evidenceIDs.add(refID);
603            } else if (n.getTerm().equals(Terms.getEvidenceAttrTerm()))  {
604                String ref = n.getValue();
605                int colon = ref.indexOf(':');
606                Integer refID = new Integer(0);
607                if (colon>=1) refID = new Integer(ref.substring(0,colon));
608                evattrs.put(refID, ref.substring(colon+1)); // map of id -> string as only one name per gene
609                evidenceIDs.add(refID);
610            }
611            // use the nasty hack to split the reference rank away from the actual value in this field
612            // we'll end up with a bunch in key 0 for those which did not come from us. We ignore these for now.
613            else if (n.getTerm().equals(Terms.getSpeciesTerm())) {
614                String ref = n.getValue();
615                int colon = ref.indexOf(':');
616                Integer refID = new Integer(0);
617                if (colon>=1) refID = new Integer(ref.substring(0,colon));
618                if (speciesRecs.get(refID)==null) speciesRecs.put(refID, new ArrayList());
619                ((List)speciesRecs.get(refID)).add(ref.substring(colon+1));
620            } else if (n.getTerm().equals(Terms.getStrainTerm()))  {
621                String ref = n.getValue();
622                int colon = ref.indexOf(':');
623                Integer refID = new Integer(0);
624                if (colon>=1) refID = new Integer(ref.substring(0,colon));
625                if (strainRecs.get(refID)==null) strainRecs.put(refID, new ArrayList());
626                ((List)strainRecs.get(refID)).add(ref.substring(colon+1));
627            } else if (n.getTerm().equals(Terms.getTissueTerm()))  {
628                String ref = n.getValue();
629                int colon = ref.indexOf(':');
630                Integer refID = new Integer(0);
631                if (colon>=1) refID = new Integer(ref.substring(0,colon));
632                if (tissueRecs.get(refID)==null) tissueRecs.put(refID, new ArrayList());
633                ((List)tissueRecs.get(refID)).add(ref.substring(colon+1));
634            } else if (n.getTerm().equals(Terms.getTransposonTerm()))  {
635                String ref = n.getValue();
636                int colon = ref.indexOf(':');
637                Integer refID = new Integer(0);
638                if (colon>=1) refID = new Integer(ref.substring(0,colon));
639                if (transpRecs.get(refID)==null) transpRecs.put(refID, new ArrayList());
640                ((List)transpRecs.get(refID)).add(ref.substring(colon+1));
641            } else if (n.getTerm().equals(Terms.getPlasmidTerm()))  {
642                String ref = n.getValue();
643                int colon = ref.indexOf(':');
644                Integer refID = new Integer(0);
645                if (colon>=1) refID = new Integer(ref.substring(0,colon));
646                if (plasmidRecs.get(refID)==null) plasmidRecs.put(refID, new ArrayList());
647                ((List)plasmidRecs.get(refID)).add(ref.substring(colon+1));
648            }
649        }
650        
651        xml.openTag(ENTRY_TAG);
652        xml.attribute(ENTRY_VERSION_ATTR,""+(arel==null?""+rs.getVersion():arel));
653        xml.attribute(ENTRY_NAMESPACE_ATTR,(ns==null?rs.getNamespace().getName():ns.getName()));
654        xml.attribute(ENTRY_CREATED_ATTR,cdat);
655        xml.attribute(ENTRY_UPDATED_ATTR,(adat==null?cdat:adat)); // annotation update
656        
657        xml.openTag(ACCESSION_TAG);
658        xml.print(rs.getAccession());
659        xml.closeTag(ACCESSION_TAG);
660        
661        xml.openTag(NAME_TAG);
662        xml.print(rs.getName());
663        xml.closeTag(NAME_TAG);
664        
665        xml.openTag(PROTEIN_TAG);
666        if (proteinType!=null) xml.attribute(TYPE_ATTR,proteinType);
667        String desc = rs.getDescription().trim(); // this is only going to make sense if it was a UniProt seq to start with
668        if (desc.endsWith(".")) desc = desc.substring(0, desc.length()-1); // chomp trailing dot
669        String[] parts = desc.split("\\[");
670        for (int j = 0 ; j < parts.length; j++) {
671            if (parts[j].startsWith(Terms.CONTAINS_PREFIX)) {
672                // contains section
673                String chunk = parts[j].substring(Terms.CONTAINS_PREFIX.length()+1).trim();
674                if (chunk.endsWith("]")) chunk = chunk.substring(0, chunk.length()-1); // chomp trailing ]
675                String[] moreparts = chunk.split(";");
676                for (int k = 0; k < moreparts.length; k++) {
677                    xml.openTag(DOMAIN_TAG);
678                    String[] names = moreparts[k].split("\\(");
679                    for (int l = 0; l < names.length; l++) {
680                        String name = names[l].trim();
681                        if (name.endsWith(")")) name = name.substring(0,name.length()-1); // chomp trailing )
682                        xml.openTag(NAME_TAG);
683                        xml.print(name);
684                        xml.closeTag(NAME_TAG);
685                    }
686                    xml.closeTag(DOMAIN_TAG);
687                }
688            } else if (parts[j].startsWith(Terms.INCLUDES_PREFIX)) {
689                // includes section
690                String chunk = parts[j].substring(Terms.INCLUDES_PREFIX.length()+1).trim();
691                if (chunk.endsWith("]")) chunk = chunk.substring(0, chunk.length()-1); // chomp trailing ]
692                String[] moreparts = chunk.split(";");
693                for (int k = 0; k < moreparts.length; k++) {
694                    xml.openTag(COMPONENT_TAG);
695                    String[] names = moreparts[k].split("\\(");
696                    for (int l = 0; l < names.length; l++) {
697                        String name = names[l].trim();
698                        if (name.endsWith(")")) name = name.substring(0,name.length()-1); // chomp trailing )
699                        xml.openTag(NAME_TAG);
700                        xml.print(name);
701                        xml.closeTag(NAME_TAG);
702                    }
703                    xml.closeTag(COMPONENT_TAG);
704                }
705            } else {
706                // plain names
707                String[] names = parts[j].split("\\(");
708                for (int l = 0; l < names.length; l++) {
709                    String name = names[l].trim();
710                    if (name.endsWith(")")) name = name.substring(0,name.length()-1); // chomp trailing )
711                    xml.openTag(NAME_TAG);
712                    xml.print(name);
713                    xml.closeTag(NAME_TAG);
714                }
715            }
716        }
717        xml.closeTag(PROTEIN_TAG);
718        
719        // gene line
720        for (Iterator i = genenames.keySet().iterator(); i.hasNext(); ) {
721            Integer geneid = (Integer)i.next();
722            String genename = (String)genenames.get(geneid);
723            List synonyms = (List)genesynonyms.get(geneid);
724            List orfs = (List)orfnames.get(geneid);
725            List ordlocs = (List)ordlocnames.get(geneid);
726            
727            xml.openTag(GENE_TAG);
728            
729            xml.openTag(NAME_TAG);
730            xml.attribute(TYPE_ATTR,Terms.GENENAME_KEY);
731            xml.print(genename);
732            xml.closeTag(NAME_TAG);
733            
734            if (synonyms!=null) {
735                for (Iterator j = synonyms.iterator(); j.hasNext(); ) {
736                    xml.openTag(NAME_TAG);
737                    xml.attribute(TYPE_ATTR,Terms.GENESYNONYM_KEY);
738                    xml.print((String)j.next());
739                    xml.closeTag(NAME_TAG);
740                }
741            }
742            if (ordlocs!=null) {
743                for (Iterator j = synonyms.iterator(); j.hasNext(); ) {
744                    xml.openTag(NAME_TAG);
745                    xml.attribute(TYPE_ATTR,Terms.ORDLOCNAME_KEY);
746                    xml.print((String)j.next());
747                    xml.closeTag(NAME_TAG);
748                }
749            }
750            if (orfs!=null) {
751                for (Iterator j = synonyms.iterator(); j.hasNext(); ) {
752                    xml.openTag(NAME_TAG);
753                    xml.attribute(TYPE_ATTR,Terms.ORFNAME_KEY);
754                    xml.print((String)j.next());
755                    xml.closeTag(NAME_TAG);
756                }
757            }
758            
759            xml.closeTag(GENE_TAG);
760        }
761        
762        // source line (from taxon)
763        //   organism line
764        NCBITaxon tax = rs.getTaxon();
765        if (tax!=null) {
766            xml.openTag(ORGANISM_TAG);
767            xml.attribute(KEY_ATTR,""+(key++));
768            
769            for (Iterator i = tax.getNameClasses().iterator(); i.hasNext(); ) {
770                String nameclass = (String)i.next();
771                String ournameclass = Terms.COMMON_NAME_KEY;
772                if (nameclass.equalsIgnoreCase(Terms.FULL_NAME_KEY)) ournameclass = NCBITaxon.EQUIVALENT;
773                else if (nameclass.equalsIgnoreCase(Terms.SCIENTIFIC_NAME_KEY)) ournameclass = NCBITaxon.SCIENTIFIC;
774                else if (nameclass.equalsIgnoreCase(Terms.SYNONYM_NAME_KEY)) ournameclass = NCBITaxon.SYNONYM;
775                else if (nameclass.equalsIgnoreCase(Terms.ABBREV_NAME_KEY)) ournameclass = NCBITaxon.ACRONYM;
776                for (Iterator j = tax.getNames(nameclass).iterator(); j.hasNext(); ) {
777                    xml.openTag(NAME_TAG);
778                    xml.attribute(TYPE_ATTR,ournameclass);
779                    xml.print((String)j.next());
780                    xml.closeTag(NAME_TAG);
781                }
782            }
783            
784            xml.openTag(DBXREF_TAG);
785            xml.attribute(KEY_ATTR,""+(key++));
786            xml.attribute(TYPE_ATTR,Terms.NCBI_TAXON_KEY);
787            xml.attribute(ID_ATTR,""+tax.getNCBITaxID());
788            xml.closeTag(DBXREF_TAG);
789            
790            String h = tax.getNameHierarchy();
791            h = h.substring(0, h.length()-1); // chomp dot
792            String[] hierarch = h.split(";");
793            xml.openTag(LINEAGE_TAG);
794            for (int j = 0; j < hierarch.length; j++) {
795                xml.openTag(TAXON_TAG);
796                xml.print(hierarch[j].trim());
797                xml.closeTag(TAXON_TAG);
798            }
799            xml.closeTag(LINEAGE_TAG);
800            
801            xml.closeTag(ORGANISM_TAG);
802        }
803        
804        // gene location line (organelle)
805        for (Iterator i = organelles.iterator(); i.hasNext(); ) {
806            String org = (String)i.next();
807            xml.openTag(GENELOCATION_TAG);
808            if (org.startsWith("Plasmid")) {
809                xml.attribute(TYPE_ATTR,"plasmid");
810                String[] subparts = org.split(",");
811                for (int j = 0; j < parts.length; j++) {
812                    org = subparts[j].trim();
813                    if (org.startsWith("and")) org = org.substring(3).trim();
814                    org = org.substring("Plasmid".length()).trim();
815                    xml.openTag(GENELOCATION_NAME_TAG);
816                    xml.attribute(STATUS_ATTR,"known");
817                    xml.print(org);
818                    xml.closeTag(GENELOCATION_NAME_TAG);
819                }
820            } else {
821                xml.attribute(TYPE_ATTR,org.toLowerCase()); // uniprotxml must have lower case
822            }
823            xml.closeTag(GENELOCATION_TAG);
824        }
825        
826        // docrefs
827        for (Iterator<RankedDocRef> i = rs.getRankedDocRefs().iterator(); i.hasNext(); ) {
828            RankedDocRef rdr = i.next();
829            DocRef dr = rdr.getDocumentReference();
830            
831            xml.openTag(REFERENCE_TAG);
832            xml.attribute(KEY_ATTR,""+(key++));
833            
834            xml.openTag(CITATION_TAG);
835            xml.attribute(TYPE_ATTR,"journal article"); // faking it i know
836            
837            if (dr.getTitle()!=null) {
838                xml.openTag(TITLE_TAG);
839                xml.print(dr.getTitle());
840                xml.closeTag(TITLE_TAG);
841            }
842            
843            List<DocRefAuthor> auths = new ArrayList(dr.getAuthorList());
844            List<DocRefAuthor> editors = new ArrayList<DocRefAuthor>(auths);
845            for (final Iterator<DocRefAuthor> j = editors.iterator(); j.hasNext(); ) {
846                DocRefAuthor a = j.next();
847                if (!a.isEditor())
848                    j.remove();
849                else
850                    auths.remove(a);
851            }
852            if (!editors.isEmpty()) {
853                xml.openTag(EDITOR_LIST_TAG);
854                for (Iterator<DocRefAuthor> j = editors.iterator(); j.hasNext(); ) {
855                    DocRefAuthor a = j.next();
856                    if (a.isEditor()) {
857                        if (a.isConsortium()) {
858                            xml.openTag(CONSORTIUM_TAG);
859                            xml.attribute(NAME_ATTR,a.getName());
860                            xml.closeTag(CONSORTIUM_TAG);
861                        } else {
862                            xml.openTag(PERSON_TAG);
863                            xml.attribute(NAME_ATTR,a.getName());
864                            xml.closeTag(PERSON_TAG);
865                        }
866                    }
867                }
868                xml.closeTag(EDITOR_LIST_TAG);
869            }
870            if (!auths.isEmpty()) {
871                xml.openTag(AUTHOR_LIST_TAG);
872                for (Iterator j = auths.iterator(); j.hasNext(); ) {
873                    DocRefAuthor a = (DocRefAuthor)j.next();
874                    if (a.isConsortium()) {
875                        xml.openTag(CONSORTIUM_TAG);
876                        xml.attribute(NAME_ATTR,a.getName());
877                        xml.closeTag(CONSORTIUM_TAG);
878                    } else {
879                        xml.openTag(PERSON_TAG);
880                        xml.attribute(NAME_ATTR,a.getName());
881                        xml.closeTag(PERSON_TAG);
882                    }
883                }
884                xml.closeTag(AUTHOR_LIST_TAG);
885            }
886            
887            xml.openTag(LOCATOR_TAG);
888            xml.print(dr.getLocation());
889            xml.closeTag(LOCATOR_TAG);
890            
891            CrossRef cr = dr.getCrossref();
892            if (cr!=null) {
893                xml.openTag(DBXREF_TAG);
894                xml.attribute(TYPE_ATTR,cr.getDbname());
895                xml.attribute(ID_ATTR,cr.getAccession());
896                xml.attribute(KEY_ATTR,""+(key++));
897                if (!cr.getNoteSet().isEmpty()) {
898                    for (Iterator<Note> j = cr.getNoteSet().iterator(); j.hasNext(); ) {
899                        Note n = j.next();
900                        xml.openTag(PROPERTY_TAG);
901                        xml.attribute(TYPE_ATTR,n.getTerm().getName());
902                        xml.attribute(VALUE_ATTR,n.getValue());
903                        xml.closeTag(PROPERTY_TAG);
904                    }
905                }
906                xml.closeTag(DBXREF_TAG);
907            }
908            
909            xml.closeTag(CITATION_TAG);
910            
911            // RP
912            xml.openTag(RP_LINE_TAG);
913            xml.print(dr.getRemark());
914            xml.closeTag(RP_LINE_TAG);
915            // Print out ref position if present
916            if (rdr.getStart()!=null && rdr.getEnd()!=null && !rppat.matcher(dr.getRemark()).matches()) {
917                xml.openTag(RP_LINE_TAG);
918                xml.print("SEQUENCE OF "+rdr.getStart()+"-"+rdr.getEnd()+".");
919                xml.closeTag(RP_LINE_TAG);
920            }
921            
922            // RC
923            boolean rcOpened = false;
924            Integer rank = new Integer(rdr.getRank());
925            if (speciesRecs.get(rank)!=null) {
926                if (!rcOpened) {
927                    xml.openTag(RC_LINE_TAG);
928                    rcOpened = true;
929                }
930                for (Iterator j = ((List)speciesRecs.get(rank)).iterator(); j.hasNext(); ) {
931                    xml.openTag(RC_SPECIES_TAG);
932                    xml.print((String)j.next());
933                    xml.closeTag(RC_SPECIES_TAG);
934                }
935            }
936            if (strainRecs.get(rank)!=null) {
937                if (!rcOpened) {
938                    xml.openTag(RC_LINE_TAG);
939                    rcOpened = true;
940                }
941                for (Iterator j = ((List)strainRecs.get(rank)).iterator(); j.hasNext(); ) {
942                    xml.openTag(RC_STRAIN_TAG);
943                    xml.print((String)j.next());
944                    xml.closeTag(RC_STRAIN_TAG);
945                }
946            }
947            if (tissueRecs.get(rank)!=null) {
948                if (!rcOpened) {
949                    xml.openTag(RC_LINE_TAG);
950                    rcOpened = true;
951                }
952                for (Iterator j = ((List)tissueRecs.get(rank)).iterator(); j.hasNext(); ) {
953                    xml.openTag(RC_TISSUE_TAG);
954                    xml.print((String)j.next());
955                    xml.closeTag(RC_TISSUE_TAG);
956                }
957            }
958            if (transpRecs.get(rank)!=null) {
959                if (!rcOpened) {
960                    xml.openTag(RC_LINE_TAG);
961                    rcOpened = true;
962                }
963                for (Iterator j = ((List)transpRecs.get(rank)).iterator(); j.hasNext(); ) {
964                    xml.openTag(RC_TRANSP_TAG);
965                    xml.print((String)j.next());
966                    xml.closeTag(RC_TRANSP_TAG);
967                }
968            }
969            if (plasmidRecs.get(rank)!=null) {
970                if (!rcOpened) {
971                    xml.openTag(RC_LINE_TAG);
972                    rcOpened = true;
973                }
974                for (Iterator j = ((List)plasmidRecs.get(rank)).iterator(); j.hasNext(); ) {
975                    xml.openTag(RC_PLASMID_TAG);
976                    xml.print((String)j.next());
977                    xml.closeTag(RC_PLASMID_TAG);
978                }
979            }
980            if (rcOpened)
981                xml.closeTag(RC_LINE_TAG);
982            
983            xml.closeTag(REFERENCE_TAG);
984        }
985        
986        // comments
987        for (Iterator<Comment> i = rs.getComments().iterator(); i.hasNext(); ) {
988            // use UniProtCommentParser to convert each text comment from string to object
989            // do not print unconvertible ones (eg. no -!- on text)
990            Comment c = i.next();
991            if (UniProtCommentParser.isParseable(c)) {
992                // otherwise parse and display appropriately
993                UniProtCommentParser ucp = new UniProtCommentParser();
994                try {
995                    ucp.parseComment(c);
996                } catch (ParseException ce) {
997                    IOException e = new IOException("Failed to parse comment when outputting");
998                    e.initCause(ce);
999                    throw e;
1000                }
1001                String type = ucp.getCommentType();
1002                String xtype = type.toLowerCase(); // uniprotxml requires lower case
1003                if (type.equalsIgnoreCase(UniProtCommentParser.PTM)) xtype = "posttranslational modification";
1004                else if (type.equalsIgnoreCase(UniProtCommentParser.DATABASE)) xtype = "online information";
1005                
1006                xml.openTag(COMMENT_TAG);
1007                xml.attribute(TYPE_ATTR,xtype);
1008                
1009                // database comment
1010                if (type.equalsIgnoreCase(UniProtCommentParser.DATABASE)) {
1011                    xml.attribute(NAME_ATTR,ucp.getDatabaseName());
1012                    
1013                    xml.openTag(COMMENT_LINK_TAG);
1014                    xml.attribute(COMMENT_LINK_URI_ATTR,ucp.getUri());
1015                    xml.closeTag(COMMENT_LINK_TAG);
1016                }
1017                // mass spec
1018                else if (type.equalsIgnoreCase(UniProtCommentParser.MASS_SPECTROMETRY)) {
1019                    xml.attribute(COMMENT_MASS_ATTR,""+ucp.getMolecularWeight());
1020                    if (ucp.getMolWeightError()!=null) xml.attribute(COMMENT_ERROR_ATTR,""+ucp.getMolWeightError());
1021                    xml.attribute(COMMENT_METHOD_ATTR,""+ucp.getMolWeightMethod());
1022                    
1023                    xml.openTag(LOCATION_TAG);
1024                    xml.openTag(LOCATION_BEGIN_TAG);
1025                    xml.attribute(LOCATION_POSITION_ATTR,""+ucp.getMolWeightRangeStart());
1026                    xml.closeTag(LOCATION_BEGIN_TAG);
1027                    xml.openTag(LOCATION_END_TAG);
1028                    xml.attribute(LOCATION_POSITION_ATTR,""+ucp.getMolWeightRangeEnd());
1029                    xml.closeTag(LOCATION_END_TAG);
1030                    xml.closeTag(LOCATION_TAG);
1031                }
1032                // interaction
1033                else if (type.equalsIgnoreCase(UniProtCommentParser.INTERACTION)) {
1034                    // UniProt flat allows for multiple interactions per comment, but
1035                    // UniProtXML only allows for a single one. So, we have to open/close
1036                    // and write additional comments as necessary.
1037                    for (Iterator j = ucp.getInteractions().iterator(); j.hasNext(); ) {
1038                        // process comment
1039                        Interaction interact = (Interaction)j.next();
1040                        
1041                        xml.openTag(COMMENT_INTERACTANT_TAG);
1042                        xml.attribute(COMMENT_INTERACT_INTACT_ATTR,interact.getFirstIntActID());
1043                        xml.closeTag(COMMENT_INTERACTANT_TAG);
1044                        
1045                        xml.openTag(COMMENT_INTERACTANT_TAG);
1046                        xml.attribute(COMMENT_INTERACT_INTACT_ATTR,interact.getSecondIntActID());
1047                        xml.openTag(ID_TAG);
1048                        xml.print(interact.getID());
1049                        xml.closeTag(ID_TAG);
1050                        if (interact.getLabel()!=null) {
1051                            xml.openTag(COMMENT_INTERACT_LABEL_TAG);
1052                            xml.print(interact.getLabel());
1053                            xml.closeTag(COMMENT_INTERACT_LABEL_TAG);
1054                        }
1055                        xml.closeTag(COMMENT_INTERACTANT_TAG);
1056                        
1057                        xml.openTag(COMMENT_ORGANISMS_TAG);
1058                        xml.print(interact.isOrganismsDiffer()?"true":"false");
1059                        xml.closeTag(COMMENT_ORGANISMS_TAG);
1060                        
1061                        xml.openTag(COMMENT_EXPERIMENTS_TAG);
1062                        xml.print(""+interact.getNumberExperiments());
1063                        xml.closeTag(COMMENT_EXPERIMENTS_TAG);
1064                        
1065                        // if has next, close and open next comment tag
1066                        if (j.hasNext()) {
1067                            xml.closeTag(COMMENT_TAG);
1068                            xml.openTag(COMMENT_TAG);
1069                            xml.attribute(TYPE_ATTR,xtype);
1070                        }
1071                    }
1072                }
1073                // alternative products
1074                else if (type.equalsIgnoreCase(UniProtCommentParser.ALTERNATIVE_PRODUCTS)) {
1075                    for (Iterator j = ucp.getEvents().iterator(); j.hasNext(); ) {
1076                        Event event = (Event)j.next();
1077                        xml.openTag(COMMENT_EVENT_TAG);
1078                        xml.attribute(TYPE_ATTR,event.getType().toLowerCase()); // uniprotxml requires lowercase
1079                        xml.closeTag(COMMENT_EVENT_TAG);
1080                    }
1081                    for (Iterator j = ucp.getIsoforms().iterator(); j.hasNext(); ) {
1082                        Isoform isoform = (Isoform)j.next();
1083                        xml.openTag(COMMENT_ISOFORM_TAG);
1084                        for (Iterator k = isoform.getIsoIDs().iterator(); k.hasNext(); ) {
1085                            xml.openTag(ID_TAG);
1086                            xml.print((String)k.next());
1087                            xml.closeTag(ID_TAG);
1088                        }
1089                        for (Iterator k = isoform.getNames().iterator(); k.hasNext(); ) {
1090                            xml.openTag(NAME_TAG);
1091                            xml.print((String)k.next());
1092                            xml.closeTag(NAME_TAG);
1093                        }
1094                        xml.openTag(SEQUENCE_TAG);
1095                        xml.attribute(TYPE_ATTR,isoform.getSequenceType().toLowerCase());
1096                        if (isoform.getSequenceType().equalsIgnoreCase("Described")) {
1097                            xml.attribute(REF_ATTR,isoform.getSequenceRef());
1098                        }
1099                        xml.closeTag(SEQUENCE_TAG);
1100                        xml.openTag(NOTE_TAG);
1101                        xml.print(isoform.getNote());
1102                        xml.closeTag(NOTE_TAG);
1103                        xml.closeTag(COMMENT_ISOFORM_TAG);
1104                    }
1105                }
1106                // biophysicoblahblah stuff
1107                else if (type.equalsIgnoreCase(UniProtCommentParser.BIOPHYSICOCHEMICAL_PROPERTIES)) {
1108                    if (ucp.getAbsorptionNote()!=null) {
1109                        xml.openTag(COMMENT_ABSORPTION_TAG);
1110                        xml.openTag(COMMENT_ABS_MAX_TAG);
1111                        xml.print(ucp.getAbsorptionMax());
1112                        xml.closeTag(COMMENT_ABS_MAX_TAG);
1113                        xml.openTag(TEXT_TAG);
1114                        xml.print(ucp.getAbsorptionNote());
1115                        xml.closeTag(TEXT_TAG);
1116                        xml.closeTag(COMMENT_ABSORPTION_TAG);
1117                    }
1118                    if (ucp.getKineticsNote()!=null) {
1119                        xml.openTag(COMMENT_KINETICS_TAG);
1120                        for (Iterator j = ucp.getKMs().iterator(); j.hasNext(); ) {
1121                            xml.openTag(COMMENT_KIN_KM_TAG);
1122                            xml.print((String)j.next());
1123                            xml.closeTag(COMMENT_KIN_KM_TAG);
1124                        }
1125                        for (Iterator j = ucp.getVMaxes().iterator(); j.hasNext(); ) {
1126                            xml.openTag(COMMENT_KIN_VMAX_TAG);
1127                            xml.print((String)j.next());
1128                            xml.closeTag(COMMENT_KIN_VMAX_TAG);
1129                        }
1130                        xml.openTag(TEXT_TAG);
1131                        xml.print(ucp.getKineticsNote());
1132                        xml.closeTag(TEXT_TAG);
1133                        xml.closeTag(COMMENT_KINETICS_TAG);
1134                    }
1135                    if (ucp.getPHDependence()!=null) {
1136                        xml.openTag(COMMENT_PH_TAG);
1137                        xml.print(ucp.getPHDependence());
1138                        xml.closeTag(COMMENT_PH_TAG);
1139                    }
1140                    if (ucp.getRedoxPotential()!=null) {
1141                        xml.openTag(COMMENT_REDOX_TAG);
1142                        xml.print(ucp.getRedoxPotential());
1143                        xml.closeTag(COMMENT_REDOX_TAG);
1144                    }
1145                    if (ucp.getTemperatureDependence()!=null) {
1146                        xml.openTag(COMMENT_TEMPERATURE_TAG);
1147                        xml.print(ucp.getTemperatureDependence());
1148                        xml.closeTag(COMMENT_TEMPERATURE_TAG);
1149                    }
1150                }
1151                // all other comments
1152                else {
1153                    xml.openTag(TEXT_TAG);
1154                    xml.print(ucp.getText());
1155                    xml.closeTag(TEXT_TAG);
1156                }
1157                
1158                // finish comment up
1159                if (ucp.getNote()!=null) {
1160                    xml.openTag(NOTE_TAG);
1161                    xml.print(ucp.getNote());
1162                    xml.closeTag(NOTE_TAG);
1163                }
1164                
1165                xml.closeTag(COMMENT_TAG);
1166            }
1167        }
1168        
1169        // xrefs
1170        for (Iterator<RankedCrossRef> i = rs.getRankedCrossRefs().iterator(); i.hasNext(); ) {
1171            RankedCrossRef rcr = i.next();
1172            CrossRef cr = rcr.getCrossRef();
1173            
1174            xml.openTag(DBXREF_TAG);
1175            String dbname = cr.getDbname();
1176            xml.attribute(TYPE_ATTR,dbname);
1177            xml.attribute(ID_ATTR,cr.getAccession());
1178            xml.attribute(KEY_ATTR,""+(key++));
1179            if (!cr.getNoteSet().isEmpty()) {
1180                int acccount = 2;
1181                for (Iterator<Note> j = cr.getNoteSet().iterator(); j.hasNext(); ) {
1182                    Note n = j.next();
1183                    if (n.getTerm().equals(Terms.getAdditionalAccessionTerm()) && !n.getValue().equals("-")) {
1184                        xml.openTag(PROPERTY_TAG);
1185                        String name = n.getTerm().getName();
1186                        if (acccount==2) {
1187                            // SECONDARY IDENTIFIER
1188                            if (dbname.equalsIgnoreCase("HIV") ||
1189                                    dbname.equalsIgnoreCase("INTERPRO") ||
1190                                    dbname.equalsIgnoreCase("PANTHER") ||
1191                                    dbname.equalsIgnoreCase("PFAM") ||
1192                                    dbname.equalsIgnoreCase("PIR") ||
1193                                    dbname.equalsIgnoreCase("PRINTS") ||
1194                                    dbname.equalsIgnoreCase("PRODOM") ||
1195                                    dbname.equalsIgnoreCase("REBASE") ||
1196                                    dbname.equalsIgnoreCase("SMART") ||
1197                                    dbname.equalsIgnoreCase("TIGRFAMS")) {
1198                                // the secondary identifier is the entry name.
1199                                name = "entry name";
1200                            } else if (dbname.equalsIgnoreCase("PDB")) {
1201                                // the secondary identifier is the structure determination method, which is controlled vocabulary that currently includes: X-ray(for X-ray crystallography), NMR(for NMR spectroscopy), EM(for electron microscopy and cryo-electron diffraction), Fiber(for fiber diffraction), IR(for infrared spectroscopy), Model(for predicted models) and Neutron(for neutron diffraction).
1202                                name = "structure determination method";
1203                            } else if (dbname.equalsIgnoreCase("DICTYBASE") ||
1204                                    dbname.equalsIgnoreCase("ECOGENE") ||
1205                                    dbname.equalsIgnoreCase("FLYBASE") ||
1206                                    dbname.equalsIgnoreCase("HGNC") ||
1207                                    dbname.equalsIgnoreCase("MGI") ||
1208                                    dbname.equalsIgnoreCase("RGD") ||
1209                                    dbname.equalsIgnoreCase("SGD") ||
1210                                    dbname.equalsIgnoreCase("STYGENE") ||
1211                                    dbname.equalsIgnoreCase("SUBTILIST") ||
1212                                    dbname.equalsIgnoreCase("WORMBASE") ||
1213                                    dbname.equalsIgnoreCase("ZFIN")) {
1214                                // the secondary identifier is the gene designation. If the gene designation is not available, a dash('-') is used.
1215                                name = "gene designation";
1216                            } else if (dbname.equalsIgnoreCase("GO")) {
1217                                // the second identifier is a 1-letter abbreviation for one of the 3 ontology aspects, separated from the GO term by a column. If the term is longer than 46 characters, the first 43 characters are indicated followed by 3 dots('...'). The abbreviations for the 3 distinct aspects of the ontology are P(biological Process), F(molecular Function), and C(cellular Component).
1218                                name = "term";
1219                            } else if (dbname.equalsIgnoreCase("HAMAP")) {
1220                                // the secondary identifier indicates if a domain is 'atypical' and/or 'fused', otherwise the field is empty('-').
1221                                name = "domain";
1222                            } else if (dbname.equalsIgnoreCase("ECO2DBASE")) {
1223                                // the secondary identifier is the latest release number or edition of the database that has been used to derive the cross-reference.
1224                                name = "release number";
1225                            } else if (dbname.equalsIgnoreCase("SWISS-2DPAGE") ||
1226                                    dbname.equalsIgnoreCase("HSC-2DPAGE")) {
1227                                // the secondary identifier is the species or tissue of origin.
1228                                name = "organism name";
1229                            } else if (dbname.equalsIgnoreCase("ENSEMBL")) {
1230                                // the secondary identifier is the species of origin.
1231                                name = "organism name";
1232                            } else if (dbname.equalsIgnoreCase("PIRSF")) {
1233                                // the secondary identifier is the protein family name.
1234                                name = "protein family name";
1235                            } else if (dbname.equalsIgnoreCase("AARHUS") ||
1236                                    dbname.equalsIgnoreCase("GHENT-2DPAGE")) {
1237                                // the secondary identifier is either 'IEF' (for isoelectric focusing) or 'NEPHGE' (for non-equilibrium pH gradient electrophoresis).
1238                                name = "secondary identifier";
1239                            } else if (dbname.equalsIgnoreCase("WORMPEP")) {
1240                                // the secondary identifier is a number attributed by the C.elegans genome-sequencing project to that protein.
1241                                name = "C.elegans number";
1242                            } else if (dbname.equalsIgnoreCase("AGD") ||
1243                                    dbname.equalsIgnoreCase("ANU-2DPAGE") ||
1244                                    dbname.equalsIgnoreCase("COMPLUYEAST-2DPAGE") ||
1245                                    dbname.equalsIgnoreCase("ECHOBASE") ||
1246                                    dbname.equalsIgnoreCase("GENEDB_SPOMBE") ||
1247                                    dbname.equalsIgnoreCase("GERMONLINE") ||
1248                                    dbname.equalsIgnoreCase("GLYCOSUITEDB") ||
1249                                    dbname.equalsIgnoreCase("GRAMENE") ||
1250                                    dbname.equalsIgnoreCase("H-INVDB") ||
1251                                    dbname.equalsIgnoreCase("INTACT") ||
1252                                    dbname.equalsIgnoreCase("LEGIOLIST") ||
1253                                    dbname.equalsIgnoreCase("LEPROMA") ||
1254                                    dbname.equalsIgnoreCase("LISTILIST") ||
1255                                    dbname.equalsIgnoreCase("MAIZEDB") ||
1256                                    dbname.equalsIgnoreCase("MEROPS") ||
1257                                    dbname.equalsIgnoreCase("MIM") ||
1258                                    dbname.equalsIgnoreCase("MYPULIST") ||
1259                                    dbname.equalsIgnoreCase("OGP") ||
1260                                    dbname.equalsIgnoreCase("PHCI-2DPAGE") ||
1261                                    dbname.equalsIgnoreCase("PHOSSITE") ||
1262                                    dbname.equalsIgnoreCase("PHOTOLIST") ||
1263                                    dbname.equalsIgnoreCase("PMMA-2DPAGE") ||
1264                                    dbname.equalsIgnoreCase("RAT-HEART-2DPAGE") ||
1265                                    dbname.equalsIgnoreCase("REACTOME") ||
1266                                    dbname.equalsIgnoreCase("SAGALIST") ||
1267                                    dbname.equalsIgnoreCase("SIENA-2DPAGE") ||
1268                                    dbname.equalsIgnoreCase("TAIR") ||
1269                                    dbname.equalsIgnoreCase("TIGR") ||
1270                                    dbname.equalsIgnoreCase("TRANSFAC") ||
1271                                    dbname.equalsIgnoreCase("TUBERCULIST")) {
1272                                // the secondary identifier is not used and a dash('-') is stored in that field.
1273                                // should never get here - I hope!
1274                            } else if (dbname.equalsIgnoreCase("HSSP")) {
1275                                // the secondary identifier is the entry name of the PDB structure related to that of the entry in which the HSSP cross-reference is present.
1276                                name = "entry name";
1277                            } else if (dbname.equalsIgnoreCase("GENEFARM")) {
1278                                // the secondary identifier is the gene family identifier. If the gene family identifier is not available, a dash('-') is used.
1279                                name = "gene family";
1280                            } else if (dbname.equalsIgnoreCase("SMR")) {
1281                                // the secondary identifier indicates the range(s) relevant to the structure model(s).
1282                                name = "range";
1283                            } else if (dbname.equalsIgnoreCase("EMBL") ||
1284                                    dbname.equalsIgnoreCase("DDBJ") ||
1285                                    dbname.equalsIgnoreCase("GENBANK")) {
1286                                // PROTEIN_ID; STATUS_IDENTIFIER; MOLECULE_TYPE
1287                                name = "protein id";
1288                            } else if (dbname.equalsIgnoreCase("PROSITE")) {
1289                                // ENTRY_NAME; STATUS.
1290                                name = "entry name";
1291                            }
1292                        } else if (acccount==3) {
1293                            // TERTIARY IDENTIFIER
1294                            if (dbname.equalsIgnoreCase("HAMAP") ||
1295                                    dbname.equalsIgnoreCase("PANTHER") ||
1296                                    dbname.equalsIgnoreCase("PFAM") ||
1297                                    dbname.equalsIgnoreCase("PIRSF") ||
1298                                    dbname.equalsIgnoreCase("PRODOM") ||
1299                                    dbname.equalsIgnoreCase("SMART") ||
1300                                    dbname.equalsIgnoreCase("TIGRFAMS")) {
1301                                // the tertiary identifier is the number of hits found in the sequence.
1302                                name = "number of hits";
1303                            } else if (dbname.equalsIgnoreCase("GO")) {
1304                                // the tertiary identifier is a 3-character GO evidence code. The meaning of the evidence codes is: IDA=inferred from direct assay, IMP=inferred from mutant phenotype, IGI=inferred from genetic interaction, IPI=inferred from physical interaction, IEP=inferred from expression pattern, TAS=traceable author statement, NAS=non-traceable author statement, IC=inferred by curator, ISS=inferred from sequence or structural similarity.
1305                                name = "evidence";
1306                            } else if (dbname.equalsIgnoreCase("PDB")) {
1307                                // the tertiary identifier indicates the chain(s) and the corresponding range, of which the structure has been determined. If the range is unknown, a dash is given rather than the range positions(e.g. 'A/B=-.'), if the chains and the range is unknown, a dash is used.
1308                                name = "chains";
1309                            } else if (dbname.equalsIgnoreCase("EMBL") ||
1310                                    dbname.equalsIgnoreCase("DDBJ") ||
1311                                    dbname.equalsIgnoreCase("GENBANK")) {
1312                                // PROTEIN_ID; STATUS_IDENTIFIER; MOLECULE_TYPE
1313                                name = "status identifier";
1314                            } else if (dbname.equalsIgnoreCase("PROSITE")) {
1315                                // ENTRY_NAME; STATUS.
1316                                name = "status";
1317                            }
1318                        } else {
1319                            // QUATERNARY AND ADDITIONAL
1320                            if (dbname.equalsIgnoreCase("EMBL") ||
1321                                    dbname.equalsIgnoreCase("DDBJ") ||
1322                                    dbname.equalsIgnoreCase("GENBANK")) {
1323                                // PROTEIN_ID; STATUS_IDENTIFIER; MOLECULE_TYPE
1324                                name = "molecule type";
1325                            }
1326                        }
1327                        xml.attribute(TYPE_ATTR,name);
1328                        xml.attribute(VALUE_ATTR,n.getValue());
1329                        xml.closeTag(PROPERTY_TAG);
1330                        acccount++;
1331                    }
1332                }
1333            }
1334            xml.closeTag(DBXREF_TAG);
1335        }
1336        
1337        // protein exists
1338        xml.openTag(PROTEIN_EXISTS_TAG);
1339        xml.attribute(TYPE_ATTR,proteinExists);
1340        xml.closeTag(PROTEIN_EXISTS_TAG);
1341        
1342        // keywords
1343        for (Iterator j = kws.iterator(); j.hasNext(); ) {
1344            ComparableTerm t = (ComparableTerm)j.next();
1345            xml.openTag(KEYWORD_TAG);
1346            xml.attribute(ID_ATTR,t.getIdentifier());
1347            xml.print(t.getName());
1348            xml.closeTag(KEYWORD_TAG);
1349        }
1350        
1351        // features
1352        for (Iterator i = rs.getFeatureSet().iterator(); i.hasNext(); ) {
1353            RichFeature f = (RichFeature)i.next();
1354            String descr = null;
1355            String ftid = null;
1356            String ref = null;
1357            String status = null;
1358            String original = null;
1359            String locseq = null;
1360            List variation = new ArrayList();
1361            for (Iterator<Note> j = f.getNoteSet().iterator(); j.hasNext(); ) {
1362                Note n = j.next();
1363                if (n.getTerm().equals(Terms.getFTIdTerm())) ftid = n.getValue();
1364                else if (n.getTerm().equals(Terms.getFeatureDescTerm())) descr = n.getValue();
1365                else if (n.getTerm().equals(Terms.getFeatureStatusTerm())) status = n.getValue();
1366                else if (n.getTerm().equals(Terms.getFeatureRefTerm())) ref = n.getValue();
1367                else if (n.getTerm().equals(Terms.getFeatureOriginalTerm())) original = n.getValue();
1368                else if (n.getTerm().equals(Terms.getFeatureVariationTerm())) variation.add(n.getValue());
1369                else if (n.getTerm().equals(Terms.getLocationSequenceTerm())) locseq = n.getValue();
1370            }
1371            
1372            xml.openTag(FEATURE_TAG);
1373            
1374            xml.attribute(TYPE_ATTR,f.getTypeTerm().getName()); // TODO : need to translate from UniProt flatfile format names?
1375            if (ftid!=null) xml.attribute(ID_ATTR,ftid);
1376            if (descr!=null) xml.attribute(FEATURE_DESC_ATTR,descr);
1377            if (ref!=null) xml.attribute(REF_ATTR,ref);
1378            if (status!=null) xml.attribute(STATUS_ATTR,status);
1379            if (original!=null) {
1380                xml.openTag(FEATURE_ORIGINAL_TAG);
1381                xml.print(original.trim());
1382                xml.closeTag(FEATURE_ORIGINAL_TAG);
1383            }
1384            for (Iterator j = variation.iterator(); j.hasNext(); ) {
1385                xml.openTag(FEATURE_VARIATION_TAG);
1386                xml.print(((String)j.next()).trim());
1387                xml.closeTag(FEATURE_VARIATION_TAG);
1388            }
1389            
1390            xml.openTag(LOCATION_TAG);
1391            if (locseq!=null) xml.attribute(LOCATION_SEQ_ATTR,locseq.trim());
1392            RichLocation rl = (RichLocation)f.getLocation();
1393            if (rl.getMinPosition().equals(rl.getMaxPosition())) {
1394                // point position
1395                xml.openTag(LOCATION_POSITION_TAG);
1396                if (rl.getMinPosition().getFuzzyStart() || rl.getMaxPosition().getFuzzyStart()) xml.attribute(STATUS_ATTR,"less than");
1397                else if (rl.getMinPosition().getFuzzyEnd() || rl.getMaxPosition().getFuzzyEnd()) xml.attribute(STATUS_ATTR,"greater than");
1398                xml.attribute(LOCATION_POSITION_ATTR,""+rl.getMin());
1399                xml.closeTag(LOCATION_POSITION_TAG);
1400            } else {
1401                // range position
1402                // begin
1403                xml.openTag(LOCATION_BEGIN_TAG);
1404                Position begin = rl.getMinPosition();
1405                if (begin.getFuzzyStart()) xml.attribute(STATUS_ATTR,"less than");
1406                else if (begin.getFuzzyEnd()) xml.attribute(STATUS_ATTR,"greater than");
1407                xml.attribute(LOCATION_POSITION_ATTR,""+begin.getStart());
1408                xml.closeTag(LOCATION_BEGIN_TAG);
1409                // end
1410                xml.openTag(LOCATION_END_TAG);
1411                Position end = rl.getMaxPosition();
1412                if (end.getFuzzyStart()) xml.attribute(STATUS_ATTR,"less than");
1413                else if (end.getFuzzyEnd()) xml.attribute(STATUS_ATTR,"greater than");
1414                xml.attribute(LOCATION_POSITION_ATTR,""+end.getEnd());
1415                xml.closeTag(LOCATION_END_TAG);
1416            }
1417            xml.closeTag(LOCATION_TAG);
1418            
1419            xml.closeTag(FEATURE_TAG);
1420        }
1421        
1422        // evidence
1423        for (Iterator i = evidenceIDs.iterator(); i.hasNext(); ) {
1424            Integer evidenceID = (Integer)i.next();
1425            String cat = (String)evcats.get(evidenceID);
1426            String type = (String)evtypes.get(evidenceID);
1427            String date = (String)evdates.get(evidenceID);
1428            String attr = (String)evattrs.get(evidenceID);
1429            
1430            xml.openTag(EVIDENCE_TAG);
1431            xml.attribute(KEY_ATTR,""+(key++));
1432            xml.attribute(EVIDENCE_CATEGORY_ATTR,cat);
1433            xml.attribute(EVIDENCE_DATE_ATTR,date);
1434            xml.attribute(TYPE_ATTR,type);
1435            if (attr!=null) xml.attribute(EVIDENCE_ATTRIBUTE_ATTR,attr);
1436            xml.closeTag(EVIDENCE_TAG);
1437        }
1438        
1439        // sequence
1440        int mw = 0;
1441        try {
1442            mw = (int)MassCalc.getMolecularWeight(rs);
1443        } catch (IllegalSymbolException e) {
1444            throw new RuntimeException("Found illegal symbol", e);
1445        }
1446        CRC64Checksum crc = new CRC64Checksum();
1447        String seqstr = rs.seqString();
1448        crc.update(seqstr.getBytes(),0,seqstr.length());
1449        xml.openTag(SEQUENCE_TAG);
1450        xml.attribute(SEQUENCE_VERSION_ATTR,""+rs.getVersion());
1451        xml.attribute(SEQUENCE_LENGTH_ATTR,""+rs.length());
1452        xml.attribute(SEQUENCE_MASS_ATTR,""+mw);
1453        xml.attribute(SEQUENCE_CHECKSUM_ATTR,""+crc);
1454        xml.attribute(SEQUENCE_MODIFIED_ATTR,(udat==null?cdat:udat)); // sequence update
1455        String[] lines = StringTools.wordWrap(rs.seqString(), "\\s+", this.getLineWidth());
1456        for (int i = 0; i < lines.length; i ++) xml.println(lines[i]);
1457        xml.closeTag(SEQUENCE_TAG);
1458        
1459        // close entry
1460        xml.closeTag(ENTRY_TAG);
1461        
1462        // copyright (if present)
1463        if (copyright!=null) {
1464            xml.openTag(COPYRIGHT_TAG);
1465            xml.println(copyright);
1466            xml.closeTag(COPYRIGHT_TAG);
1467        }
1468        
1469        pw.flush();
1470    }
1471    
1472    /**
1473     * {@inheritDoc}
1474     */
1475    public String getDefaultFormat() {
1476        return UNIPROTXML_FORMAT;
1477    }
1478    
1479    // SAX event handler for parsing http://www.ebi.uniprot.org/support/docs/uniprot.xsd
1480    private class UniProtXMLHandler extends DefaultHandler {
1481        
1482        private RichSequenceFormat parent;
1483        private SymbolTokenization symParser;
1484        private RichSeqIOListener rlistener;
1485        private Namespace ns;
1486        private StringBuffer m_currentString;
1487        
1488        private NCBITaxon tax;
1489        private RichFeature.Template templ;
1490        private StringBuffer proteinDesc;
1491        private boolean firstNameInProteinGroup;
1492        private boolean firstDomainInProteinGroup;
1493        private boolean firstComponentInProteinGroup;
1494        private int currGene;
1495        private String geneNameClass;
1496        private String organismNameClass;
1497        private Map currNames = new TreeMap();
1498        private StringBuffer organelleDesc;
1499        private List currDBXrefs = new ArrayList();
1500        private List currComments = new ArrayList();
1501        private String currRefLocation;
1502        private List currRefAuthors;
1503        private String currRefTitle;
1504        private int currRefStart;
1505        private int currRefEnd;
1506        private int currRefRank;
1507        private String currPersonIs;
1508        private int currRCID;
1509        private int currEvID;
1510        private String currKWID;
1511        private UniProtCommentParser currUCParser;
1512        private Interaction currUCParserInteract;
1513        private Event currUCParserEvent;
1514        private Isoform currUCParserIsoform;
1515        private String currLocIsFor;
1516        private String currTextIsFor;
1517        private String currNoteIsFor;
1518        private String currSeqIsFor;
1519        private String currIDIsFor;
1520        private String currNameIsFor;
1521        private int interactantCount;
1522        private StringBuffer currLocStr;
1523        private int featNoteRank;
1524        
1525        // construct a new handler that will populate the given list of sequences
1526        private UniProtXMLHandler(RichSequenceFormat parent,
1527                SymbolTokenization symParser,
1528                RichSeqIOListener rlistener,
1529                Namespace ns) {
1530            this.parent = parent;
1531            this.symParser = symParser;
1532            this.rlistener = rlistener;
1533            this.ns = ns;
1534            this.m_currentString = new StringBuffer();
1535        }
1536        
1537        
1538        // process an opening tag
1539        @Override
1540        public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
1541            
1542            if (qName.equals(ENTRY_TAG)) {
1543                try {
1544                    for (int i = 0; i < attributes.getLength(); i++) {
1545                        String name = attributes.getQName(i);
1546                        String val = attributes.getValue(i);
1547                        if (name.equals(ENTRY_NAMESPACE_ATTR) && this.ns==null) ns=(Namespace)RichObjectFactory.getObject(SimpleNamespace.class,new Object[]{val});
1548                        else if (name.equals(ENTRY_VERSION_ATTR)) rlistener.addSequenceProperty(Terms.getRelAnnotatedTerm(), val);
1549                        else if (name.equals(ENTRY_CREATED_ATTR)) rlistener.addSequenceProperty(Terms.getDateCreatedTerm(), val);
1550                        else if (name.equals(ENTRY_UPDATED_ATTR)) rlistener.addSequenceProperty(Terms.getDateAnnotatedTerm(), val);
1551                    }
1552                    if (this.ns==null) ns=RichObjectFactory.getDefaultNamespace();
1553                    rlistener.setNamespace(ns);
1554                } catch (ParseException e) {
1555                    throw new SAXException(e);
1556                }
1557                this.currNameIsFor = "ENTRY";
1558                this.currSeqIsFor = "ENTRY";
1559                this.currGene = 0;
1560                this.currNames.clear();
1561                this.currRefRank = 0;
1562                this.currRCID = 0;
1563                this.currEvID = 0;
1564            }
1565            
1566            else if (qName.equals(PROTEIN_TAG)) {
1567                for (int i = 0; i < attributes.getLength(); i++) {
1568                    String name = attributes.getQName(i).trim();
1569                    String val = attributes.getValue(i).trim();
1570                    try {
1571                        if (name.equals(TYPE_ATTR)) rlistener.addSequenceProperty(Terms.getProteinTypeTerm(),val);
1572                    } catch (ParseException e) {
1573                        throw new SAXException(e);
1574                    }
1575                }
1576                this.proteinDesc = new StringBuffer();
1577                this.currNameIsFor = "PROTEIN";
1578                this.firstNameInProteinGroup = true;
1579                this.firstDomainInProteinGroup = true;
1580                this.firstComponentInProteinGroup = true;
1581            } else if (qName.equals(DOMAIN_TAG)) {
1582                if (!this.firstComponentInProteinGroup) proteinDesc.append("]");
1583                if (this.firstDomainInProteinGroup) proteinDesc.append(" ["+Terms.CONTAINS_PREFIX);
1584                else proteinDesc.append(";");
1585                this.firstDomainInProteinGroup = false;
1586                this.firstNameInProteinGroup = true;
1587            } else if (qName.equals(COMPONENT_TAG)) {
1588                if (!this.firstDomainInProteinGroup) proteinDesc.append("]");
1589                if (this.firstComponentInProteinGroup) proteinDesc.append(" ["+Terms.INCLUDES_PREFIX);
1590                else proteinDesc.append(";");
1591                this.firstComponentInProteinGroup = false;
1592                this.firstNameInProteinGroup = true;
1593            }
1594            
1595            else if (qName.equals(GENE_TAG)) {
1596                this.currGene++;
1597                this.currNameIsFor="GENE";
1598            }
1599            
1600            else if (qName.equals(NAME_TAG)) {
1601                if (this.currNameIsFor.equals("GENE")) {
1602                    for (int i = 0; i < attributes.getLength(); i++) {
1603                        String name = attributes.getQName(i);
1604                        String val = attributes.getValue(i);
1605                        if (name.equals(TYPE_ATTR)) this.geneNameClass=val;
1606                    }
1607                }
1608                
1609                else if (this.currNameIsFor.equals("ORGANISM")) {
1610                    for (int i = 0; i < attributes.getLength(); i++) {
1611                        String name = attributes.getQName(i);
1612                        String val = attributes.getValue(i);
1613                        if (name.equals(TYPE_ATTR)) this.organismNameClass=val;
1614                    }
1615                }
1616            }
1617            
1618            else if (qName.equals(ORGANISM_TAG)) {
1619                this.currNameIsFor="ORGANISM";
1620            }
1621            
1622            else if (qName.equals(DBXREF_TAG)) {
1623                if (this.currNameIsFor.equals("ORGANISM")) {
1624                    Integer taxID = null;
1625                    for (int i = 0; i < attributes.getLength(); i++) {
1626                        String name = attributes.getQName(i);
1627                        String val = attributes.getValue(i);
1628                        if (name.equals(ID_ATTR)) taxID = Integer.valueOf(val);
1629                    }
1630                    try {
1631                        tax = (NCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class, new Object[]{taxID});
1632                        rlistener.setTaxon(tax);
1633                        for (Iterator j = currNames.keySet().iterator(); j.hasNext(); ) {
1634                            String nameClass = (String)j.next();
1635                            Set nameSet = (Set)this.currNames.get(nameClass);
1636                            try {
1637                                for (Iterator k = nameSet.iterator(); k.hasNext(); ) {
1638                                    String name = (String)k.next();
1639                                    tax.addName(nameClass,name);
1640                                }
1641                            } catch (ChangeVetoException ce) {
1642                                throw new ParseException(ce);
1643                            }
1644                        }
1645                    } catch (ParseException e) {
1646                        throw new SAXException(e);
1647                    }
1648                    this.currNames.clear();
1649                }
1650                
1651                else {
1652                    String type = null;
1653                    String id = null;
1654                    for (int i = 0; i < attributes.getLength(); i++) {
1655                        String name = attributes.getQName(i);
1656                        String val = attributes.getValue(i);
1657                        if (name.equals(ID_ATTR)) id = val;
1658                        else if (name.equals(TYPE_ATTR)) type = val;
1659                    }
1660                    CrossRef dbx = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{type, id, new Integer(0)});
1661                    this.currDBXrefs.add(dbx);
1662                }
1663            } else if (qName.equals(PROPERTY_TAG)) {
1664                String id = null;
1665                for (int i = 0; i < attributes.getLength(); i++) {
1666                    String name = attributes.getQName(i);
1667                    String val = attributes.getValue(i);
1668                    if (name.equals(VALUE_ATTR)) id = val;
1669                }
1670                Note note = new SimpleNote(Terms.getAdditionalAccessionTerm(),id,1);
1671                try {
1672                    int last = this.currDBXrefs.size();
1673                    ((CrossRef)this.currDBXrefs.get(last-1)).getRichAnnotation().addNote(note);
1674                } catch (ChangeVetoException ce) {
1675                    SAXException pe = new SAXException("Could not annotate identifier terms");
1676                    pe.initCause(ce);
1677                    throw pe;
1678                }
1679            }
1680            
1681            else if (qName.equals(GENELOCATION_TAG)) {
1682                this.currNameIsFor = "ORGANELLE";
1683                this.organelleDesc = new StringBuffer();
1684                for (int i = 0; i < attributes.getLength(); i++) {
1685                    String name = attributes.getQName(i);
1686                    String val = attributes.getValue(i);
1687                    if (name.equals(TYPE_ATTR)) {
1688                        val = val.toUpperCase().charAt(0)+val.substring(1); // init caps for flat format
1689                        if (!val.equals("Plasmid")) this.organelleDesc.append(val);
1690                    }
1691                }
1692            }
1693            
1694            else if (qName.equals(REFERENCE_TAG) && !this.parent.getElideReferences()) {
1695                this.currRefLocation = null;
1696                this.currRefAuthors = new ArrayList();
1697                this.currRefTitle = null;
1698                this.currDBXrefs.clear();
1699                this.currComments.clear();
1700                this.currRefRank++;
1701                this.currRefStart = -999;
1702                this.currRefEnd = -999;
1703            } else if (qName.equals(CITATION_TAG) && !this.parent.getElideReferences()) {
1704                StringBuffer currRef = new StringBuffer();
1705                for (int i = 0; i < attributes.getLength(); i++) {
1706                    String name = attributes.getQName(i);
1707                    String val = attributes.getValue(i);
1708                    // combine everything except type into a fake reference to use if locator is a no-show
1709                    if (!name.equals(TYPE_ATTR)) {
1710                        if (currRef.length()>0) currRef.append(" ");
1711                        currRef.append(val);
1712                    }
1713                }
1714                this.currRefLocation = currRef.toString();
1715            } else if (qName.equals(EDITOR_LIST_TAG)) {
1716                this.currPersonIs = "EDITOR";
1717            } else if (qName.equals(AUTHOR_LIST_TAG)) {
1718                this.currPersonIs = "AUTHOR";
1719            }  else if (qName.equals(PERSON_TAG)) {
1720                for (int i = 0; i < attributes.getLength(); i++) {
1721                    String name = attributes.getQName(i);
1722                    String val = attributes.getValue(i);
1723                    if (name.equals(NAME_ATTR)) {
1724                        if (this.currPersonIs.equals("AUTHOR")) currRefAuthors.add(new SimpleDocRefAuthor(val, false, false));
1725                        else if (this.currPersonIs.equals("EDITOR")) currRefAuthors.add(new SimpleDocRefAuthor(val, false, true));
1726                    }
1727                }
1728            } else if (qName.equals(CONSORTIUM_TAG)) {
1729                for (int i = 0; i < attributes.getLength(); i++) {
1730                    String name = attributes.getQName(i);
1731                    String val = attributes.getValue(i);
1732                    if (name.equals(NAME_ATTR)) {
1733                        if (this.currPersonIs.equals("AUTHOR")) currRefAuthors.add(new SimpleDocRefAuthor(val, true, false));
1734                        else if (this.currPersonIs.equals("EDITOR")) currRefAuthors.add(new SimpleDocRefAuthor(val, true, true));
1735                    }
1736                }
1737            }  else if (qName.equals(RC_LINE_TAG)) {
1738                this.currRCID++;
1739            }
1740            
1741            else if (qName.equals(PROTEIN_EXISTS_TAG)) {
1742                try {
1743                    for (int i = 0; i < attributes.getLength(); i++) {
1744                        String name = attributes.getQName(i);
1745                        String val = attributes.getValue(i);
1746                        if (name.equals(TYPE_ATTR)) rlistener.addSequenceProperty(Terms.getProteinExistsTerm(),val);
1747                    }
1748                } catch (ParseException e) {
1749                    SAXException pe = new SAXException("Could not annotate protein exists terms");
1750                    pe.initCause(e);
1751                    throw pe;
1752                }
1753            }
1754            
1755            else if (qName.equals(KEYWORD_TAG)) {
1756                for (int i = 0; i < attributes.getLength(); i++) {
1757                    String name = attributes.getQName(i);
1758                    String val = attributes.getValue(i);
1759                    if (name.equals(ID_ATTR)) this.currKWID = val;
1760                }
1761            }
1762            
1763            else if (qName.equals(EVIDENCE_TAG)) {
1764                this.currEvID++;
1765                try {
1766                    for (int i = 0; i < attributes.getLength(); i++) {
1767                        String name = attributes.getQName(i);
1768                        String val = attributes.getValue(i);
1769                        if (name.equals(EVIDENCE_CATEGORY_ATTR)) rlistener.addSequenceProperty(Terms.getEvidenceCategoryTerm(),val);
1770                        else if (name.equals(EVIDENCE_DATE_ATTR)) rlistener.addSequenceProperty(Terms.getEvidenceDateTerm(),val);
1771                        else if (name.equals(TYPE_ATTR)) rlistener.addSequenceProperty(Terms.getEvidenceTypeTerm(),val);
1772                        else if (name.equals(EVIDENCE_ATTRIBUTE_ATTR)) rlistener.addSequenceProperty(Terms.getEvidenceAttrTerm(),val);
1773                    }
1774                } catch (ParseException e) {
1775                    SAXException pe = new SAXException("Could not annotate evidence terms");
1776                    pe.initCause(e);
1777                    throw pe;
1778                }
1779            }
1780            
1781            else if (qName.equals(LOCATION_TAG)) {
1782                this.currLocStr = new StringBuffer();
1783                if (this.currLocIsFor.equals("FEATURE")) {
1784                    try {
1785                        for (int i = 0; i < attributes.getLength(); i++) {
1786                            String name = attributes.getQName(i);
1787                            String val = attributes.getValue(i);
1788                            if (name.equals(LOCATION_SEQ_ATTR)) {
1789                                Note note = new SimpleNote(Terms.getLocationSequenceTerm(), val, this.featNoteRank++);
1790                                ((RichAnnotation)templ.annotation).addNote(note);
1791                            }
1792                        }
1793                    } catch (ChangeVetoException e) {
1794                        SAXException pe = new SAXException("Could not create location terms");
1795                        pe.initCause(e);
1796                        throw pe;
1797                    }
1798                }
1799            } else if (qName.equals(LOCATION_BEGIN_TAG) || qName.equals(LOCATION_END_TAG) || qName.equals(LOCATION_POSITION_TAG)) {
1800                StringBuffer pos = new StringBuffer();
1801                pos.append(" "); // space between start and end
1802                for (int i = 0; i < attributes.getLength(); i++) {
1803                    String name = attributes.getQName(i);
1804                    String val = attributes.getValue(i);
1805                    if (name.equals(STATUS_ATTR)) {
1806                        if (val.equals("less than")) pos.append("<");
1807                        else if (val.equals("greater than")) pos.append(">");
1808                    } else if (name.equals(LOCATION_POSITION_ATTR)) {
1809                        pos.append(val);
1810                    }
1811                }
1812                this.currLocStr.append(pos.toString());
1813                if (qName.equals(LOCATION_POSITION_TAG)) currLocStr.append(pos.toString()); // fake it as begin=end
1814            }
1815            
1816            else if (qName.equals(FEATURE_TAG) && !this.parent.getElideFeatures()) {
1817                this.featNoteRank = 1;
1818                templ = new RichFeature.Template();
1819                templ.annotation = new SimpleRichAnnotation();
1820                templ.sourceTerm = Terms.getUniProtXMLTerm();
1821                templ.featureRelationshipSet = new TreeSet();
1822                templ.rankedCrossRefs = new TreeSet();
1823                try {
1824                    for (int i = 0; i < attributes.getLength(); i++) {
1825                        String name = attributes.getQName(i);
1826                        String val = attributes.getValue(i);
1827                        if (name.equals(TYPE_ATTR)) {
1828                            templ.typeTerm = RichObjectFactory.getDefaultOntology().getOrCreateTerm(val);
1829                        } else if (name.equals(ID_ATTR)) {
1830                            Note note = new SimpleNote(Terms.getFTIdTerm(), val, this.featNoteRank++);
1831                            ((RichAnnotation)templ.annotation).addNote(note);
1832                        } else if (name.equals(FEATURE_DESC_ATTR)) {
1833                            Note note = new SimpleNote(Terms.getFeatureDescTerm(), val, this.featNoteRank++);
1834                            ((RichAnnotation)templ.annotation).addNote(note);
1835                        } else if (name.equals(STATUS_ATTR)) {
1836                            Note note = new SimpleNote(Terms.getFeatureStatusTerm(), val, this.featNoteRank++);
1837                            ((RichAnnotation)templ.annotation).addNote(note);
1838                        } else if (name.equals(REF_ATTR)) {
1839                            Note note = new SimpleNote(Terms.getFeatureRefTerm(), val, this.featNoteRank++);
1840                            ((RichAnnotation)templ.annotation).addNote(note);
1841                        }
1842                    }
1843                } catch (ChangeVetoException e) {
1844                    SAXException pe = new SAXException("Could not create location terms");
1845                    pe.initCause(e);
1846                    throw pe;
1847                }
1848                this.currLocStr = new StringBuffer();
1849                this.currLocIsFor = "FEATURE";
1850            }
1851            
1852            else if (qName.equals(COMMENT_TAG)) {
1853                this.currUCParser = new UniProtCommentParser();
1854                this.currUCParser.setInteractions(new ArrayList());
1855                this.currUCParser.setEvents(new ArrayList());
1856                this.currUCParser.setIsoforms(new ArrayList());
1857                this.currUCParser.setKMs(new ArrayList());
1858                this.currUCParser.setVMaxes(new ArrayList());
1859                for (int i = 0; i < attributes.getLength(); i++) {
1860                    String name = attributes.getQName(i).trim();
1861                    String val = attributes.getValue(i).trim();
1862                    if (name.equals(TYPE_ATTR)) {
1863                        String type = val.toUpperCase(); // easier to check this way, plus flat uniprot requires it
1864                        if (type.equals("POSTTRANSLATIONAL MODIFICATION")) type="PTM";
1865                        else if (type.equals("ONLINE INFORMATION")) type="DATABASE";
1866                        currUCParser.setCommentType(type);
1867                    } else if (name.equals(COMMENT_MASS_ATTR)) this.currUCParser.setMolecularWeight(Integer.parseInt(val));
1868                    else if (name.equals(COMMENT_ERROR_ATTR)) this.currUCParser.setMolWeightError(Integer.valueOf(val));
1869                    else if (name.equals(COMMENT_METHOD_ATTR)) this.currUCParser.setMolWeightMethod(val);
1870                    else if (name.equals(NAME_ATTR)) this.currUCParser.setDatabaseName(val);
1871                }
1872                this.currLocIsFor="COMMENT";
1873                this.currTextIsFor="COMMENT";
1874                this.currNoteIsFor="COMMENT";
1875                this.interactantCount = 0;
1876            } else if (qName.equals(COMMENT_ABSORPTION_TAG)) {
1877                this.currTextIsFor="ABSORPTION";
1878            } else if (qName.equals(COMMENT_KINETICS_TAG)) {
1879                this.currTextIsFor="KINETICS";
1880            } else if (qName.equals(COMMENT_LINK_TAG)) {
1881                this.currTextIsFor="KINETICS";
1882                for (int i = 0; i < attributes.getLength(); i++) {
1883                    String name = attributes.getQName(i);
1884                    String val = attributes.getValue(i);
1885                    if (name.equals(COMMENT_LINK_URI_ATTR)) this.currUCParser.setUri(val);
1886                }
1887            } else if (qName.equals(COMMENT_EVENT_TAG)) {
1888                this.currUCParserEvent = new Event();
1889                for (int i = 0; i < attributes.getLength(); i++) {
1890                    String name = attributes.getQName(i);
1891                    String val = attributes.getValue(i);
1892                    if (name.equals(TYPE_ATTR)) {
1893                        val = val.toUpperCase().charAt(0)+val.substring(1); // make first letter upper case for flat uniprot
1894                        this.currUCParserEvent.setType(val);
1895                    }
1896                }
1897                currUCParser.getEvents().add(currUCParserEvent);
1898            } else if (qName.equals(COMMENT_ISOFORM_TAG)) {
1899                this.currUCParserIsoform = new Isoform();
1900                this.currUCParser.getIsoforms().add(currUCParserIsoform);
1901                this.currUCParserEvent.setNamedIsoforms(this.currUCParser.getIsoforms().size());
1902                this.currNameIsFor="ISOFORM";
1903                this.currNoteIsFor="ISOFORM";
1904                this.currSeqIsFor="ISOFORM";
1905                this.currIDIsFor="ISOFORM";
1906            } else if (qName.equals(COMMENT_INTERACTANT_TAG)) {
1907                this.currIDIsFor="INTERACTION";
1908                this.interactantCount++;
1909                for (int i = 0; i < attributes.getLength(); i++) {
1910                    String name = attributes.getQName(i);
1911                    String val = attributes.getValue(i);
1912                    if (name.equals(COMMENT_INTERACT_INTACT_ATTR)) {
1913                        if (this.interactantCount%2==1) {
1914                            this.currUCParserInteract = new Interaction();
1915                            this.currUCParserInteract.setFirstIntActID(val);
1916                            this.currUCParser.getInteractions().add(this.currUCParserInteract);
1917                        }
1918                        else this.currUCParserInteract.setSecondIntActID(val);
1919                    }
1920                }
1921            }
1922            
1923            else if (qName.equals(SEQUENCE_TAG)) {
1924                if (this.currSeqIsFor.equals("ENTRY")) {
1925                    try {
1926                        for (int i = 0; i < attributes.getLength(); i++) {
1927                            String name = attributes.getQName(i);
1928                            String val = attributes.getValue(i);
1929                            if (name.equals(SEQUENCE_MODIFIED_ATTR)) {
1930                                rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(),val);
1931                            }
1932                            else if (name.equals(SEQUENCE_VERSION_ATTR))
1933                                rlistener.setVersion(Integer.parseInt(val));
1934                        }
1935                    } catch (ParseException e) {
1936                        SAXException pe = new SAXException("Could not set sequence properties");
1937                        pe.initCause(e);
1938                        throw pe;
1939                    }
1940                }
1941                
1942                else if (this.currSeqIsFor.equals("ISOFORM")) {
1943                    for (int i = 0; i < attributes.getLength(); i++) {
1944                        String name = attributes.getQName(i);
1945                        String val = attributes.getValue(i);
1946                        if (name.equals(TYPE_ATTR)) {
1947                            val = val.toUpperCase().charAt(0)+val.substring(1); // init caps for flat uniprot
1948                            this.currUCParserIsoform.setSequenceType(val);
1949                        } else if (name.equals(REF_ATTR)) {
1950                            this.currUCParserIsoform.setSequenceRef(val);
1951                        }
1952                    }
1953                }
1954            }
1955        }
1956        
1957        // process a closing tag - we will have read the text already
1958        @Override
1959        public void endElement(String uri, String localName, String qName) throws SAXException {
1960            String val = this.m_currentString.toString().trim();
1961            
1962            try {
1963                if (qName.equals(COPYRIGHT_TAG)) {
1964                    rlistener.addSequenceProperty(Terms.getCopyrightTerm(),val);
1965                }
1966                
1967                else if (qName.equals(ACCESSION_TAG)) {
1968                    rlistener.setAccession(val);
1969                } else if (qName.equals(NAME_TAG)) {
1970                    if (this.currNameIsFor.equals("ENTRY")) rlistener.setName(val);
1971                    
1972                    else if (this.currNameIsFor.equals("PROTEIN")) {
1973                        if (this.firstNameInProteinGroup) {
1974                            proteinDesc.append(" ");
1975                            proteinDesc.append(val);
1976                        } else {
1977                            proteinDesc.append(" (");
1978                            proteinDesc.append(val);
1979                            proteinDesc.append(")");
1980                        }
1981                        this.firstNameInProteinGroup = false;
1982                    }
1983                    
1984                    else if (this.currNameIsFor.equals("GENE")) {
1985                        if (this.geneNameClass.equals(Terms.GENENAME_KEY)) rlistener.addSequenceProperty(Terms.getGeneNameTerm(), this.currGene+":"+val);
1986                        else if (this.geneNameClass.equals(Terms.GENESYNONYM_KEY)) rlistener.addSequenceProperty(Terms.getGeneSynonymTerm(), this.currGene+":"+val);
1987                        else if (this.geneNameClass.equals(Terms.ORDLOCNAME_KEY)) rlistener.addSequenceProperty(Terms.getOrderedLocusNameTerm(), this.currGene+":"+val);
1988                        else if (this.geneNameClass.equals(Terms.ORFNAME_KEY)) rlistener.addSequenceProperty(Terms.getORFNameTerm(), this.currGene+":"+val);
1989                    }
1990                    
1991                    else if (this.currNameIsFor.equals("ORGANISM")) {
1992                        String ournameclass = NCBITaxon.COMMON;
1993                        if (this.organismNameClass.equals(Terms.ABBREV_NAME_KEY)) ournameclass = NCBITaxon.ACRONYM;
1994                        else if (this.organismNameClass.equals(Terms.FULL_NAME_KEY)) ournameclass = NCBITaxon.EQUIVALENT;
1995                        else if (this.organismNameClass.equals(Terms.SCIENTIFIC_NAME_KEY)) ournameclass = NCBITaxon.SCIENTIFIC;
1996                        else if (this.organismNameClass.equals(Terms.SYNONYM_NAME_KEY)) ournameclass = NCBITaxon.SYNONYM;
1997                        if (!this.currNames.containsKey(ournameclass)) this.currNames.put(ournameclass,new TreeSet());
1998                        ((Set)this.currNames.get(ournameclass)).add(val);
1999                    }
2000                    
2001                    else if (this.currNameIsFor.equals("ORGANELLE")) {
2002                        this.organelleDesc.append(", Plasmid ");
2003                        this.organelleDesc.append(val);
2004                    }
2005                    
2006                    else if (this.currNameIsFor.equals("ISOFORM")) {
2007                        this.currUCParserIsoform.getNames().add(val);
2008                    }
2009                }
2010                
2011                else if (qName.equals(PROTEIN_TAG)) {
2012                    if (!this.firstDomainInProteinGroup || !this.firstComponentInProteinGroup) this.proteinDesc.append("]");
2013                    this.proteinDesc.append(".");
2014                    rlistener.setDescription(this.proteinDesc.toString());
2015                }
2016                
2017                else if (qName.equals(ORGANISM_TAG)) {
2018                    this.currNameIsFor="";
2019                }
2020                
2021                else if (qName.equals(GENELOCATION_TAG)) {
2022                    String total = this.organelleDesc.toString().substring(3); // chomp leading ", "
2023                    int lastComma = total.lastIndexOf(',');
2024                    if (lastComma>-1) {
2025                        this.organelleDesc.insert(lastComma+1," and");
2026                        total = this.organelleDesc.toString();
2027                    }
2028                    rlistener.addSequenceProperty(Terms.getOrganelleTerm(), total);
2029                }
2030                
2031                else if (qName.equals(RC_SPECIES_TAG)) {
2032                    rlistener.addSequenceProperty(Terms.getSpeciesTerm(), this.currRCID+":"+val);
2033                } else if (qName.equals(RC_TISSUE_TAG)) {
2034                    rlistener.addSequenceProperty(Terms.getTissueTerm(), this.currRCID+":"+val);
2035                } else if (qName.equals(RC_TRANSP_TAG)) {
2036                    rlistener.addSequenceProperty(Terms.getTransposonTerm(), this.currRCID+":"+val);
2037                } else if (qName.equals(RC_PLASMID_TAG)) {
2038                    rlistener.addSequenceProperty(Terms.getPlasmidTerm(), this.currRCID+":"+val);
2039                }
2040                
2041                else if (qName.equals(TITLE_TAG)) {
2042                    this.currRefTitle = val;
2043                } else if (qName.equals(LOCATOR_TAG)) {
2044                    this.currRefLocation = val;
2045                } else if (qName.equals(RP_LINE_TAG)) {
2046                    this.currComments.add(val);
2047                    // Try to use it to find the location of the reference, if we have one.
2048                    Matcher m = rppat.matcher(val);
2049                    if (m.matches()) {
2050                        this.currRefStart = Integer.parseInt(m.group(1));
2051                        this.currRefEnd = Integer.parseInt(m.group(2));
2052                    }
2053                } else if (qName.equals(REFERENCE_TAG) && !this.parent.getElideReferences()) {
2054                    // do the crossrefs
2055                    CrossRef useForDocRef = null;
2056                    for (Iterator j = this.currDBXrefs.iterator(); j.hasNext();) {
2057                        CrossRef dbx = (CrossRef)j.next();
2058                        RankedCrossRef rdbx = new SimpleRankedCrossRef(dbx,0);
2059                        rlistener.setRankedCrossRef(rdbx);
2060                        if (useForDocRef==null) useForDocRef = dbx;
2061                        else {
2062                            // medline gets priority, then pubmed - if multiple, use last
2063                            if (dbx.getDbname().equalsIgnoreCase(Terms.MEDLINE_KEY) || 
2064                                    (dbx.getDbname().equalsIgnoreCase(Terms.PUBMED_KEY) && 
2065                                    !useForDocRef.getDbname().equalsIgnoreCase(Terms.MEDLINE_KEY))) {
2066                                useForDocRef = dbx;
2067                            }
2068                        }
2069                    }
2070                    // do the comment - can only be one in this object model
2071                    String currRefRemark = null;
2072                    if (currComments.size()>0) currRefRemark = (String)currComments.iterator().next();
2073                    // create the docref object
2074                    try {
2075                        DocRef dr = (DocRef)RichObjectFactory.getObject(SimpleDocRef.class,new Object[]{currRefAuthors,currRefLocation,currRefTitle});
2076                        // assign the pubmed or medline to the docref - medline gets priority
2077                        if (useForDocRef!=null) dr.setCrossref(useForDocRef);
2078                        // assign the remarks
2079                        dr.setRemark(currRefRemark);
2080                        // assign the docref to the bioentry
2081                        RankedDocRef rdr = new SimpleRankedDocRef(dr,
2082                                (currRefStart != -999 ? new Integer(currRefStart) : null),
2083                                (currRefEnd != -999 ? new Integer(currRefEnd) : null),
2084                                currRefRank);
2085                        rlistener.setRankedDocRef(rdr);
2086                    } catch (ChangeVetoException e) {
2087                        throw new ParseException(e);
2088                    }
2089                    currDBXrefs.clear();
2090                    currComments.clear();
2091                }
2092                
2093                // keywords
2094                else if (qName.equals(KEYWORD_TAG)) {
2095                    // create and persist term
2096                    ComparableTerm t = Terms.getUniprotKWOnto().getOrCreateTerm(val);
2097                    try {
2098                        t.setIdentifier(currKWID);
2099                    } catch (ChangeVetoException e) {
2100                        throw new ParseException(e);
2101                    }
2102                    rlistener.addSequenceProperty(Terms.getKeywordTerm(), val);
2103                }
2104                
2105                else if (qName.equals(LOCATION_TAG)) {
2106                    if (currLocIsFor.equals("FEATURE")) {
2107                        templ.location = UniProtLocationParser.parseLocation(currLocStr.toString());
2108                    } else if (currLocIsFor.equals("COMMENT")) {
2109                        Location l = UniProtLocationParser.parseLocation(currLocStr.toString());
2110                        this.currUCParser.setMolWeightRangeStart(l.getMin());
2111                        this.currUCParser.setMolWeightRangeEnd(l.getMax());
2112                    }
2113                }
2114                
2115                else if (qName.equals(FEATURE_TAG)) {
2116                    // start the feature from the template we built
2117                    rlistener.startFeature(templ);
2118                    // end the feature
2119                    rlistener.endFeature();
2120                } else if (qName.equals(FEATURE_ORIGINAL_TAG)) {
2121                    try {
2122                        Note note = new SimpleNote(Terms.getFeatureOriginalTerm(), val, featNoteRank++);
2123                        ((RichAnnotation)templ.annotation).addNote(note);
2124                    } catch (ChangeVetoException e) {
2125                        SAXException pe = new SAXException("Could not create location terms");
2126                        pe.initCause(e);
2127                        throw pe;
2128                    }
2129                } else if (qName.equals(FEATURE_VARIATION_TAG)) {
2130                    try {
2131                        Note note = new SimpleNote(Terms.getFeatureVariationTerm(), val, featNoteRank++);
2132                        ((RichAnnotation)templ.annotation).addNote(note);
2133                    } catch (ChangeVetoException e) {
2134                        SAXException pe = new SAXException("Could not create location terms");
2135                        pe.initCause(e);
2136                        throw pe;
2137                    }
2138                }
2139                
2140                else if (qName.equals(COMMENT_TAG)) {
2141                    rlistener.setComment(currUCParser.generate());
2142                } else if (qName.equals(TEXT_TAG)) {
2143                    if (this.currTextIsFor.equals("COMMENT")) currUCParser.setText(val);
2144                    else if (this.currTextIsFor.equals("ABSORPTION")) currUCParser.setAbsorptionNote(val);
2145                    else if (this.currTextIsFor.equals("KINETICS")) currUCParser.setKineticsNote(val);
2146                } else if (qName.equals(COMMENT_ABS_MAX_TAG)) {
2147                    currUCParser.setAbsorptionMax(val);
2148                } else if (qName.equals(COMMENT_KIN_KM_TAG)) {
2149                    currUCParser.getKMs().add(val);
2150                } else if (qName.equals(COMMENT_KIN_VMAX_TAG)) {
2151                    currUCParser.getVMaxes().add(val);
2152                } else if (qName.equals(COMMENT_PH_TAG)) {
2153                    currUCParser.setPHDependence(val);
2154                } else if (qName.equals(COMMENT_REDOX_TAG)) {
2155                    currUCParser.setRedoxPotential(val);
2156                } else if (qName.equals(COMMENT_TEMPERATURE_TAG)) {
2157                    currUCParser.setTemperatureDependence(val);
2158                } else if (qName.equals(COMMENT_ORGANISMS_TAG)) {
2159                    if (val.equalsIgnoreCase("true")) currUCParserInteract.setOrganismsDiffer(true);
2160                    else currUCParserInteract.setOrganismsDiffer(false);
2161                } else if (qName.equals(COMMENT_EXPERIMENTS_TAG)) {
2162                    currUCParserInteract.setNumberExperiments(Integer.parseInt(val));
2163                } else if (qName.equals(NOTE_TAG)) {
2164                    if (currNoteIsFor.equals("COMMENT")) currUCParser.setNote(val);
2165                    else if (currNoteIsFor.equals("ISOFORM")) currUCParser.setNote(val);
2166                } else if (qName.equals(COMMENT_EVENT_TAG)) {
2167                    currUCParserEvent.setComment(val);
2168                } else if (qName.equals(COMMENT_ISOFORM_TAG)) {
2169                    this.currSeqIsFor = "ENTRY";
2170                    this.currNoteIsFor = "COMMENT";
2171                } else if (qName.equals(ID_TAG)) {
2172                    if (currIDIsFor.equals("ISOFORM")) currUCParserIsoform.getIsoIDs().add(val);
2173                    else if (currIDIsFor.equals("INTERACTION")) currUCParserInteract.setID(val);
2174                } else if (qName.equals(COMMENT_INTERACT_LABEL_TAG)) {
2175                    currUCParserInteract.setLabel(val);
2176                }
2177                
2178                else if (qName.equals(SEQUENCE_TAG)) {
2179                    if (this.currSeqIsFor.equals("ENTRY") && !this.parent.getElideSymbols()) {
2180                        try {
2181                            SymbolList sl = new SimpleSymbolList(symParser,
2182                                    val.replaceAll("\\s+","").replaceAll("[\\.|~]","-"));
2183                            rlistener.addSymbols(symParser.getAlphabet(),
2184                                    (Symbol[])(sl.toList().toArray(new Symbol[0])),
2185                                    0, sl.length());
2186                        } catch (Exception e) {
2187                            throw new ParseException(e);
2188                        }
2189                    }
2190                }
2191                
2192                else if (qName.equals(ENTRY_TAG)) {
2193                    // do the comments
2194                    for (Iterator j = currComments.iterator(); j.hasNext();) {
2195                        rlistener.setComment((String)j.next());
2196                    }
2197                    // do the crossrefs
2198                    for (Iterator j = currDBXrefs.iterator(); j.hasNext();) {
2199                        CrossRef dbx = (CrossRef)j.next();
2200                        RankedCrossRef rdbx = new SimpleRankedCrossRef(dbx, 0);
2201                        rlistener.setRankedCrossRef(rdbx);
2202                    }
2203                    // end the sequence
2204                    currComments.clear();
2205                    currDBXrefs.clear();
2206                }
2207                
2208            } catch (ParseException e) {
2209                throw new SAXException(e);
2210            }
2211            
2212            // drop old string
2213            this.m_currentString.setLength(0);
2214        }
2215        
2216        // process text inside tags
2217        @Override
2218        public void characters(char[] ch, int start, int length) {
2219            this.m_currentString.append(ch, start, length);
2220        }
2221    }
2222}
2223