Source code

001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojavax.bio.seq.io;
023
024import java.io.BufferedInputStream;
025import java.io.BufferedReader;
026import java.io.File;
027import java.io.FileReader;
028import java.io.IOException;
029import java.io.InputStreamReader;
030import java.io.PrintStream;
031import java.util.ArrayList;
032import java.util.Iterator;
033import java.util.List;
034import java.util.Map;
035import java.util.Set;
036import java.util.TreeMap;
037import java.util.TreeSet;
038import java.util.regex.Matcher;
039import java.util.regex.Pattern;
040
041import org.biojava.bio.proteomics.MassCalc;
042import org.biojava.bio.seq.Sequence;
043import org.biojava.bio.seq.io.ParseException;
044import org.biojava.bio.seq.io.SeqIOListener;
045import org.biojava.bio.seq.io.SymbolTokenization;
046import org.biojava.bio.symbol.IllegalAlphabetException;
047import org.biojava.bio.symbol.IllegalSymbolException;
048import org.biojava.bio.symbol.SimpleSymbolList;
049import org.biojava.bio.symbol.Symbol;
050import org.biojava.bio.symbol.SymbolList;
051import org.biojava.ontology.Term;
052import org.biojava.utils.ChangeVetoException;
053import org.biojavax.Comment;
054import org.biojavax.CrossRef;
055import org.biojavax.DocRef;
056import org.biojavax.DocRefAuthor;
057import org.biojavax.Namespace;
058import org.biojavax.Note;
059import org.biojavax.RankedCrossRef;
060import org.biojavax.RankedDocRef;
061import org.biojavax.RichObjectFactory;
062import org.biojavax.SimpleComment;
063import org.biojavax.SimpleCrossRef;
064import org.biojavax.SimpleDocRef;
065import org.biojavax.SimpleDocRefAuthor;
066import org.biojavax.SimpleNote;
067import org.biojavax.SimpleRankedCrossRef;
068import org.biojavax.SimpleRankedDocRef;
069import org.biojavax.SimpleRichAnnotation;
070import org.biojavax.bio.seq.RichFeature;
071import org.biojavax.bio.seq.RichLocation;
072import org.biojavax.bio.seq.RichSequence;
073import org.biojavax.bio.taxa.NCBITaxon;
074import org.biojavax.bio.taxa.SimpleNCBITaxon;
075import org.biojavax.ontology.ComparableTerm;
076import org.biojavax.utils.CRC64Checksum;
077import org.biojavax.utils.StringTools;
078
079/**
080 * Format reader for UniProt files. This version of UniProt format will generate
081 * and write RichSequence objects. Loosely Based on code from the old, deprecated,
082 * org.biojava.bio.seq.io.EMBLLikeFormat object. Since 1.7, the parser reads the
083 * International Protein Index (IPI) pseudo-Uniprot format.
084 *
085 * @author Richard Holland
086 * @author Mark Schreiber
087 * @author George Waldon
088 * @since 1.5
089 */
090public class UniProtFormat extends RichSequenceFormat.HeaderlessFormat {
091    
    // Register this format with the format auto-guesser. This runs once, when
    // the class is initialised.
    static {
        RichSequence.IOTools.registerFormat(UniProtFormat.class);
    }
    
    /**
     * The name of this format
     */
    public static final String UNIPROT_FORMAT = "UniProt";

    // Sub-format markers used inside readRichSequence: parsing starts as
    // SUBFORMAT_UNIPROT and switches to SUBFORMAT_IPI when the ID line matches
    // lp_ipi; the DT (date) line is then parsed with the matching date pattern.
    private static final String SUBFORMAT_UNIPROT = "UniProt";
    private static final String SUBFORMAT_IPI = "IPI";
    
    // Line-type codes. readRichSequence compares the key of each section it
    // reads against these to decide how the section is handled.
    protected static final String LOCUS_TAG = "ID";
    protected static final String ACCESSION_TAG = "AC";
    protected static final String DEFINITION_TAG = "DE";
    protected static final String DATE_TAG = "DT";
    protected static final String SOURCE_TAG = "OS";
    protected static final String ORGANELLE_TAG = "OG";
    protected static final String ORGANISM_TAG = "OC";
    protected static final String TAXON_TAG = "OX";
    protected static final String GENE_TAG = "GN";
    protected static final String DATABASE_XREF_TAG = "DR";
    protected static final String PROTEIN_EXIST_TAG = "PE";
    protected static final String REFERENCE_TAG = "RN";
    protected static final String RP_LINE_TAG = "RP";
    protected static final String REFERENCE_XREF_TAG = "RX";
    protected static final String AUTHORS_TAG = "RA";
    protected static final String CONSORTIUM_TAG = "RG";
    protected static final String TITLE_TAG = "RT";
    protected static final String LOCATION_TAG = "RL";
    protected static final String RC_LINE_TAG = "RC";
    protected static final String KEYWORDS_TAG = "KW";
    protected static final String COMMENT_TAG = "CC";
    protected static final String FEATURE_TAG = "FT";
    protected static final String START_SEQUENCE_TAG = "SQ";
    protected static final String END_SEQUENCE_TAG = "//";
    
    // Locus (ID) line for UniProt format, applied to the text after the "ID" tag.
    // Groups: 1 = whole entry name, 2 = part before the last underscore (used as
    // the sequence name), 3 = part after it (used as the division), 4 = data
    // class, 5 = optional literal "PRT". The line must end "<digits> AA.".
    protected static final Pattern lp_uniprot = Pattern.compile("^((\\S+)_(\\S+))\\s+(\\S+);\\s+(PRT)?;?\\s*\\d+\\s+AA\\.$");
    // Locus (ID) line for IPI format.
    // Groups: 1 = whole identifier, 2 = part before the last dot (used as the
    // sequence name), 3 = numeric version after the dot, 4 = literal "IPI",
    // 5 = optional literal "PRT". The line must end "<digits> AA.".
    protected static final Pattern lp_ipi = Pattern.compile("^((\\S+)\\.(\\d+))\\s+(IPI);\\s+(PRT)?;?\\s*\\d+\\s+AA\\.$");
    // RP line parser: groups 1 and 2 are the start and end of a
    // "SEQUENCE OF start-end" remark. readRichSequence applies it with
    // matches(), so the whole RP value has to have exactly this shape.
    protected static final Pattern rppat = Pattern.compile("SEQUENCE OF (\\d+)-(\\d+)");
    // DT (date) line for UniProt. Expected shapes:
    //   date, integrated into UniProtKB/database_name.
    //   date, sequence version x.
    //   date, entry version x.
    // Groups: 1 = date (everything before the first comma), 2 = type text
    // (contains no digits or dots), 3 = optional version number.
    protected static final Pattern dp_uniprot = Pattern.compile("([^,]+),([^\\d\\.]+)(\\d+)?\\.$");
    // DT (date) line for IPI. Expected shapes:
    //   date (xxx, Created)
    //   date (xxx, Last sequence update)
    // Groups: 1 = date (everything before the opening bracket), 2 = text up to
    // the comma, 3 = type text up to the closing bracket.
    protected static final Pattern dp_ipi = Pattern.compile("([^\\(]+)\\(([^,]+),([^\\)]+)\\)$");
    // Feature line. Group 1 = two whitespace-separated position tokens (the
    // first built from digits, '?' and '<', the second from digits, '?' and
    // '>'), group 3 = optional remaining text after the positions.
    protected static final Pattern fp = Pattern.compile("^\\s*([\\d?<]+\\s+[\\d?>]+)(\\s+(.*))?$");
    
    // Matches any line that begins with "ID"; canRead uses it as the first test
    // before applying the stricter locus-line patterns above.
    protected static final Pattern headerLine = Pattern.compile("^ID.*");
149    
150    /**
151     * Implements some UniProt-specific terms.
152     */
153    public static class Terms extends RichSequence.Terms {        
154        private static String GENENAME_KEY = "Name";
155        private static String GENESYNONYM_KEY = "Synonyms";
156        private static String ORDLOCNAME_KEY = "OrderedLocusNames";
157        private static String ORFNAME_KEY = "ORFNames";
158        
159        /**
160         * Getter for the UniProt term
161         * @return The UniProt Term
162         */
163        public static ComparableTerm getUniProtTerm() {
164            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("UniProt");
165        }
166        
167        /**
168         * Getter for the UniProt combined database term
169         * @return The combined database for UniProt Term
170         */
171        public static ComparableTerm getUniProtDBNameTerm() {
172            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("UniProt database name");
173        }
174        
175        /**
176         * Getter for the protein exists term
177         * @return The protein exists Term
178         */
179        public static ComparableTerm getProteinExistsTerm() {
180            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("UniProt protein exists");
181        }
182    }
183    
184    /**
185     * {@inheritDoc}
186     * A file is in UniProt format if the first line matches the UniProt format for the ID line.
187     */
188    public boolean canRead(File file) throws IOException {
189        BufferedReader br = new BufferedReader(new FileReader(file));
190        String firstLine = br.readLine();
191        boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches() && 
192                (lp_uniprot.matcher(firstLine.substring(3).trim()).matches() ||
193                lp_ipi.matcher(firstLine.substring(3).trim()).matches());
194        br.close();
195        return readable;
196    }
197    
198    /**
199     * {@inheritDoc}
200     * Always returns a protein tokenizer.
201     */
202    public SymbolTokenization guessSymbolTokenization(File file) throws IOException {
203        return RichSequence.IOTools.getProteinParser();
204    }
205    
206    /**
207     * {@inheritDoc}
208     * A stream is in UniProt format if the first line matches the UniProt format for the ID line.
209     */
210    public boolean canRead(BufferedInputStream stream) throws IOException {
211        stream.mark(2000); // some streams may not support this
212        BufferedReader br = new BufferedReader(new InputStreamReader(stream));
213        String firstLine = br.readLine();
214        boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches() && 
215                (lp_uniprot.matcher(firstLine.substring(3).trim()).matches()
216                || lp_ipi.matcher(firstLine.substring(3).trim()).matches());
217        // don't close the reader as it'll close the stream too.
218        // br.close();
219        stream.reset();
220        return readable;
221    }
222    
223    /**
224     * {@inheritDoc}
225     * Always returns a protein tokenizer.
226     */
227    public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException {
228        return RichSequence.IOTools.getProteinParser();
229    }
230    
231    /**
232     * {@inheritDoc}
233     */
234    public boolean readSequence(BufferedReader reader,
235            SymbolTokenization symParser,
236            SeqIOListener listener)
237            throws IllegalSymbolException, IOException, ParseException {
238        if (!(listener instanceof RichSeqIOListener)) throw new IllegalArgumentException("Only accepting RichSeqIOListeners today");
239        return this.readRichSequence(reader,symParser,(RichSeqIOListener)listener,null);
240    }
241    
    // Accession of the entry currently being parsed. readRichSequence resets it
    // to null at the start of each call and assigns it when the AC section is
    // read; it is passed to ParseException.newMessage when error messages are
    // built. It is instance state, so it is shared by every parse run on this
    // format object.
    private String accession = null;
243    
244    /**
245     * {@inheritDoc}
246     */
247    public boolean readRichSequence(BufferedReader reader,
248            SymbolTokenization symParser,
249            RichSeqIOListener rlistener,
250            Namespace ns)
251            throws IllegalSymbolException, IOException, ParseException {
252        
253        boolean hasAnotherSequence = true;
254        //boolean hasInternalWhitespace = false;
255
256        String subformat = SUBFORMAT_UNIPROT;
257        
258        rlistener.startSequence();
259        
260        if (ns==null) ns=RichObjectFactory.getDefaultNamespace();
261        rlistener.setNamespace(ns);
262        
263        // Get an ordered list of key->value pairs in array-tuples
264        String sectionKey = null;
265        NCBITaxon tax = null;
266        accession = null;
267        List section = null;
268        try{
269            do {
270                
271                section = this.readSection(reader);
272                sectionKey = ((String[])section.get(0))[0];
273                if(sectionKey == null){
274                    String message = ParseException.newMessage(this.getClass(),accession, "", "Section key was null", sectionToString(section));
275                    throw new ParseException(message);
276                }
277                // process section-by-section
278                if (sectionKey.equals(LOCUS_TAG)) {
279                    // entryname  dataclass; moltype; sequencelength AA.
280                    String loc = ((String[])section.get(0))[1];
281                    Matcher m = lp_uniprot.matcher(loc);
282                    if (m.matches()) {
283                        rlistener.setName(m.group(2));
284                        rlistener.setDivision(m.group(3));
285                        if (m.groupCount() > 4){
286                            rlistener.addSequenceProperty(Terms.getDataClassTerm(),m.group(4));
287                            rlistener.addSequenceProperty(Terms.getMolTypeTerm(),m.group(5));
288                        }else{
289                            rlistener.addSequenceProperty(Terms.getDataClassTerm(), m.group(4));
290                            rlistener.addSequenceProperty(Terms.getMolTypeTerm(), "");
291                        }
292                    } else {
293                        m = lp_ipi.matcher(loc);
294                        if (m.matches()) {
295                            subformat = SUBFORMAT_IPI;
296                            rlistener.setName(m.group(2));
297                            rlistener.setVersion(Integer.parseInt(m.group(3)));
298                            rlistener.addSequenceProperty(Terms.getDataClassTerm(), m.group(4));
299                            rlistener.addSequenceProperty(Terms.getMolTypeTerm(),m.group(5));
300                        } else {
301                            String message = ParseException.newMessage(this.getClass(),accession, "", "Bad ID line", sectionToString(section));
302                            throw new ParseException(message);
303                        }
304                    }
305                } else if (sectionKey.equals(DEFINITION_TAG)) {
306                    String val = ((String[])section.get(0))[1];
307                    if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot
308                    rlistener.setDescription(val);
309                } else if (sectionKey.equals(SOURCE_TAG)) {
310                    // use SOURCE_TAG and TAXON_TAG values
311                    String sciname = null;
312                    String comname = null;
313                    List synonym = new ArrayList();
314                    int taxid = 0;
315                    for (int i = 0; i < section.size(); i++) {
316                        String tag = ((String[])section.get(i))[0];
317                        String value = ((String[])section.get(i))[1].trim();
318                        value = value.replace("\n", " ");
319                        value = value.replace("\r\n", " ");
320                        
321                        if (tag.equals(SOURCE_TAG)) {
322                            if (value.endsWith(".")) value = value.substring(0,value.length()-1); // chomp trailing dot
323                            String[] parts = value.split("\\(");
324                            sciname = parts[0].trim();
325                            if (parts.length>1) {
326                                comname = parts[1].trim();
327                                if (comname.endsWith(")")) comname = comname.substring(0,comname.length()-1); // chomp trailing bracket
328                                if (parts.length>2) {
329                                    // synonyms
330                                    for (int j = 2 ; j < parts.length; j++) {
331                                        String syn = parts[j].trim();
332                                        if (syn.endsWith(")")) syn = syn.substring(0,syn.length()-1); // chomp trailing bracket
333                                        synonym.add(syn);
334                                    }
335                                }
336                            }
337                        } else if (tag.equals(TAXON_TAG)) {
338                            String[] parts = value.split(";");
339                            for (int j = 0; j < parts.length; j++) {
340                                String[] bits = parts[j].split("=");
341                                if (bits[0].equals("NCBI_TaxID")) {
342                                    String[] morebits = bits[1].split(",");
343                                    taxid = Integer.parseInt(morebits[0].trim());
344                                }
345                            }
346                        } else if (tag.equals(ORGANELLE_TAG)) {
347                            if (value.endsWith(".")) value = value.substring(0,value.length()-1); // chomp trailing dot
348                            String[] parts = value.split(";");
349                            for (int j = 0; j < parts.length; j++) {
350                                parts[j]=parts[j].trim();
351                                rlistener.addSequenceProperty(Terms.getOrganelleTerm(),parts[j]);
352                            }
353                        }
354                    }
355                    // Set the Taxon
356                    tax = (NCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class, new Object[]{new Integer(taxid)});
357                    rlistener.setTaxon(tax);
358                    try {
359                        if (sciname!=null) tax.addName(NCBITaxon.SCIENTIFIC,sciname);
360                        if (comname!=null) tax.addName(NCBITaxon.COMMON,comname);
361                        for (Iterator j = synonym.iterator(); j.hasNext(); ) tax.addName(NCBITaxon.SYNONYM, (String)j.next());
362                    } catch (ChangeVetoException e) {
363                        throw new ParseException(e);
364                    }
365                } else if (sectionKey.equals(DATE_TAG)) {
366                    String chunk = ((String[])section.get(0))[1];
367                    if(subformat.equals(SUBFORMAT_UNIPROT)) {
368                        Matcher dm = dp_uniprot.matcher(chunk);
369                        if (dm.matches()) {
370                            String date = dm.group(1).trim();
371                            String type = dm.group(2).trim();
372                            String rel = dm.group(3);
373                            if (rel!=null) rel = rel.trim();
374                            if (type.startsWith("integrated into UniProtKB")) {
375                                String dbname = type.split("/")[1];
376                                rlistener.addSequenceProperty(Terms.getDateCreatedTerm(), date);
377                                rlistener.addSequenceProperty(Terms.getUniProtDBNameTerm(), dbname);
378                            } else if (type.equalsIgnoreCase("sequence version")) {
379                                if (rel==null){
380                                    String message = ParseException.newMessage(this.getClass(),accession, "", "Version missing for "+type, sectionToString(section));
381                                    throw new ParseException(message);
382                                }
383                                rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(), date);
384                                rlistener.setVersion(Integer.parseInt(rel));
385                            } else if (type.equalsIgnoreCase("entry version")) {
386                                if (rel==null) {
387                                    String message = ParseException.newMessage(this.getClass(),accession, "", "Version missing for "+type, sectionToString(section));
388                                    throw new ParseException(message);
389                                }
390                                rlistener.addSequenceProperty(Terms.getDateAnnotatedTerm(), date);
391                                rlistener.addSequenceProperty(Terms.getRelAnnotatedTerm(), rel);
392                            } else {
393                                String message = ParseException.newMessage(this.getClass(),accession, "", "Bad date type "+type, sectionToString(section));
394                                throw new ParseException(message);
395                            }
396                        } else {
397                            String message = ParseException.newMessage(this.getClass(),accession, "", "Bad date line", sectionToString(section));
398                            throw new ParseException(message);
399                        }
400                    } else if(subformat.equals(SUBFORMAT_IPI)) {
401                        Matcher dm = dp_ipi.matcher(chunk);
402                        if (dm.matches()) {
403                            String date = dm.group(1).trim();
404                            String type = dm.group(3).trim();
405                            if(type.equals("Created")) {
406                                rlistener.addSequenceProperty(Terms.getDateCreatedTerm(), date);
407                            } else if(type.equals("Last sequence update")) {
408                                rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(), date);
409                            } else {
410                                String message = ParseException.newMessage(this.getClass(),accession, "", "Bad date type "+type, sectionToString(section));
411                                throw new ParseException(message);
412                            }
413                        } else {
414                            String message = ParseException.newMessage(this.getClass(),accession, "", "Bad date line", sectionToString(section));
415                            throw new ParseException(message);
416                        }
417                    } else {
418                            String message = ParseException.newMessage(this.getClass(),accession, "", "Unknown date line format", sectionToString(section));
419                            throw new ParseException(message);
420                    }
421                } else if (sectionKey.equals(ACCESSION_TAG)) {
422                    // if multiple accessions, store only first as accession,
423                    // and store rest in annotation
424                    String[] accs = ((String[])section.get(0))[1].split(";");
425                    if(accs.length>0) accession = accs[0].trim(); else accession = "";
426                    rlistener.setAccession(accession);
427                    for (int i = 1; i < accs.length; i++) {
428                        rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accs[i].trim());
429                    }
430                } else if (sectionKey.equals(PROTEIN_EXIST_TAG)) {
431                    String val = ((String[])section.get(0))[1];
432                    if (val.endsWith(";")) val = val.substring(0, val.length()-1); // chomp semicolon
433                    rlistener.addSequenceProperty(Terms.getProteinExistsTerm(),val.trim());
434                } else if (sectionKey.equals(KEYWORDS_TAG)) {
435                    String val = ((String[])section.get(0))[1];
436                    if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot
437                    val = val.replace('\n',' '); //remove newline
438                    String[] kws = val.split(";");
439                    for (int i = 0; i < kws.length; i++) {
440                        String kw = kws[i].trim();
441                        if (kw.length()==0) continue;
442                        rlistener.addSequenceProperty(Terms.getKeywordTerm(), kw);
443                    }
444                } else if (sectionKey.equals(GENE_TAG)) {
445                    String[] genes = ((String[])section.get(0))[1].split("\\s+(or|and)\\s+");
446                    for (int geneID = 0; geneID < genes.length; geneID++) {
447                        String[] parts = genes[geneID].split(";");
448                        for (int j = 0; j < parts.length; j++) {
449                            String[] moreparts = parts[j].split("=");
450                            String[] values = moreparts[1].split(",");
451                            // nasty hack - we really should have notes on the gene object itself... if such a thing existed...
452                            if (moreparts[0].trim().equals(Terms.GENENAME_KEY)) rlistener.addSequenceProperty(Terms.getGeneNameTerm(),geneID+":"+values[0].trim());
453                            else if (moreparts[0].trim().equals(Terms.GENESYNONYM_KEY)) {
454                                for (int k = 0; k < values.length; k++) rlistener.addSequenceProperty(Terms.getGeneSynonymTerm(),geneID+":"+values[k].trim());
455                            } else if (moreparts[0].trim().equals(Terms.ORDLOCNAME_KEY)) {
456                                for (int k = 0; k < values.length; k++) rlistener.addSequenceProperty(Terms.getOrderedLocusNameTerm(),geneID+":"+values[k].trim());
457                            } else if (moreparts[0].trim().equals(Terms.ORFNAME_KEY)) {
458                                for (int k = 0; k < values.length; k++) rlistener.addSequenceProperty(Terms.getORFNameTerm(),geneID+":"+values[k].trim());
459                            }
460                        }
461                    }
462                } else if (sectionKey.equals(DATABASE_XREF_TAG)) {
463                    // database_identifier; primary_identifier; secondary_identifier....
464                    String val = ((String[])section.get(0))[1];
465                    if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot
466                    String[] parts = val.split(";");
467                    // construct a DBXREF out of the dbname part[0] and accession part[1]
468                    String dbname = parts[0].trim();
469                    String acc = parts[1].trim();
470                    CrossRef crossRef = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{dbname,acc,new Integer(0)});
471                    // assign remaining bits of info as additional accession annotations
472                    for (int j = 2; j < parts.length; j++) {
473                        ComparableTerm t = (ComparableTerm)Terms.getAdditionalAccessionTerm();
474                        Note note = new SimpleNote(t,parts[j].trim(),j-1);
475                        try {
476                            crossRef.getRichAnnotation().addNote(note);
477                        } catch (ChangeVetoException ce) {
478                            ParseException pe = new ParseException("Could not annotate additional accession terms");
479                            pe.initCause(ce);
480                            throw pe;
481                        }
482                    }
483                    RankedCrossRef rcrossRef = new SimpleRankedCrossRef(crossRef, 0);
484                    rlistener.setRankedCrossRef(rcrossRef);
485                } else if (sectionKey.equals(REFERENCE_TAG) && !this.getElideReferences()) {
486                    // first line of section has rank and location
487                    String refrank = ((String[])section.get(0))[1];
488                    int ref_rank = Integer.parseInt(refrank.substring(1,refrank.length()-1));
489                    // rest can be in any order
490                    String authors = null;
491                    String consortium = null;
492                    String title = null;
493                    String locator = null;
494                    String pubmed = null;
495                    String medline = null;
496                    String doi = null;
497                    String remark = null;
498                    Integer rstart = null;
499                    Integer rend = null;
500                    for (int i = 1; i < section.size(); i++) {
501                        String key = ((String[])section.get(i))[0];
502                        String val = ((String[])section.get(i))[1];
503                        //System.err.println(key+": "+val);
504                        if (key.equals(AUTHORS_TAG)) {
505                            if (val.endsWith(";")) val = val.substring(0, val.length()-1); // chomp semicolon
506                            authors = val.replace('\n',' '); //see #2276
507                        }
508                        if (key.equals(CONSORTIUM_TAG)) {
509                            if (val.endsWith(";")) val = val.substring(0, val.length()-1); // chomp semicolon
510                            consortium = val.replace('\n',' '); //see #2276
511                        }
512                        if (key.equals(TITLE_TAG)) {
513                            if (val.endsWith(";")) val = val.substring(0, val.length()-1); // chomp semicolon
514                            if (val.endsWith("\"")) val = val.substring(1, val.length()-1); // chomp quotes
515                            title = val.replace('\n',' '); //see #2276
516                        }
517                        if (key.equals(LOCATION_TAG)) {
518                            if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot
519                            locator = val.replace('\n',' '); //see #2276
520                        }
521                        if (key.equals(REFERENCE_XREF_TAG)) {
522                            // database_identifier=primary_identifier;
523                            String[] refs = val.split(";");
524                            for (int j = 0 ; j < refs.length; j++) {
525                                if (refs[j].trim().length()==0) continue;
526                                String[] parts = refs[j].split("=");
527                                if ( parts.length <2) {
528                                        // some DOI lines look like this and are causing problems:
529                                        //DOI=10.1002/(SICI)1097-0215(19990702)82:1<137::AID-IJC23>3.0.CO;2-F;ignoring
530                                        System.err.println("warning: problems while parsing: " + val);
531                                        continue;
532                                }
533                                String db = parts[0].trim();
534                                String ref = parts[1].trim();
535                                if (db.equalsIgnoreCase(Terms.PUBMED_KEY)) pubmed = ref;
536                                else if (db.equalsIgnoreCase(Terms.MEDLINE_KEY)) medline = ref;
537                                else if (db.equalsIgnoreCase(Terms.DOI_KEY)) doi = ref;
538                            }
539                        }
540                        if (key.equals(RP_LINE_TAG)) {
541                            if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot
542                            remark = val.replace('\n',' '); //see #2276
543                            // Try to use it to find the location of the reference, if we have one.
544                            Matcher m = rppat.matcher(val);
545                            if (m.matches()) {
546                                rstart = Integer.valueOf(m.group(1));
547                                rend = Integer.valueOf(m.group(2));
548                            }
549                        }
550                        if (key.equals(RC_LINE_TAG)) {
551                            // Split into key=value pairs separated by semicolons and terminated with semicolon.
552                            String[] parts = val.split(";");
553                            for (int j = 0; j < parts.length; j++) {
554                                String[] subparts = parts[j].split("=");
555                                // get term for first section
556                                String termName = subparts[0].trim();
557                                Term t;
558                                if (termName.equalsIgnoreCase(Terms.SPECIES_KEY)) t = Terms.getSpeciesTerm();
559                                else if (termName.equalsIgnoreCase(Terms.STRAIN_KEY)) t = Terms.getStrainTerm();
560                                else if (termName.equalsIgnoreCase(Terms.TISSUE_KEY)) t = Terms.getTissueTerm();
561                                else if (termName.equalsIgnoreCase(Terms.TRANSPOSON_KEY)) t = Terms.getTransposonTerm();
562                                else if (termName.equalsIgnoreCase(Terms.PLASMID_KEY)) t = Terms.getPlasmidTerm();
563                                else {
564                                    String message = ParseException.newMessage(this.getClass(),accession, "", "Invalid RC term found: "+termName, sectionToString(section));
565                                    throw new ParseException(message);
566                                }
567                                // assign notes using term and rank:second section as value
568                                // nasty hack - we really should have notes on the reference itself.
569                                rlistener.addSequenceProperty(t, ref_rank+":"+subparts[1].trim());
570                            }
571                        }
572                    }
573                    
574                    // create the docref object
575                    try {
576                        List auths = null;
577                        if(authors != null) auths = DocRefAuthor.Tools.parseAuthorString(authors);
578                        if (consortium!=null){
579                            if(auths == null) auths = new ArrayList();
580                            auths.add(new SimpleDocRefAuthor(consortium,true,false));
581                        }
582                        DocRef dr = (DocRef)RichObjectFactory.getObject(SimpleDocRef.class,new Object[]{auths,locator,title});
583                        // assign either the pubmed or medline to the docref - medline gets priority, then pubmed, then doi
584                        if (medline!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.MEDLINE_KEY, medline, new Integer(0)}));
585                        else if (pubmed!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.PUBMED_KEY, pubmed, new Integer(0)}));
586                        else if (doi!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.DOI_KEY, doi, new Integer(0)}));
587                        // assign the remarks
588                        if (!this.getElideComments()) dr.setRemark(remark);
589                        // assign the docref to the bioentry
590                        RankedDocRef rdr = new SimpleRankedDocRef(dr,rstart,rend,ref_rank);
591                        rlistener.setRankedDocRef(rdr);
592                    } catch (ChangeVetoException e) {
593                        throw new ParseException(e);
594                    }
595                } else if (sectionKey.equals(COMMENT_TAG) && !this.getElideComments()) {
596                    // Set up some comments
597                    String val = ((String[])section.get(0))[1];
598                    if (UniProtCommentParser.isParseable(val)) rlistener.setComment(val);
599                    else {
600                        // copyright message
601                        rlistener.addSequenceProperty(Terms.getCopyrightTerm(), val);
602                    }
603                } else if (sectionKey.equals(FEATURE_TAG) && !this.getElideFeatures()) {
604                    // starting from second line of input, start a new feature whenever we come across
605                    // a key that does not start with /
606                    boolean seenAFeature = false;
607                    for (int i = 1 ; i < section.size(); i++) {
608                        String key = ((String[])section.get(i))[0];
609                        String val = ((String[])section.get(i))[1];
610                        val = val.replaceAll("\\s*[\\n\\r]+\\s*", " ").trim();
611                        if (val.endsWith(".")) val = val.substring(0,val.length()-1); // chomp dot
612                        if (key.startsWith("/")) {
613                            key = key.substring(1); // strip leading slash
614                            if (key.equals("FTId")) rlistener.addFeatureProperty(Terms.getFTIdTerm(),val);
615                            else {
616                                // should never happen - but here just in case
617                                rlistener.addFeatureProperty(RichObjectFactory.getDefaultOntology().getOrCreateTerm(key),val);
618                            }
619                        } else {
620                            // new feature!
621                            // end previous feature
622                            if (seenAFeature) rlistener.endFeature();
623                            // start next one, with lots of lovely info in it
624                            RichFeature.Template templ = new RichFeature.Template();
625                            templ.annotation = new SimpleRichAnnotation();
626                            templ.sourceTerm = Terms.getUniProtTerm();
627                            templ.typeTerm = RichObjectFactory.getDefaultOntology().getOrCreateTerm(key);
628                            templ.featureRelationshipSet = new TreeSet();
629                            templ.rankedCrossRefs = new TreeSet();
630                            String desc = null;
631                            Matcher m = fp.matcher(val);
632                            if (m.matches()) {
633                                String loc = m.group(1);
634                                desc = m.group(3);
635                                templ.location = UniProtLocationParser.parseLocation(loc);
636                            } else {
637                                String message = ParseException.newMessage(this.getClass(),accession, "", "Bad feature value: "+val, sectionToString(section));
638                                throw new ParseException(message);
639                            }
640                            rlistener.startFeature(templ);
641                            if (desc!=null && desc.length()>0) rlistener.addFeatureProperty(Terms.getFeatureDescTerm(),desc);
642                            seenAFeature = true;
643                        }
644                    }
645                    if (seenAFeature) rlistener.endFeature();
646                } else if (sectionKey.equals(START_SEQUENCE_TAG) && !this.getElideSymbols()) {
647                    StringBuffer seq = new StringBuffer();
648                    for (int i = 0 ; i < section.size(); i++) seq.append(((String[])section.get(i))[1]);
649                    try {
650                        SymbolList sl = new SimpleSymbolList(symParser,
651                                seq.toString().replaceAll("\\s+","").replaceAll("[\\.|~]","-"));
652                        rlistener.addSymbols(symParser.getAlphabet(),
653                                (Symbol[])(sl.toList().toArray(new Symbol[0])),
654                                0, sl.length());
655                    } catch (IllegalAlphabetException e) {
656                        String message = ParseException.newMessage(this.getClass(),accession, "", "", sectionToString(section));
657                        throw new ParseException(e, message);
658                    }
659                }
660            } while (!sectionKey.equals(END_SEQUENCE_TAG));
661        }catch (RuntimeException e){
662            String message = ParseException.newMessage(this.getClass(),accession, "", "", sectionToString(section));
663            throw new ParseException(e, message);
664        }
665        
666        // Allows us to tolerate trailing whitespace without
667        // thinking that there is another Sequence to follow
668        while (true) {
669            reader.mark(1);
670            int c = reader.read();
671            if (c == -1) {
672                hasAnotherSequence = false;
673                break;
674            }
675            if (Character.isWhitespace((char) c)) {
676                //hasInternalWhitespace = true;
677                continue;
678            }
679            //if (hasInternalWhitespace)
680            //System.err.println("Warning: whitespace found between sequence entries");
681            reader.reset();
682            break;
683        }
684        
685        // Finish up.
686        rlistener.endSequence();
687        return hasAnotherSequence;
688    }
689    
690// reads an indented section, combining split lines and creating a list of key->value tuples
691    private List readSection(BufferedReader br) throws ParseException {
692        List section = new ArrayList();
693        String line;
694        boolean done = false;
695        
696        // while not done
697        try {
698            while (!done) {
699                // mark buffer
700                br.mark(160);
701                // read token
702                line = br.readLine();
703                if (line.length()<2) {
704                    String message = ParseException.newMessage(this.getClass(),accession, "", "Bad line found: "+line, sectionToString(section));
705                    throw new ParseException(message);
706                }
707                String token = line.substring(0,2);
708                // READ SEQUENCE SECTION
709                if (token.equals(START_SEQUENCE_TAG)) {
710                    //      from next line, read sequence until // - leave // on stack
711                    StringBuffer sb = new StringBuffer();
712                    while (!done) {
713                        br.mark(160);
714                        line = br.readLine();
715                        if (line.startsWith(END_SEQUENCE_TAG)) {
716                            br.reset();
717                            done = true;
718                        } else {
719                            //      create sequence tag->value pair to return, sans numbers
720                            sb.append(line);
721                        }
722                    }
723                    section.add(new String[]{START_SEQUENCE_TAG,sb.toString()});
724                }
725                // READ COMMENT SECTION
726                else if (token.equals(COMMENT_TAG)) {
727                    // read from first line till next that begins with "CC   -!-"
728                    StringBuffer currentVal = new StringBuffer();
729                    boolean wasMisc = false;
730                    if (!line.startsWith(COMMENT_TAG+"   -!-")) wasMisc = true;
731                    currentVal.append(line.substring(5));
732                    while (!done) {
733                        br.mark(160);
734                        line = br.readLine();
735                        if (((!wasMisc) && line.charAt(5)!=' ') || !line.startsWith("C") || line.startsWith(COMMENT_TAG+"   -!-")) {
736                            br.reset();
737                            done = true;
738                            // dump current tag if exists
739                            section.add(new String[]{COMMENT_TAG,currentVal.toString()});
740                        } else {
741                            currentVal.append("\n");
742                            currentVal.append(line.substring(5));
743                        }
744                    }
745                }
746                // READ FEATURE TABLE SECTION
747                else if (token.equals(FEATURE_TAG)) {
748                    br.reset();
749                    //      read all FT lines until first non-FT starting line
750                    String currentTag = null;
751                    StringBuffer currentVal = new StringBuffer();
752                    section.add(new String[]{FEATURE_TAG,null});
753                    while (!done) {
754                        br.mark(160);
755                        line = br.readLine();
756                        if (!line.startsWith(FEATURE_TAG)) {
757                            br.reset();
758                            done = true;
759                            // dump current tag if exists
760                            if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
761                        } else {
762                            //         FT lines:   FT   KEY_NAME     x      x        description
763                            //         or:         FT                                ....
764                            //         or          FT                                /FTId=899.
765                            line = line.substring(5); // chomp off "FT   "
766                            if (!line.startsWith(" ")) {
767                                // dump current tag if exists
768                                if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
769                                // case 1 : word value - splits into key-value based on first 8 chars
770                                currentTag = line.substring(0,8).trim();
771                                currentVal = new StringBuffer();
772                                currentVal.append(line.substring(8).trim());
773                            } else {
774                                line = line.trim();
775                                if (line.startsWith("/")) {
776                                    // dump current tag if exists
777                                    if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
778                                    // case 3 : /word=.....
779                                    currentVal = new StringBuffer();
780                                    int equalIndex = line.indexOf('=');
781                                    if (equalIndex>=0) {
782                                        currentTag = line.substring(0, equalIndex);
783                                        currentVal.append(line.substring(equalIndex+1));
784                                    } else {
785                                        currentTag = line;
786                                    }
787                                } else {
788                                    // case 2 : ...."
789                                    currentVal.append("\n");
790                                    currentVal.append(line);
791                                }
792                            }
793                        }
794                    }
795                }
796                // READ DOCREF
797                else if (token.equals(DATABASE_XREF_TAG)) {
798                    section.add(new String[]{DATABASE_XREF_TAG,line.substring(5).trim()});
799                    done = true;
800                }
801                // READ DATE
802                else if (token.equals(DATE_TAG)) {
803                    section.add(new String[]{DATE_TAG,line.substring(5).trim()});
804                    done = true;
805                }
806                // READ END OF SEQUENCE
807                else if (token.equals(END_SEQUENCE_TAG)) {
808                    section.add(new String[]{END_SEQUENCE_TAG,null});
809                    done = true;
810                }
811                // READ NORMAL TAG/VALUE SECTION
812                else {
813                    //      rewind buffer to mark
814                    br.reset();
815                    //      read token/values until first with non-same first character
816                    //      exceptions: DE/DT, and RN...RN
817                    String currentTag = null;
818                    char currentTagStart = '\0';
819                    StringBuffer currentVal = null;
820                    while (!done) {
821                        br.mark(160);
822                        line = br.readLine();
823                        if (currentTagStart=='\0') currentTagStart = line.charAt(0);
824                        if (!line.startsWith(""+currentTagStart) ||
825                                (currentTagStart=='D' && currentTag!=null && !line.startsWith(""+currentTag)) ||
826                                (currentTagStart=='R' && currentTag!=null && line.startsWith("RN"))) {
827                            br.reset();
828                            done = true;
829                            // dump current tag if exists
830                            if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
831                        } else {
832                            try {
833                                //      merge neighbouring repeated tokens by concatting values
834                                //      return tag->value pairs
835                                String tag = line.substring(0,2);
836                                String value = line.substring(5);
837                                if (currentTag==null || !tag.equals(currentTag)) {
838                                    // dump current tag if exists
839                                    if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
840                                    // start new tag
841                                    currentTag = tag;
842                                    currentVal = new StringBuffer();
843                                    currentVal.append(value);
844                                } else {
845                                    currentVal.append("\n");
846                                    currentVal.append(value);
847                                }
848                            } catch (Exception e) {
849                                String message = ParseException.newMessage(this.getClass(),accession, "", "", sectionToString(section));
850                                throw new ParseException(e, message);
851                            }
852                        }
853                    }
854                }
855            }
856        } catch (IOException e) {
857            String message = ParseException.newMessage(this.getClass(),accession, "", "", sectionToString(section));
858            throw new ParseException(e, message);
859        } catch (RuntimeException e){
860            String message = ParseException.newMessage(this.getClass(),accession, "", "", sectionToString(section));
861            throw new ParseException(e, message);
862        }
863        return section;
864    }
865    
866    /**
867     * {@inheritDoc}
868     */
869    public void writeSequence(Sequence seq, PrintStream os) throws IOException {
870        if (this.getPrintStream()==null) this.setPrintStream(os);
871        this.writeSequence(seq, RichObjectFactory.getDefaultNamespace());
872    }
873    
874    /**
875     * {@inheritDoc}
876     */
877    public void writeSequence(Sequence seq, String format, PrintStream os) throws IOException {
878        if (this.getPrintStream()==null) this.setPrintStream(os);
879        if (!format.equals(this.getDefaultFormat())) throw new IllegalArgumentException("Unknown format: "+format);
880        this.writeSequence(seq, RichObjectFactory.getDefaultNamespace());
881    }
882    
883    /**
884     * {@inheritDoc}
885     * Namespace is ignored as UniProt has no concept of it.
886     */
887    public void writeSequence(Sequence seq, Namespace ns) throws IOException {
888        RichSequence rs;
889        try {
890            if (seq instanceof RichSequence) rs = (RichSequence)seq;
891            else rs = RichSequence.Tools.enrich(seq);
892        } catch (ChangeVetoException e) {
893            IOException e2 = new IOException("Unable to enrich sequence");
894            e2.initCause(e);
895            throw e2;
896        }
897        
898        SymbolTokenization tok;
899        try {
900            tok = rs.getAlphabet().getTokenization("token");
901        } catch (Exception e) {
902            throw new RuntimeException("Unable to get alphabet tokenizer",e);
903        }
904        
905        Set<Note> notes = rs.getNoteSet();
906        String accession = rs.getAccession();
907        StringBuffer accessions = new StringBuffer();
908        accessions.append(accession);
909        accessions.append(";");
910        String cdat = null;
911        String udat = null;
912        String adat = null;
913        String dbname = "?";
914        String arel = null;
915        String organelle = null;
916        String protExists = null;
917        String dataclass = "STANDARD";
918        String copyright = null;
919        Map speciesRecs = new TreeMap();
920        Map strainRecs = new TreeMap();
921        Map tissueRecs = new TreeMap();
922        Map transpRecs = new TreeMap();
923        Map plasmidRecs = new TreeMap();
924        Map genenames = new TreeMap();
925        Map genesynonyms = new TreeMap();
926        Map orfnames = new TreeMap();
927        Map ordlocnames = new TreeMap();
928        for (Iterator<Note> i = notes.iterator(); i.hasNext(); ) {
929            Note n = i.next();
930            if (n.getTerm().equals(Terms.getDateCreatedTerm())) cdat=n.getValue();
931            else if (n.getTerm().equals(Terms.getDateUpdatedTerm())) udat=n.getValue();
932            else if (n.getTerm().equals(Terms.getDateAnnotatedTerm())) adat=n.getValue();
933            else if (n.getTerm().equals(Terms.getUniProtDBNameTerm())) dbname=n.getValue();
934            else if (n.getTerm().equals(Terms.getProteinExistsTerm())) protExists=n.getValue();
935            else if (n.getTerm().equals(Terms.getRelAnnotatedTerm())) arel=n.getValue();
936            else if (n.getTerm().equals(Terms.getDataClassTerm())) dataclass = n.getValue();
937            else if (n.getTerm().equals(Terms.getCopyrightTerm())) copyright = n.getValue();
938            else if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) {
939                accessions.append(" ");
940                accessions.append(n.getValue());
941                accessions.append(";");
942            } else if (n.getTerm().equals(Terms.getOrganelleTerm())) organelle = (organelle==null?"":organelle+"; ")+n.getValue();
943            // use the nasty hack to split the reference rank away from the actual value in this field
944            else if (n.getTerm().equals(Terms.getGeneNameTerm()))  {
945                String ref = n.getValue();
946                int colon = ref.indexOf(':');
947                Integer refID = new Integer(0);
948                if (colon>=1) refID = new Integer(ref.substring(0,colon));
949                genenames.put(refID, ref.substring(colon+1)); // map of id -> string as only one name per gene
950            } else if (n.getTerm().equals(Terms.getGeneSynonymTerm())) {
951                String ref = n.getValue();
952                int colon = ref.indexOf(':');
953                Integer refID = new Integer(0);
954                if (colon>=1) refID = new Integer(ref.substring(0,colon));
955                if (genesynonyms.get(refID)==null) genesynonyms.put(refID, new ArrayList());
956                ((List)genesynonyms.get(refID)).add(ref.substring(colon+1));
957            } else if (n.getTerm().equals(Terms.getOrderedLocusNameTerm())) {
958                String ref = n.getValue();
959                int colon = ref.indexOf(':');
960                Integer refID = new Integer(0);
961                if (colon>=1) refID = new Integer(ref.substring(0,colon));
962                if (ordlocnames.get(refID)==null) ordlocnames.put(refID, new ArrayList());
963                ((List)ordlocnames.get(refID)).add(ref.substring(colon+1));
964            } else if (n.getTerm().equals(Terms.getORFNameTerm())) {
965                String ref = n.getValue();
966                int colon = ref.indexOf(':');
967                Integer refID = new Integer(0);
968                if (colon>=1) refID = new Integer(ref.substring(0,colon));
969                if (orfnames.get(refID)==null) orfnames.put(refID, new ArrayList());
970                ((List)orfnames.get(refID)).add(ref.substring(colon+1));
971            }
972            // use the nasty hack to split the reference rank away from the actual value in this field
973            // we'll end up with a bunch in key 0 for those which did not come from us. We ignore these for now.
974            else if (n.getTerm().equals(Terms.getSpeciesTerm())) {
975                String ref = n.getValue();
976                int colon = ref.indexOf(':');
977                Integer refID = new Integer(0);
978                if (colon>=1) refID = new Integer(ref.substring(0,colon));
979                if (speciesRecs.get(refID)==null) speciesRecs.put(refID, new ArrayList());
980                ((List)speciesRecs.get(refID)).add(ref.substring(colon+1));
981            } else if (n.getTerm().equals(Terms.getStrainTerm()))  {
982                String ref = n.getValue();
983                int colon = ref.indexOf(':');
984                Integer refID = new Integer(0);
985                if (colon>=1) refID = new Integer(ref.substring(0,colon));
986                if (strainRecs.get(refID)==null) strainRecs.put(refID, new ArrayList());
987                ((List)strainRecs.get(refID)).add(ref.substring(colon+1));
988            } else if (n.getTerm().equals(Terms.getTissueTerm()))  {
989                String ref = n.getValue();
990                int colon = ref.indexOf(':');
991                Integer refID = new Integer(0);
992                if (colon>=1) refID = new Integer(ref.substring(0,colon));
993                if (tissueRecs.get(refID)==null) tissueRecs.put(refID, new ArrayList());
994                ((List)tissueRecs.get(refID)).add(ref.substring(colon+1));
995            } else if (n.getTerm().equals(Terms.getTransposonTerm()))  {
996                String ref = n.getValue();
997                int colon = ref.indexOf(':');
998                Integer refID = new Integer(0);
999                if (colon>=1) refID = new Integer(ref.substring(0,colon));
1000                if (transpRecs.get(refID)==null) transpRecs.put(refID, new ArrayList());
1001                ((List)transpRecs.get(refID)).add(ref.substring(colon+1));
1002            } else if (n.getTerm().equals(Terms.getPlasmidTerm()))  {
1003                String ref = n.getValue();
1004                int colon = ref.indexOf(':');
1005                Integer refID = new Integer(0);
1006                if (colon>=1) refID = new Integer(ref.substring(0,colon));
1007                if (plasmidRecs.get(refID)==null) plasmidRecs.put(refID, new ArrayList());
1008                ((List)plasmidRecs.get(refID)).add(ref.substring(colon+1));
1009            }
1010        }
1011        
1012        // entryname  dataclass; [circular] molecule; division; sequencelength BP.
1013        StringBuffer locusLine = new StringBuffer();
1014        locusLine.append(StringTools.rightPad(rs.getName()+"_"+rs.getDivision(),12));
1015        locusLine.append(" ");
1016        locusLine.append(StringTools.leftPad(dataclass,19));
1017        //locusLine.append(";      PRT; "); //Uniprot no longer uses the PRT;
1018        locusLine.append("; ");
1019        locusLine.append(StringTools.leftPad(""+rs.length(),11));
1020        locusLine.append(" AA.");
1021        StringTools.writeKeyValueLine(LOCUS_TAG, locusLine.toString(), 5, this.getLineWidth(), null, LOCUS_TAG, this.getPrintStream());
1022        
1023        // accession line
1024        StringTools.writeKeyValueLine(ACCESSION_TAG, accessions.toString(), 5, this.getLineWidth(), null, ACCESSION_TAG, this.getPrintStream());
1025        
1026        // date line
1027        StringTools.writeKeyValueLine(DATE_TAG, (cdat==null?udat:cdat)+", integrated into UniProtKB/"+dbname+".", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream());
1028        StringTools.writeKeyValueLine(DATE_TAG, udat+", sequence version "+rs.getVersion()+".", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream());
1029        StringTools.writeKeyValueLine(DATE_TAG, (adat==null?udat:adat)+", entry version "+(arel==null?"0":arel)+".", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream());
1030        
1031        // definition line
1032        StringTools.writeKeyValueLine(DEFINITION_TAG, rs.getDescription()+".", 5, this.getLineWidth(), null, DEFINITION_TAG, this.getPrintStream());
1033        
1034        // gene line
1035        for (Iterator i = genenames.keySet().iterator(); i.hasNext(); ) {
1036            Integer geneid = (Integer)i.next();
1037            String genename = (String)genenames.get(geneid);
1038            List synonyms = (List)genesynonyms.get(geneid);
1039            List orfs = (List)orfnames.get(geneid);
1040            List ordlocs = (List)ordlocnames.get(geneid);
1041            
1042            StringBuffer gnline = new StringBuffer();
1043            gnline.append(Terms.GENENAME_KEY);
1044            gnline.append("=");
1045            gnline.append(genename);
1046            gnline.append("; ");
1047            
1048            if (synonyms!=null) {
1049                gnline.append(Terms.GENESYNONYM_KEY);
1050                gnline.append("=");
1051                for (Iterator j = synonyms.iterator(); j.hasNext(); ) {
1052                    gnline.append((String)j.next());
1053                    if (j.hasNext()) gnline.append(", ");
1054                }
1055                gnline.append("; ");
1056            }
1057            if (ordlocs!=null) {
1058                gnline.append(Terms.ORDLOCNAME_KEY);
1059                gnline.append("=");
1060                for (Iterator j = ordlocs.iterator(); j.hasNext(); ) {
1061                    gnline.append((String)j.next());
1062                    if (j.hasNext()) gnline.append(", ");
1063                }
1064                gnline.append("; ");
1065            }
1066            if (orfs!=null) {
1067                gnline.append(Terms.ORFNAME_KEY);
1068                gnline.append("=");
1069                for (Iterator j = orfs.iterator(); j.hasNext(); ) {
1070                    gnline.append((String)j.next());
1071                    if (j.hasNext()) gnline.append(", ");
1072                }
1073                gnline.append("; ");
1074            }
1075            
1076            StringTools.writeKeyValueLine(GENE_TAG, gnline.toString(), 5, this.getLineWidth(), null, GENE_TAG, this.getPrintStream());
1077            
1078            if (i.hasNext()) StringTools.writeKeyValueLine(GENE_TAG, "and", 5, this.getLineWidth(), null, GENE_TAG, this.getPrintStream());
1079        }
1080        
1081        // source line (from taxon)
1082        //   organism line
1083        NCBITaxon tax = rs.getTaxon();
1084        if (tax!=null) {
1085            StringBuffer source = new StringBuffer();
1086            source.append(tax.getDisplayName());
1087            for (Iterator j = tax.getNames(NCBITaxon.SYNONYM).iterator(); j.hasNext(); ) {
1088                source.append(" (");
1089                source.append((String)j.next());
1090                source.append(")");
1091            }
1092            source.append(".");
1093            StringTools.writeKeyValueLine(SOURCE_TAG, source.toString(), 5, this.getLineWidth(), null, SOURCE_TAG, this.getPrintStream());
1094            if (organelle!=null) StringTools.writeKeyValueLine(ORGANELLE_TAG, organelle+".", 5, this.getLineWidth(), null, ORGANELLE_TAG, this.getPrintStream());
1095            StringTools.writeKeyValueLine(ORGANISM_TAG, tax.getNameHierarchy(), 5, this.getLineWidth(), null, ORGANISM_TAG, this.getPrintStream());
1096            StringTools.writeKeyValueLine(TAXON_TAG, "NCBI_TaxID="+tax.getNCBITaxID()+";", 5, this.getLineWidth(), this.getPrintStream());
1097        }
1098        
1099        // references - rank (bases x to y)
1100        for (Iterator<RankedDocRef> r = rs.getRankedDocRefs().iterator(); r.hasNext(); ) {
1101            RankedDocRef rdr = r.next();
1102            DocRef d = rdr.getDocumentReference();
1103            // RN, RP, RC, RX, RG, RA, RT, RL
1104            StringTools.writeKeyValueLine(REFERENCE_TAG, "["+rdr.getRank()+"]", 5, this.getLineWidth(), null, REFERENCE_TAG, this.getPrintStream());
1105            if (d.getRemark()!=null)
1106                StringTools.writeKeyValueLine(RP_LINE_TAG, d.getRemark()+".", 5, this.getLineWidth(), null, RP_LINE_TAG, this.getPrintStream());
1107            // Print out ref position if present
1108            if (rdr.getStart()!=null && rdr.getEnd()!=null && d.getRemark()!=null && !rppat.matcher(d.getRemark()).matches()) StringTools.writeKeyValueLine(RP_LINE_TAG, "SEQUENCE OF "+rdr.getStart()+"-"+rdr.getEnd()+".", 5, this.getLineWidth(), null, RP_LINE_TAG, this.getPrintStream());
1109            // RC lines
1110            StringBuffer rcline = new StringBuffer();
1111            Integer rank = new Integer(rdr.getRank());
1112            if (speciesRecs.get(rank)!=null) {
1113                rcline.append(Terms.SPECIES_KEY);
1114                rcline.append("=");
1115                for (Iterator i = ((List)speciesRecs.get(rank)).iterator(); i.hasNext(); ) {
1116                    rcline.append((String)i.next());
1117                    if (i.hasNext()) rcline.append(", ");
1118                }
1119                rcline.append("; ");
1120            }
1121            if (strainRecs.get(rank)!=null) {
1122                rcline.append(Terms.STRAIN_KEY);
1123                rcline.append("=");
1124                for (Iterator i = ((List)strainRecs.get(rank)).iterator(); i.hasNext(); ) {
1125                    rcline.append((String)i.next());
1126                    if (i.hasNext()) rcline.append(", ");
1127                }
1128                rcline.append("; ");
1129            }
1130            if (tissueRecs.get(rank)!=null) {
1131                rcline.append(Terms.TISSUE_KEY);
1132                rcline.append("=");
1133                for (Iterator i = ((List)tissueRecs.get(rank)).iterator(); i.hasNext(); ) {
1134                    rcline.append((String)i.next());
1135                    if (i.hasNext()) rcline.append(", ");
1136                }
1137                rcline.append("; ");
1138            }
1139            if (transpRecs.get(rank)!=null) {
1140                rcline.append(Terms.TRANSPOSON_KEY);
1141                rcline.append("=");
1142                for (Iterator i = ((List)transpRecs.get(rank)).iterator(); i.hasNext(); ) {
1143                    rcline.append((String)i.next());
1144                    if (i.hasNext()) rcline.append(", ");
1145                }
1146                rcline.append("; ");
1147            }
1148            if (plasmidRecs.get(rank)!=null) {
1149                rcline.append(Terms.PLASMID_KEY);
1150                rcline.append("=");
1151                for (Iterator i = ((List)plasmidRecs.get(rank)).iterator(); i.hasNext(); ) {
1152                    rcline.append((String)i.next());
1153                    if (i.hasNext()) rcline.append(", ");
1154                }
1155                rcline.append("; ");
1156            }
1157            // print the rcline
1158            if (rcline.length()>0) StringTools.writeKeyValueLine(RC_LINE_TAG, rcline.toString(), 5, this.getLineWidth(), null, RC_LINE_TAG, this.getPrintStream());
1159            // Deal with RX and rest
1160            CrossRef c = d.getCrossref();
1161            if (c!=null) StringTools.writeKeyValueLine(REFERENCE_XREF_TAG, c.getDbname()+"="+c.getAccession()+";", 5, this.getLineWidth(), null, REFERENCE_XREF_TAG, this.getPrintStream());
1162            List<DocRefAuthor> auths = d.getAuthorList();
1163            for (Iterator<DocRefAuthor> j = auths.iterator(); j.hasNext(); ) {
1164                DocRefAuthor a = j.next();
1165                if (a.isConsortium()) {
1166                    StringTools.writeKeyValueLine(CONSORTIUM_TAG, a.getName()+";", 5, this.getLineWidth(), null, CONSORTIUM_TAG, this.getPrintStream());
1167                    j.remove();
1168                }
1169            }
1170            if (!auths.isEmpty()) StringTools.writeKeyValueLine(AUTHORS_TAG, DocRefAuthor.Tools.generateAuthorString(auths, false)+";", 5, this.getLineWidth(), null, AUTHORS_TAG, this.getPrintStream());
1171            if (d.getTitle()!=null && d.getTitle().length()!=0) StringTools.writeKeyValueLine(TITLE_TAG, "\""+d.getTitle()+"\";", 5, this.getLineWidth(), null, TITLE_TAG, this.getPrintStream());
1172            StringTools.writeKeyValueLine(LOCATION_TAG, d.getLocation()+".", 5, this.getLineWidth(), null, LOCATION_TAG, this.getPrintStream());
1173        }
1174        
1175        // comments - if any
1176        if (!rs.getComments().isEmpty()) {
1177            for (Iterator<Comment> i = rs.getComments().iterator(); i.hasNext(); ) {
1178                Comment c = i.next();
1179                String text = c.getComment().trim();
1180                if (text.length()>3 && text.substring(0,3).equals("-!-")) StringTools.writeKeyValueLine(COMMENT_TAG, text, 5, this.getLineWidth(), null, COMMENT_TAG, this.getPrintStream());
1181                else StringTools.writeKeyValueLine(COMMENT_TAG, text, 5, this.getLineWidth(), null, COMMENT_TAG, this.getPrintStream());
1182            }
1183        }
1184        
1185        // copyright - if any
1186        if (copyright!=null)
1187            StringTools.writeKeyValueLine(COMMENT_TAG, copyright, 5, this.getLineWidth(), null, COMMENT_TAG, this.getPrintStream());
1188        
1189        // db references - ranked
1190        for (Iterator<RankedCrossRef> r = rs.getRankedCrossRefs().iterator(); r.hasNext(); ) {
1191            RankedCrossRef rcr = r.next();
1192            CrossRef c = rcr.getCrossRef();
1193            Set<Note> noteset = c.getNoteSet();
1194            StringBuffer sb = new StringBuffer();
1195            sb.append(c.getDbname());
1196            sb.append("; ");
1197            sb.append(c.getAccession());
1198            boolean hasSecondary = false;
1199            for (Iterator<Note> i = noteset.iterator(); i.hasNext(); ) {
1200                Note n = i.next();
1201                if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) {
1202                    sb.append("; ");
1203                    sb.append(n.getValue());
1204                    hasSecondary = true;
1205                }
1206            }
1207            if (!hasSecondary) sb.append("; -");
1208            sb.append(".");
1209            StringTools.writeKeyValueLine(DATABASE_XREF_TAG, sb.toString(), 5, this.getLineWidth(), null, DATABASE_XREF_TAG, this.getPrintStream());
1210        }
1211        
1212        // protein exists line
1213        if (protExists!=null) {
1214            StringTools.writeKeyValueLine(PROTEIN_EXIST_TAG, protExists+";", 5, this.getLineWidth(), null, PROTEIN_EXIST_TAG, this.getPrintStream());
1215        }
1216        
1217        // keywords line
1218        String keywords = null;
1219        for (Iterator<Note> n = notes.iterator(); n.hasNext(); ) {
1220            Note nt = n.next();
1221            if (nt.getTerm().equals(Terms.getKeywordTerm())) {
1222                if (keywords==null) keywords = nt.getValue();
1223                else keywords = keywords+"; "+nt.getValue();
1224            }
1225        }
1226        if (keywords!=null) {
1227            StringTools.writeKeyValueLine(KEYWORDS_TAG, keywords+".", 5, this.getLineWidth(), null, KEYWORDS_TAG, this.getPrintStream());
1228        }
1229        
1230        // feature_type     location
1231        for (Iterator i = rs.getFeatureSet().iterator(); i.hasNext(); ) {
1232            RichFeature f = (RichFeature)i.next();
1233            String desc = "";
1234            String ftid = null;
1235            for (Iterator<Note> j = f.getNoteSet().iterator(); j.hasNext(); ) {
1236                Note n = j.next();
1237                if (n.getTerm().equals(Terms.getFTIdTerm())) ftid = n.getValue();
1238                else if (n.getTerm().equals(Terms.getFeatureDescTerm())) desc = n.getValue();
1239            }
1240            String kw = f.getTypeTerm().getName();
1241            String leader = StringTools.rightPad(kw,8)+" "+UniProtLocationParser.writeLocation((RichLocation)f.getLocation());
1242            if(desc.length()==0)
1243                this.getPrintStream().println(FEATURE_TAG+"   "+leader); //see #2277
1244            else
1245                StringTools.writeKeyValueLine(FEATURE_TAG+"   "+leader, desc+".", 34, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream());
1246            if (ftid!=null) StringTools.writeKeyValueLine(FEATURE_TAG, "/FTId="+ftid+".", 34, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream());
1247        }
1248        
1249        // sequence header
1250        int mw = 0;
1251        try {
1252            mw = (int)MassCalc.getMolecularWeight(rs);
1253        } catch (IllegalSymbolException e) {
1254            throw new RuntimeException("Found illegal symbol", e);
1255        }
1256        CRC64Checksum crc = new CRC64Checksum();
1257        String seqstr = rs.seqString();
1258        crc.update(seqstr.getBytes(),0,seqstr.length());
1259        this.getPrintStream().print(START_SEQUENCE_TAG+"   SEQUENCE  "+StringTools.leftPad(""+rs.length(),4)+" AA;  ");
1260        this.getPrintStream().print(StringTools.leftPad(""+mw,5)+" MW;  ");
1261        this.getPrintStream().println(crc+" CRC64;");
1262        
1263        // sequence stuff
1264        Symbol[] syms = (Symbol[])rs.toList().toArray(new Symbol[0]);
1265        int symCount = 0;
1266        this.getPrintStream().print("    ");
1267        for (int i = 0; i < syms.length; i++) {
1268            if (symCount % 60 == 0 && symCount>0) {
1269                this.getPrintStream().print("\n    ");
1270            }
1271            if (symCount % 10 == 0) {
1272                this.getPrintStream().print(" ");
1273            }
1274            try {
1275                this.getPrintStream().print(tok.tokenizeSymbol(syms[i]));
1276            } catch (IllegalSymbolException e) {
1277                throw new RuntimeException("Found illegal symbol: "+syms[i]);
1278            }
1279            symCount++;
1280        }
1281        this.getPrintStream().print("\n");
1282        this.getPrintStream().println(END_SEQUENCE_TAG);
1283    }
1284    
1285    /**
1286     * {@inheritDoc}
1287     */
1288    public String getDefaultFormat() {
1289        return UNIPROT_FORMAT;
1290    }
1291    
1292    /**
1293     * Converts the current parse section to a String. Useful for debugging.
1294     */
1295    String sectionToString(List section){
1296        StringBuffer parseBlock = new StringBuffer();
1297        for(Iterator i = section.listIterator(); i.hasNext();){
1298            String[] part = (String[])i.next();
1299            for(int x = 0; x < part.length; x++){
1300                parseBlock.append(part[x]);
1301                if(x == 0){
1302                    parseBlock.append("   "); //the gap will have been trimmed
1303                }
1304            }
1305        }
1306        return parseBlock.toString();
1307    }
1308}