001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 *
 * @author Scooter Willis
023 *
024 */
025package org.biojava.nbio.core.sequence.loader;
026
027import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
028import org.biojava.nbio.core.sequence.AccessionID;
029import org.biojava.nbio.core.sequence.DataSource;
030import org.biojava.nbio.core.sequence.ProteinSequence;
031import org.biojava.nbio.core.sequence.Strand;
032import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
033import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
034import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
035import org.biojava.nbio.core.sequence.features.DatabaseReferenceInterface;
036import org.biojava.nbio.core.sequence.features.FeaturesKeyWordInterface;
037import org.biojava.nbio.core.sequence.storage.SequenceAsStringHelper;
038import org.biojava.nbio.core.sequence.template.*;
039import org.biojava.nbio.core.util.Equals;
040import org.biojava.nbio.core.util.XMLHelper;
041import org.slf4j.Logger;
042import org.slf4j.LoggerFactory;
043import org.w3c.dom.Document;
044import org.w3c.dom.Element;
045import org.xml.sax.SAXException;
046
047import javax.xml.parsers.ParserConfigurationException;
048import javax.xml.xpath.XPathExpressionException;
049import java.io.*;
050import java.net.HttpURLConnection;
051import java.net.URL;
052import java.rmi.RemoteException;
053import java.util.ArrayList;
054import java.util.Iterator;
055import java.util.LinkedHashMap;
056import java.util.List;
057import java.util.regex.Pattern;
058
059/**
060 *
 * Pass in a UniProt ID and this ProxySequenceReader, when passed to a ProteinSequence, will get the sequence data and other data elements
 * associated with the ProteinSequence by UniProt. This is an example of how to map external databases of proteins and features to the BioJava3
 * ProteinSequence.
 * It is important to call {@link #setUniprotDirectoryCache(String)} to allow caching of XML files so they don't need to be reloaded each time.
 * This class does not manage the cache.
 * @param <C> the compound type of the sequence
067 */
068public class UniprotProxySequenceReader<C extends Compound> implements ProxySequenceReader<C>, FeaturesKeyWordInterface, DatabaseReferenceInterface {
069
070        private final static Logger logger = LoggerFactory.getLogger(UniprotProxySequenceReader.class);
071
072        /*
073         * Taken from http://www.uniprot.org/help/accession_numbers
074         */
075        private static final String SPID_PATTERN = "[OPQ][0-9][A-Z0-9]{3}[0-9]";
076        private static final String TREMBLID_PATTERN = "[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}";
077        public static final Pattern UP_AC_PATTERN = Pattern.compile("(" + SPID_PATTERN + "|" + TREMBLID_PATTERN + ")");
078
079        public static final String DEFAULT_UNIPROT_BASE_URL = "https://www.uniprot.org";
080
081        private static String uniprotbaseURL = DEFAULT_UNIPROT_BASE_URL;
082        private static String uniprotDirectoryCache = null;
083        private String sequence;
084        private CompoundSet<C> compoundSet;
085        private List<C> parsedCompounds = new ArrayList<C>();
086        Document uniprotDoc;
087
088        /**
089         * The UniProt id is used to retrieve the UniProt XML which is then parsed as a DOM object
090         * so we know everything about the protein. If an error occurs throw an exception. We could
091         * have a bad uniprot id or network error
092         * @param accession
093         * @param compoundSet
094         * @throws CompoundNotFoundException
095         * @throws IOException if problems while reading the UniProt XML
096         */
097        public UniprotProxySequenceReader(String accession, CompoundSet<C> compoundSet) throws CompoundNotFoundException, IOException {
098                if (!UP_AC_PATTERN.matcher(accession.toUpperCase()).matches()) {
099                        throw new IllegalArgumentException("Accession provided " + accession + " doesn't comply with the uniprot acession pattern.");
100                }
101                setCompoundSet(compoundSet);
102                uniprotDoc = this.getUniprotXML(accession);
103                String seq = this.getSequence(uniprotDoc);
104                setContents(seq);
105        }
106
107        /**
108         * The xml is passed in as a DOM object so we know everything about the protein.
109         *  If an error occurs throw an exception. We could have a bad uniprot id
110         * @param document
111         * @param compoundSet
112         * @throws CompoundNotFoundException
113         */
114        public UniprotProxySequenceReader(Document document, CompoundSet<C> compoundSet) throws CompoundNotFoundException {
115                setCompoundSet(compoundSet);
116                uniprotDoc = document;
117                String seq = this.getSequence(uniprotDoc);
118                setContents(seq);
119        }
120        /**
121         * The passed in xml is parsed as a DOM object so we know everything about the protein.
122         *  If an error occurs throw an exception. We could have a bad uniprot id
123         * @param xml
124         * @param compoundSet
125         * @return UniprotProxySequenceReader
126         * @throws Exception
127         */
128        public static <C extends Compound> UniprotProxySequenceReader<C> parseUniprotXMLString(String xml, CompoundSet<C> compoundSet) {
129                try {
130                        Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(xml.getBytes()));
131                        return new UniprotProxySequenceReader<C>(document, compoundSet);
132                } catch (Exception e) {
133                        logger.error("Exception on xml parse of: {}", xml);
134                }
135                return null;
136        }
137
138        @Override
139        public void setCompoundSet(CompoundSet<C> compoundSet) {
140                this.compoundSet = compoundSet;
141        }
142
143        /**
144         * Once the sequence is retrieved set the contents and make sure everything this is valid
145         * @param sequence
146         * @throws CompoundNotFoundException
147         */
148        @Override
149        public void setContents(String sequence) throws CompoundNotFoundException {
150                // Horrendously inefficient - pretty much the way the old BJ did things.
151                // TODO Should be optimised.
152                this.sequence = sequence;
153                this.parsedCompounds.clear();
154                for (int i = 0; i < sequence.length();) {
155                        String compoundStr = null;
156                        C compound = null;
157                        for (int compoundStrLength = 1; compound == null && compoundStrLength <= compoundSet.getMaxSingleCompoundStringLength(); compoundStrLength++) {
158                                compoundStr = sequence.substring(i, i + compoundStrLength);
159                                compound = compoundSet.getCompoundForString(compoundStr);
160                        }
161                        if (compound == null) {
162                                throw new CompoundNotFoundException("Compound "+compoundStr+" not found");
163                        } else {
164                                i += compoundStr.length();
165                        }
166                        this.parsedCompounds.add(compound);
167                }
168        }
169
170        /**
171         * The sequence length
172         * @return
173         */
174        @Override
175        public int getLength() {
176                return this.parsedCompounds.size();
177        }
178
179        /**
180         *
181         * @param position
182         * @return
183         */
184        @Override
185        public C getCompoundAt(int position) {
186                return this.parsedCompounds.get(position - 1);
187        }
188
189        /**
190         *
191         * @param compound
192         * @return
193         */
194        @Override
195        public int getIndexOf(C compound) {
196                return this.parsedCompounds.indexOf(compound) + 1;
197        }
198
199        /**
200         *
201         * @param compound
202         * @return
203         */
204        @Override
205        public int getLastIndexOf(C compound) {
206                return this.parsedCompounds.lastIndexOf(compound) + 1;
207        }
208
209        /**
210         *
211         * @return
212         */
213        @Override
214        public String toString() {
215                return getSequenceAsString();
216        }
217
218        /**
219         *
220         * @return
221         */
222        @Override
223        public String getSequenceAsString() {
224                return sequence;
225        }
226
227        /**
228         *
229         * @return
230         */
231        @Override
232        public List<C> getAsList() {
233                return this.parsedCompounds;
234        }
235
236        @Override
237        public boolean equals(Object o){
238
239                if(! Equals.classEqual(this, o)) {
240                        return false;
241                }
242
243                Sequence<C> other = (Sequence<C>)o;
244                if ( other.getCompoundSet() != getCompoundSet())
245                        return false;
246
247                List<C> rawCompounds = getAsList();
248                List<C> otherCompounds = other.getAsList();
249
250                if ( rawCompounds.size() != otherCompounds.size())
251                        return false;
252
253                for (int i = 0 ; i < rawCompounds.size() ; i++){
254                        Compound myCompound = rawCompounds.get(i);
255                        Compound otherCompound = otherCompounds.get(i);
256                        if ( ! myCompound.equalsIgnoreCase(otherCompound))
257                                return false;
258                }
259                return true;
260        }
261
262        @Override
263        public int hashCode(){
264                String s = getSequenceAsString();
265                return s.hashCode();
266        }
267
268        /**
269         *
270         * @return
271         */
272        @Override
273        public SequenceView<C> getInverse() {
274                return SequenceMixin.inverse(this);
275        }
276
277        /**
278         *
279         * @param bioBegin
280         * @param bioEnd
281         * @param strand
282         * @return
283         */
284        public String getSequenceAsString(Integer bioBegin, Integer bioEnd, Strand strand) {
285                SequenceAsStringHelper<C> sequenceAsStringHelper = new SequenceAsStringHelper<C>();
286                return sequenceAsStringHelper.getSequenceAsString(this.parsedCompounds, compoundSet, bioBegin, bioEnd, strand);
287        }
288
289        /**
290         *
291         * @param bioBegin
292         * @param bioEnd
293         * @return
294         */
295        @Override
296        public SequenceView<C> getSubSequence(final Integer bioBegin, final Integer bioEnd) {
297                return new SequenceProxyView<C>(UniprotProxySequenceReader.this, bioBegin, bioEnd);
298        }
299
300        /**
301         *
302         * @return
303         */
304        @Override
305        public Iterator<C> iterator() {
306                return this.parsedCompounds.iterator();
307        }
308
309        /**
310         *
311         * @return
312         */
313        @Override
314        public CompoundSet<C> getCompoundSet() {
315                return compoundSet;
316        }
317
318        /**
319         *
320         * @return
321         */
322        @Override
323        public AccessionID getAccession() {
324                AccessionID accessionID = new AccessionID();
325                if (uniprotDoc == null) {
326                        return accessionID;
327                }
328                try {
329                        Element uniprotElement = uniprotDoc.getDocumentElement();
330                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
331                        Element nameElement = XMLHelper.selectSingleElement(entryElement, "name");
332                        accessionID = new AccessionID(nameElement.getTextContent(), DataSource.UNIPROT);
333                } catch (XPathExpressionException e) {
334                        logger.error("Exception: ", e);
335                }
336                return accessionID;
337        }
338
339        /**
340         * Pull uniprot accessions associated with this sequence
341         * @return
342         * @throws XPathExpressionException
343         */
344        public ArrayList<AccessionID> getAccessions() throws XPathExpressionException {
345                ArrayList<AccessionID> accessionList = new ArrayList<AccessionID>();
346                if (uniprotDoc == null) {
347                        return accessionList;
348                }
349                Element uniprotElement = uniprotDoc.getDocumentElement();
350                Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
351                ArrayList<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "accession");
352                for (Element element : keyWordElementList) {
353                        AccessionID accessionID = new AccessionID(element.getTextContent(), DataSource.UNIPROT);
354                        accessionList.add(accessionID);
355                }
356
357                return accessionList;
358        }
359
360        /**
361         * Pull uniprot protein aliases associated with this sequence
362         * Provided for backwards compatibility now that we support both
363         * gene and protein aliases via separate methods.
364         * @return
365         * @throws XPathExpressionException
366         */
367        public ArrayList<String> getAliases() throws XPathExpressionException {
368
369                return getProteinAliases();
370        }
371        /**
372         * Pull uniprot protein aliases associated with this sequence
373         * @return
374         * @throws XPathExpressionException
375         */
376        public ArrayList<String> getProteinAliases() throws XPathExpressionException {
377                ArrayList<String> aliasList = new ArrayList<String>();
378                if (uniprotDoc == null) {
379                        return aliasList;
380                }
381                Element uniprotElement = uniprotDoc.getDocumentElement();
382                Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
383                Element proteinElement = XMLHelper.selectSingleElement(entryElement, "protein");
384                ArrayList<Element> keyWordElementList = XMLHelper.selectElements(proteinElement, "alternativeName");
385                for (Element element : keyWordElementList) {
386                        Element fullNameElement = XMLHelper.selectSingleElement(element, "fullName");
387                        aliasList.add(fullNameElement.getTextContent());
388                        Element shortNameElement = XMLHelper.selectSingleElement(element, "shortName");
389                        if(null != shortNameElement) {
390                                String shortName = shortNameElement.getTextContent();
391                                if(null != shortName && !shortName.trim().isEmpty()) {
392                                        aliasList.add(shortName);
393                                }
394                        }
395                }
396                keyWordElementList = XMLHelper.selectElements(proteinElement, "recommendedName");
397                for (Element element : keyWordElementList) {
398                        Element fullNameElement = XMLHelper.selectSingleElement(element, "fullName");
399                        aliasList.add(fullNameElement.getTextContent());
400                        Element shortNameElement = XMLHelper.selectSingleElement(element, "shortName");
401                        if(null != shortNameElement) {
402                                String shortName = shortNameElement.getTextContent();
403                                if(null != shortName && !shortName.trim().isEmpty()) {
404                                        aliasList.add(shortName);
405                                }
406                        }
407                }
408                Element cdAntigen = XMLHelper.selectSingleElement(proteinElement, "cdAntigenName");
409                if(null != cdAntigen) {
410                        String cdAntigenName = cdAntigen.getTextContent();
411                        if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) {
412                                aliasList.add(cdAntigenName);
413                        }
414                }
415
416                return aliasList;
417        }
418
419        /**
420         * Pull uniprot gene aliases associated with this sequence
421         * @return
422         * @throws XPathExpressionException
423         */
424        public ArrayList<String> getGeneAliases() throws XPathExpressionException {
425                ArrayList<String> aliasList = new ArrayList<String>();
426                if (uniprotDoc == null) {
427                        return aliasList;
428                }
429                Element uniprotElement = uniprotDoc.getDocumentElement();
430                Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
431                ArrayList<Element> proteinElements = XMLHelper.selectElements(entryElement, "gene");
432                for(Element proteinElement : proteinElements) {
433                        ArrayList<Element> keyWordElementList = XMLHelper.selectElements(proteinElement, "name");
434                        for (Element element : keyWordElementList) {
435                                aliasList.add(element.getTextContent());
436                        }
437                }
438                return aliasList;
439        }
440
441        /**
442         *
443         * @param compounds
444         * @return
445         */
446        @Override
447        public int countCompounds(C... compounds) {
448                throw new UnsupportedOperationException("Not supported yet.");
449        }
450
451        /**
452         *
453         * @param accession
454         * @return
455         * @throws IOException
456         */
457        private Document getUniprotXML(String accession) throws IOException, CompoundNotFoundException {
458                StringBuilder sb = new StringBuilder();
459                // try in cache
460                if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0) {
461                        sb = fetchFromCache(accession);
462                }
463
464                // http://www.uniprot.org/uniprot/?query=SORBIDRAFT_03g027040&format=xml
465                if (sb.length() == 0) {
466                        String uniprotURL = getUniprotbaseURL() + "/uniprot/" + accession.toUpperCase() + ".xml";
467                        logger.info("Loading: {}", uniprotURL);
468                        sb = fetchUniprotXML(uniprotURL);
469
470                        int index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath
471                        if (index != -1) {
472                                int lastIndex = sb.indexOf(">", index);
473                                sb.replace(index, lastIndex, "");
474                        }
475                        if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0)
476                                writeCache(sb,accession);
477                }
478
479                logger.info("Load complete");
480                try {
481                        //       logger.debug(sb.toString());
482                        Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(sb.toString().getBytes()));
483                        return document;
484                } catch (SAXException e) {
485                        logger.error("Exception on xml parse of: {}", sb.toString());
486                } catch (ParserConfigurationException e) {
487                        logger.error("Exception on xml parse of: {}", sb.toString());
488                }
489                return null;
490        }
491
492        private void writeCache(StringBuilder sb, String accession) throws IOException {
493                File f = new File(uniprotDirectoryCache + File.separatorChar + accession + ".xml");
494                FileWriter fw = new FileWriter(f);
495                fw.write(sb.toString());
496                fw.close();
497        }
498
499        /**
500         * Open a URL connection.
501         *
502         * Follows redirects.
503         * @param url
504         * @throws IOException
505         */
506        private static HttpURLConnection openURLConnection(URL url) throws IOException {
507                // This method should be moved to a utility class in BioJava 5.0
508
509                final int timeout = 5000;
510                final String useragent = "BioJava";
511
512                HttpURLConnection conn = (HttpURLConnection) url.openConnection();
513                conn.setRequestProperty("User-Agent", useragent);
514                conn.setInstanceFollowRedirects(true);
515                conn.setConnectTimeout(timeout);
516                conn.setReadTimeout(timeout);
517
518                int status = conn.getResponseCode();
519                while (status == HttpURLConnection.HTTP_MOVED_TEMP
520                                || status == HttpURLConnection.HTTP_MOVED_PERM
521                                || status == HttpURLConnection.HTTP_SEE_OTHER) {
522                        // Redirect!
523                        String newUrl = conn.getHeaderField("Location");
524
525                        if(newUrl.equals(url.toString())) {
526                                throw new IOException("Cyclic redirect detected at "+newUrl);
527                        }
528
529                        // Preserve cookies
530                        String cookies = conn.getHeaderField("Set-Cookie");
531
532                        // open the new connection again
533                        url = new URL(newUrl);
534                        conn.disconnect();
535                        conn = (HttpURLConnection) url.openConnection();
536                        if(cookies != null) {
537                                conn.setRequestProperty("Cookie", cookies);
538                        }
539                        conn.addRequestProperty("User-Agent", useragent);
540                        conn.setInstanceFollowRedirects(true);
541                        conn.setConnectTimeout(timeout);
542                        conn.setReadTimeout(timeout);
543                        conn.connect();
544
545                        status = conn.getResponseCode();
546
547                        logger.info("Redirecting from {} to {}", url, newUrl);
548                }
549                conn.connect();
550
551                return conn;
552        }
553
554        private StringBuilder fetchUniprotXML(String uniprotURL)
555                        throws IOException, CompoundNotFoundException {
556
557                StringBuilder sb = new StringBuilder();
558                URL uniprot = new URL(uniprotURL);
559                int attempt = 5;
560                List<String> errorCodes = new ArrayList<String>();
561                while(attempt > 0) {
562                        HttpURLConnection uniprotConnection = openURLConnection(uniprot);
563                        int statusCode = uniprotConnection.getResponseCode();
564                        if (statusCode == HttpURLConnection.HTTP_OK) {
565                                BufferedReader in = new BufferedReader(
566                                                new InputStreamReader(
567                                                uniprotConnection.getInputStream()));
568                                String inputLine;
569
570                                while ((inputLine = in.readLine()) != null) {
571                                        sb.append(inputLine);
572                                }
573                                in.close();
574                                return sb;
575                        }
576                        attempt--;
577                        errorCodes.add(String.valueOf(statusCode));
578                }
579                throw new RemoteException("Couldn't fetch accession from the url " + uniprotURL + " error codes on 5 attempts are " + errorCodes.toString());
580        }
581
582        /**
583         * @param key
584         * @return A string containing the contents of entry specified by key and if not found returns an empty string
585         * @throws FileNotFoundException
586         * @throws IOException
587         */
588        private StringBuilder fetchFromCache(String key)
589                        throws FileNotFoundException, IOException {
590                int index;
591                File f = new File(uniprotDirectoryCache + File.separatorChar + key + ".xml");
592                StringBuilder sb = new StringBuilder();
593                if (f.exists()) {
594                        FileReader fr = new FileReader(f);
595                        int size = (int) f.length();
596                        char[] data = new char[size];
597                        fr.read(data);
598                        fr.close();
599                        sb.append(data);
600                        index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath
601                        if (index != -1) {
602                                int lastIndex = sb.indexOf(">", index);
603                                sb.replace(index, lastIndex, "");
604                        }
605                }
606                return sb;
607        }
608
609        /**
610         *
611         * @param uniprotDoc
612         * @return
613         */
614        private String getSequence(Document uniprotDoc)  {
615
616                try {
617                        Element uniprotElement = uniprotDoc.getDocumentElement();
618                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
619                        Element sequenceElement = XMLHelper.selectSingleElement(entryElement, "sequence");
620
621                        String seqdata = sequenceElement.getTextContent();
622
623                        return seqdata;
624                } catch (XPathExpressionException e) {
625                        logger.error("Problems while parsing sequence in UniProt XML: {}. Sequence will be blank.", e.getMessage());
626                        return "";
627                }
628        }
629
630        /**
631         * The current UniProt URL to deal with caching issues. www.uniprot.org is load balanced
632         * but you can access pir.uniprot.org directly.
633         * @return the uniprotbaseURL
634         */
635        public static String getUniprotbaseURL() {
636                return uniprotbaseURL;
637        }
638
639        /**
640         * @param aUniprotbaseURL the uniprotbaseURL to set
641         */
642        public static void setUniprotbaseURL(String aUniprotbaseURL) {
643                uniprotbaseURL = aUniprotbaseURL;
644        }
645
646        /**
647         * Local directory cache of XML that can be downloaded
648         * @return the uniprotDirectoryCache
649         */
650        public static String getUniprotDirectoryCache() {
651                return uniprotDirectoryCache;
652        }
653
654        /**
655         * @param aUniprotDirectoryCache the uniprotDirectoryCache to set
656         */
657        public static void setUniprotDirectoryCache(String aUniprotDirectoryCache) {
658                File f = new File(aUniprotDirectoryCache);
659                if (!f.exists()) {
660                        f.mkdirs();
661                }
662                uniprotDirectoryCache = aUniprotDirectoryCache;
663        }
664
665        public static void main(String[] args) {
666
667                try {
668                        UniprotProxySequenceReader<AminoAcidCompound> uniprotSequence = new UniprotProxySequenceReader<AminoAcidCompound>("YA745_GIBZE", AminoAcidCompoundSet.getAminoAcidCompoundSet());
669                        ProteinSequence proteinSequence = new ProteinSequence(uniprotSequence);
670                        logger.info("Accession: {}", proteinSequence.getAccession().getID());
671                        logger.info("Sequence: {}", proteinSequence.getSequenceAsString());
672                } catch (Exception e) {
673                        logger.error("Exception: ", e);
674                }
675
676        }
677
678        /**
679         * Get the gene name associated with this sequence.
680         * @return
681         */
682        public String getGeneName() {
683                if (uniprotDoc == null) {
684                        return "";
685                }
686                try {
687                        Element uniprotElement = uniprotDoc.getDocumentElement();
688                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
689                        Element geneElement = XMLHelper.selectSingleElement(entryElement, "gene");
690                        if (geneElement == null) {
691                                return "";
692                        }
693                        Element nameElement = XMLHelper.selectSingleElement(geneElement, "name");
694                        if (nameElement == null) {
695                                return "";
696                        }
697                        return nameElement.getTextContent();
698                } catch (XPathExpressionException e) {
699                        logger.error("Problems while parsing gene name in UniProt XML: {}. Gene name will be blank.",e.getMessage());
700                        return "";
701                }
702        }
703
704        /**
705         * Get the organism name assigned to this sequence
706         * @return
707         */
708        public String getOrganismName() {
709                if (uniprotDoc == null) {
710                        return "";
711                }
712                try {
713                        Element uniprotElement = uniprotDoc.getDocumentElement();
714                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
715                        Element organismElement = XMLHelper.selectSingleElement(entryElement, "organism");
716                        if (organismElement == null) {
717                                return "";
718                        }
719                        Element nameElement = XMLHelper.selectSingleElement(organismElement, "name");
720                        if (nameElement == null) {
721                                return "";
722                        }
723                        return nameElement.getTextContent();
724                } catch (XPathExpressionException e) {
725                        logger.error("Problems while parsing organism name in UniProt XML: {}. Organism name will be blank.",e.getMessage());
726                        return "";
727                }
728
729        }
730
731        /**
732         * Pull UniProt key words which is a mixed bag of words associated with this sequence
733         * @return
734         */
735        @Override
736        public ArrayList<String> getKeyWords() {
737                ArrayList<String> keyWordsList = new ArrayList<String>();
738                if (uniprotDoc == null) {
739                        return keyWordsList;
740                }
741                try {
742                        Element uniprotElement = uniprotDoc.getDocumentElement();
743
744                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
745                        ArrayList<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "keyword");
746                        for (Element element : keyWordElementList) {
747                                keyWordsList.add(element.getTextContent());
748                        }
749                } catch (XPathExpressionException e) {
750                        logger.error("Problems while parsing keywords in UniProt XML: {}. No keywords will be available.",e.getMessage());
751                        return new ArrayList<String>();
752                }
753
754                return keyWordsList;
755        }
756
757        /**
758         * The Uniprot mappings to other database identifiers for this sequence
759         * @return
760         */
761        @Override
762        public LinkedHashMap<String, ArrayList<DBReferenceInfo>> getDatabaseReferences()  {
763                LinkedHashMap<String, ArrayList<DBReferenceInfo>> databaseReferencesHashMap = new LinkedHashMap<String, ArrayList<DBReferenceInfo>>();
764                if (uniprotDoc == null) {
765                        return databaseReferencesHashMap;
766                }
767
768                try {
769                        Element uniprotElement = uniprotDoc.getDocumentElement();
770                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
771                        ArrayList<Element> dbreferenceElementList = XMLHelper.selectElements(entryElement, "dbReference");
772                        for (Element element : dbreferenceElementList) {
773                                String type = element.getAttribute("type");
774                                String id = element.getAttribute("id");
775                                ArrayList<DBReferenceInfo> idlist = databaseReferencesHashMap.get(type);
776                                if (idlist == null) {
777                                        idlist = new ArrayList<DBReferenceInfo>();
778                                        databaseReferencesHashMap.put(type, idlist);
779                                }
780                                DBReferenceInfo dbreferenceInfo = new DBReferenceInfo(type, id);
781                                ArrayList<Element> propertyElementList = XMLHelper.selectElements(element, "property");
782                                for (Element propertyElement : propertyElementList) {
783                                        String propertyType = propertyElement.getAttribute("type");
784                                        String propertyValue = propertyElement.getAttribute("value");
785                                        dbreferenceInfo.addProperty(propertyType, propertyValue);
786                                }
787
788                                idlist.add(dbreferenceInfo);
789                        }
790                } catch (XPathExpressionException e) {
791                        logger.error("Problems while parsing db references in UniProt XML: {}. No db references will be available.",e.getMessage());
792                        return new LinkedHashMap<String, ArrayList<DBReferenceInfo>>();
793                }
794
795                return databaseReferencesHashMap;
796        }
797}