001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 *
 * @author Scooter Willis
023 *
024 */
025package org.biojava.nbio.core.sequence.loader;
026
027import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
028import org.biojava.nbio.core.sequence.AccessionID;
029import org.biojava.nbio.core.sequence.DataSource;
030import org.biojava.nbio.core.sequence.ProteinSequence;
031import org.biojava.nbio.core.sequence.Strand;
032import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
033import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
034import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
035import org.biojava.nbio.core.sequence.features.DatabaseReferenceInterface;
036import org.biojava.nbio.core.sequence.features.FeaturesKeyWordInterface;
037import org.biojava.nbio.core.sequence.storage.SequenceAsStringHelper;
038import org.biojava.nbio.core.sequence.template.*;
039import org.biojava.nbio.core.util.Equals;
040import org.biojava.nbio.core.util.XMLHelper;
041import org.slf4j.Logger;
042import org.slf4j.LoggerFactory;
043import org.w3c.dom.Document;
044import org.w3c.dom.Element;
045import org.xml.sax.SAXException;
046
047import javax.xml.parsers.ParserConfigurationException;
048import javax.xml.xpath.XPathExpressionException;
049import java.io.*;
050import java.net.HttpURLConnection;
051import java.net.URL;
052import java.rmi.RemoteException;
053import java.util.*;
054import java.util.regex.Pattern;
055
056/**
057 *
 * Pass in a UniProt ID; when this ProxySequenceReader is passed to a ProteinSequence it supplies the sequence data and other data elements
 * that UniProt associates with the protein. This is an example of how to map external databases of proteins and features to the BioJava3
 * ProteinSequence.
 * It is important to call {@link #setUniprotDirectoryCache(String)} so that downloaded XML files are cached and don't need to be reloaded each time.
 * This class does not manage the cache.
063 * @param <C>
064 */
065public class UniprotProxySequenceReader<C extends Compound> implements ProxySequenceReader<C>, FeaturesKeyWordInterface, DatabaseReferenceInterface {
066
        // SLF4J logger shared by every instance and by the static helpers of this class.
        private final static Logger logger = LoggerFactory.getLogger(UniprotProxySequenceReader.class);

        /*
         * Taken from http://www.uniprot.org/help/accession_numbers
         */
        // First accepted accession shape: one of O/P/Q, a digit, three alphanumerics, a digit.
        private static final String SPID_PATTERN = "[OPQ][0-9][A-Z0-9]{3}[0-9]";
        // Second accepted accession shape: a letter other than O/P/Q, a digit, then one or two
        // groups of (letter, two alphanumerics, digit).
        private static final String TREMBLID_PATTERN = "[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}";
        /** Compiled alternation of the two accession shapes above; only upper-case input can match. */
        public static final Pattern UP_AC_PATTERN = Pattern.compile("(" + SPID_PATTERN + "|" + TREMBLID_PATTERN + ")");

        /** Base URL that {@link #getUniprotbaseURL()} returns until {@link #setUniprotbaseURL(String)} replaces it. */
        public static final String DEFAULT_UNIPROT_BASE_URL = "https://www.uniprot.org";

        // Static (class-wide) settings: changing them affects every reader created afterwards.
        private static String uniprotbaseURL = DEFAULT_UNIPROT_BASE_URL;
        // null or empty means "no local cache directory"; see getUniprotXML.
        private static String uniprotDirectoryCache = null;
        // Sequence text with whitespace removed, as stored by setContents.
        private String sequence;
        // Compound set used to translate the sequence text into compounds.
        private CompoundSet<C> compoundSet;
        // Compounds parsed from 'sequence', in sequence order.
        private List<C> parsedCompounds = new ArrayList<>();
        // DOM of the UniProt entry; package-visible, null-checked by the accessor methods.
        Document uniprotDoc;
084
085        /**
086         * The UniProt id is used to retrieve the UniProt XML which is then parsed as a DOM object
087         * so we know everything about the protein. If an error occurs throw an exception. We could
088         * have a bad uniprot id or network error
089         * @param accession
090         * @param compoundSet
091         * @throws CompoundNotFoundException
092         * @throws IOException if problems while reading the UniProt XML
093         */
094        public UniprotProxySequenceReader(String accession, CompoundSet<C> compoundSet) throws CompoundNotFoundException, IOException {
095                if (!UP_AC_PATTERN.matcher(accession.toUpperCase()).matches()) {
096                        throw new IllegalArgumentException("Accession provided " + accession + " doesn't comply with the uniprot acession pattern.");
097                }
098                setCompoundSet(compoundSet);
099                uniprotDoc = this.getUniprotXML(accession);
100                String seq = this.getSequence(uniprotDoc);
101                setContents(seq);
102        }
103
104        /**
105         * The xml is passed in as a DOM object so we know everything about the protein.
106         *  If an error occurs throw an exception. We could have a bad uniprot id
107         * @param document
108         * @param compoundSet
109         * @throws CompoundNotFoundException
110         */
111        public UniprotProxySequenceReader(Document document, CompoundSet<C> compoundSet) throws CompoundNotFoundException {
112                setCompoundSet(compoundSet);
113                uniprotDoc = document;
114                String seq = this.getSequence(uniprotDoc);
115                setContents(seq);
116        }
117        /**
118         * The passed in xml is parsed as a DOM object so we know everything about the protein.
119         *  If an error occurs throw an exception. We could have a bad uniprot id
120         * @param xml
121         * @param compoundSet
122         * @return UniprotProxySequenceReader
123         */
124        public static <C extends Compound> UniprotProxySequenceReader<C> parseUniprotXMLString(String xml, CompoundSet<C> compoundSet) {
125                try {
126                        Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(xml.getBytes()));
127                        return new UniprotProxySequenceReader<>(document, compoundSet);
128                } catch (Exception e) {
129                        logger.error("Exception on xml parse of: {}", xml);
130                }
131                return null;
132        }
133
        /**
         * Stores the compound set that {@link #setContents(String)} uses to translate sequence
         * text into compounds. Compounds that were already parsed are not re-evaluated.
         *
         * @param compoundSet the compound set to use from now on
         */
        @Override
        public void setCompoundSet(CompoundSet<C> compoundSet) {
                this.compoundSet = compoundSet;
        }
138
139        /**
140         * Once the sequence is retrieved set the contents and make sure everything this is valid
141         * Some uniprot records contain white space in the sequence. We must strip it out so setContents doesn't fail.
142         * @param sequence
143         * @throws CompoundNotFoundException
144         */
145        @Override
146        public void setContents(String sequence) throws CompoundNotFoundException {
147                // Horrendously inefficient - pretty much the way the old BJ did things.
148                // TODO Should be optimised.
149                // NOTE This chokes on whitespace in the sequence, so whitespace is stripped
150                this.sequence = sequence.replaceAll("\\s", "").trim();
151                this.parsedCompounds.clear();
152                for (int i = 0; i < this.sequence.length();) {
153                        String compoundStr = null;
154                        C compound = null;
155                        for (int compoundStrLength = 1; compound == null && compoundStrLength <= compoundSet.getMaxSingleCompoundStringLength(); compoundStrLength++) {
156                                compoundStr = this.sequence.substring(i, i + compoundStrLength);
157                                compound = compoundSet.getCompoundForString(compoundStr);
158                        }
159                        if (compound == null) {
160                                throw new CompoundNotFoundException("Compound "+compoundStr+" not found");
161                        } else {
162                                i += compoundStr.length();
163                        }
164                        this.parsedCompounds.add(compound);
165                }
166        }
167
168        /**
169         * The sequence length
170         * @return
171         */
172        @Override
173        public int getLength() {
174                return this.parsedCompounds.size();
175        }
176
177        /**
178         *
179         * @param position
180         * @return
181         */
182        @Override
183        public C getCompoundAt(int position) {
184                return this.parsedCompounds.get(position - 1);
185        }
186
187        /**
188         *
189         * @param compound
190         * @return
191         */
192        @Override
193        public int getIndexOf(C compound) {
194                return this.parsedCompounds.indexOf(compound) + 1;
195        }
196
197        /**
198         *
199         * @param compound
200         * @return
201         */
202        @Override
203        public int getLastIndexOf(C compound) {
204                return this.parsedCompounds.lastIndexOf(compound) + 1;
205        }
206
207        /**
208         *
209         * @return
210         */
211        @Override
212        public String toString() {
213                return getSequenceAsString();
214        }
215
216        /**
217         *
218         * @return
219         */
220        @Override
221        public String getSequenceAsString() {
222                return sequence;
223        }
224
225        /**
226         *
227         * @return
228         */
229        @Override
230        public List<C> getAsList() {
231                return this.parsedCompounds;
232        }
233
234        @Override
235        public boolean equals(Object o){
236
237                if(! Equals.classEqual(this, o)) {
238                        return false;
239                }
240                @SuppressWarnings("unchecked")
241                Sequence<C> other = (Sequence<C>)o;
242                if ( other.getCompoundSet() != getCompoundSet())
243                        return false;
244
245                List<C> rawCompounds = getAsList();
246                List<C> otherCompounds = other.getAsList();
247
248                if ( rawCompounds.size() != otherCompounds.size())
249                        return false;
250
251                for (int i = 0 ; i < rawCompounds.size() ; i++){
252                        Compound myCompound = rawCompounds.get(i);
253                        Compound otherCompound = otherCompounds.get(i);
254                        if ( ! myCompound.equalsIgnoreCase(otherCompound))
255                                return false;
256                }
257                return true;
258        }
259
260        @Override
261        public int hashCode(){
262                String s = getSequenceAsString();
263                return s.hashCode();
264        }
265
        /**
         * Returns the inverse of this sequence, delegating to {@code SequenceMixin.inverse}.
         *
         * @return the view produced by {@code SequenceMixin.inverse(this)}
         */
        @Override
        public SequenceView<C> getInverse() {
                return SequenceMixin.inverse(this);
        }
274
275        /**
276         *
277         * @param bioBegin
278         * @param bioEnd
279         * @param strand
280         * @return
281         */
282        public String getSequenceAsString(Integer bioBegin, Integer bioEnd, Strand strand) {
283                SequenceAsStringHelper<C> sequenceAsStringHelper = new SequenceAsStringHelper<>();
284                return sequenceAsStringHelper.getSequenceAsString(this.parsedCompounds, compoundSet, bioBegin, bioEnd, strand);
285        }
286
287        /**
288         *
289         * @param bioBegin
290         * @param bioEnd
291         * @return
292         */
293        @Override
294        public SequenceView<C> getSubSequence(final Integer bioBegin, final Integer bioEnd) {
295                return new SequenceProxyView<>(UniprotProxySequenceReader.this, bioBegin, bioEnd);
296        }
297
298        /**
299         *
300         * @return
301         */
302        @Override
303        public Iterator<C> iterator() {
304                return this.parsedCompounds.iterator();
305        }
306
307        /**
308         *
309         * @return
310         */
311        @Override
312        public CompoundSet<C> getCompoundSet() {
313                return compoundSet;
314        }
315
316        /**
317         *
318         * @return
319         */
320        @Override
321        public AccessionID getAccession() {
322                AccessionID accessionID = new AccessionID();
323                if (uniprotDoc == null) {
324                        return accessionID;
325                }
326                try {
327                        Element uniprotElement = uniprotDoc.getDocumentElement();
328                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
329                        Element nameElement = XMLHelper.selectSingleElement(entryElement, "name");
330                        accessionID = new AccessionID(nameElement.getTextContent(), DataSource.UNIPROT);
331                } catch (XPathExpressionException e) {
332                        logger.error("Exception: ", e);
333                }
334                return accessionID;
335        }
336
337        /**
338         * Pull uniprot accessions associated with this sequence
339         * @return
340         * @throws XPathExpressionException
341         */
342        public List<AccessionID> getAccessions() throws XPathExpressionException {
343                List<AccessionID> accessionList = new ArrayList<>();
344                if (uniprotDoc == null) {
345                        return accessionList;
346                }
347                Element uniprotElement = uniprotDoc.getDocumentElement();
348                Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
349                List<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "accession");
350                for (Element element : keyWordElementList) {
351                        AccessionID accessionID = new AccessionID(element.getTextContent(), DataSource.UNIPROT);
352                        accessionList.add(accessionID);
353                }
354
355                return accessionList;
356        }
357
358        /**
359         * Pull uniprot protein aliases associated with this sequence
360         * Provided for backwards compatibility now that we support both
361         * gene and protein aliases via separate methods.
362         * @return
363         * @throws XPathExpressionException
364         */
365        public List<String> getAliases() throws XPathExpressionException {
366
367                return getProteinAliases();
368        }
369        /**
370         * Pull uniprot protein aliases associated with this sequence
371         * @return
372         * @throws XPathExpressionException
373         */
374        public List<String> getProteinAliases() throws XPathExpressionException {
375                List<String> aliasList = new ArrayList<>();
376                if (uniprotDoc == null) {
377                        return aliasList;
378                }
379                Element uniprotElement = uniprotDoc.getDocumentElement();
380                Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
381                Element proteinElement = XMLHelper.selectSingleElement(entryElement, "protein");
382                
383                List<Element> keyWordElementList;
384                getProteinAliasesFromNameGroup(aliasList, proteinElement);
385                
386                keyWordElementList = XMLHelper.selectElements(proteinElement, "component");
387                for (Element element : keyWordElementList) {
388                        getProteinAliasesFromNameGroup(aliasList, element);
389                }
390
391                keyWordElementList = XMLHelper.selectElements(proteinElement, "domain");
392                for (Element element : keyWordElementList) {
393                        getProteinAliasesFromNameGroup(aliasList, element);
394                }
395
396                keyWordElementList = XMLHelper.selectElements(proteinElement, "submittedName");
397                for (Element element : keyWordElementList) {
398                        getProteinAliasesFromNameGroup(aliasList, element);
399                }
400
401                keyWordElementList = XMLHelper.selectElements(proteinElement, "cdAntigenName");
402                for (Element element : keyWordElementList) {
403                        String cdAntigenName = element.getTextContent();
404                        if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) {
405                                aliasList.add(cdAntigenName);
406                        }
407                }
408                        
409                keyWordElementList = XMLHelper.selectElements(proteinElement, "innName");
410                for (Element element : keyWordElementList) {
411                        String cdAntigenName = element.getTextContent();
412                        if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) {
413                                aliasList.add(cdAntigenName);
414                        }
415                }
416
417                keyWordElementList = XMLHelper.selectElements(proteinElement, "biotechName");
418                for (Element element : keyWordElementList) {
419                        String cdAntigenName = element.getTextContent();
420                        if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) {
421                                aliasList.add(cdAntigenName);
422                        }
423                }
424
425                keyWordElementList = XMLHelper.selectElements(proteinElement, "allergenName");
426                for (Element element : keyWordElementList) {
427                        String cdAntigenName = element.getTextContent();
428                        if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) {
429                                aliasList.add(cdAntigenName);
430                        }
431                }
432
433                return aliasList;
434        }
435
436        /**
437         * @param aliasList
438         * @param proteinElement
439         * @throws XPathExpressionException
440         */
441        private void getProteinAliasesFromNameGroup(List<String> aliasList, Element proteinElement)
442                        throws XPathExpressionException {
443                List<Element> keyWordElementList = XMLHelper.selectElements(proteinElement, "alternativeName");
444                for (Element element : keyWordElementList) {
445                        getProteinAliasesFromElement(aliasList, element);
446                }
447                
448                keyWordElementList = XMLHelper.selectElements(proteinElement, "recommendedName");
449                for (Element element : keyWordElementList) {
450                        getProteinAliasesFromElement(aliasList, element);
451                }
452        }
453
454        /**
455         * @param aliasList
456         * @param element
457         * @throws XPathExpressionException
458         */
459        private void getProteinAliasesFromElement(List<String> aliasList, Element element)
460                        throws XPathExpressionException {
461                Element fullNameElement = XMLHelper.selectSingleElement(element, "fullName");
462                aliasList.add(fullNameElement.getTextContent());
463                Element shortNameElement = XMLHelper.selectSingleElement(element, "shortName");
464                if(null != shortNameElement) {
465                        String shortName = shortNameElement.getTextContent();
466                        if(null != shortName && !shortName.trim().isEmpty()) {
467                                aliasList.add(shortName);
468                        }
469                }
470        }
471
472        /**
473         * Pull uniprot gene aliases associated with this sequence
474         * @return
475         * @throws XPathExpressionException
476         */
477        public List<String> getGeneAliases() throws XPathExpressionException {
478                List<String> aliasList = new ArrayList<>();
479                if (uniprotDoc == null) {
480                        return aliasList;
481                }
482                Element uniprotElement = uniprotDoc.getDocumentElement();
483                Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
484                List<Element> proteinElements = XMLHelper.selectElements(entryElement, "gene");
485                for(Element proteinElement : proteinElements) {
486                        List<Element> keyWordElementList = XMLHelper.selectElements(proteinElement, "name");
487                        for (Element element : keyWordElementList) {
488                                aliasList.add(element.getTextContent());
489                        }
490                }
491                return aliasList;
492        }
493
        /**
         * Not implemented for this reader.
         *
         * @param compounds the compounds to count (ignored)
         * @return never returns normally
         * @throws UnsupportedOperationException always
         */
        @Override
        public int countCompounds(C... compounds) {
                throw new UnsupportedOperationException("Not supported yet.");
        }
503
504        /**
505         *
506         * @param accession
507         * @return
508         * @throws IOException
509         */
510        private Document getUniprotXML(String accession) throws IOException, CompoundNotFoundException {
511                StringBuilder sb = new StringBuilder();
512                // try in cache
513                if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0) {
514                        sb = fetchFromCache(accession);
515                }
516
517                // http://www.uniprot.org/uniprot/?query=SORBIDRAFT_03g027040&format=xml
518                if (sb.length() == 0) {
519                        String uniprotURL = getUniprotbaseURL() + "/uniprot/" + accession.toUpperCase() + ".xml";
520                        logger.info("Loading: {}", uniprotURL);
521                        sb = fetchUniprotXML(uniprotURL);
522
523                        int index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath
524                        if (index != -1) {
525                                int lastIndex = sb.indexOf(">", index);
526                                sb.replace(index, lastIndex, "");
527                        }
528                        if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0)
529                                writeCache(sb,accession);
530                }
531
532                logger.info("Load complete");
533                try {
534                        //       logger.debug(sb.toString());
535                        Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(sb.toString().getBytes()));
536                        return document;
537                } catch (SAXException | ParserConfigurationException e) {
538                        logger.error("Exception on xml parse of: {}", sb.toString());
539                }
540                return null;
541        }
542
543        private void writeCache(StringBuilder sb, String accession) throws IOException {
544                File f = new File(uniprotDirectoryCache + File.separatorChar + accession + ".xml");
545                try (FileWriter fw = new FileWriter(f)) {
546                        fw.write(sb.toString());
547                }
548        }
549
550        /**
551         * Open a URL connection.
552         *
553         * Follows redirects.
554         * @param url
555         * @throws IOException
556         */
557        private static HttpURLConnection openURLConnection(URL url) throws IOException {
558                // This method should be moved to a utility class in BioJava 5.0
559
560                final int timeout = 5000;
561                final String useragent = "BioJava";
562
563                HttpURLConnection conn = (HttpURLConnection) url.openConnection();
564                conn.setRequestProperty("User-Agent", useragent);
565                conn.setInstanceFollowRedirects(true);
566                conn.setConnectTimeout(timeout);
567                conn.setReadTimeout(timeout);
568
569                int status = conn.getResponseCode();
570                while (status == HttpURLConnection.HTTP_MOVED_TEMP
571                                || status == HttpURLConnection.HTTP_MOVED_PERM
572                                || status == HttpURLConnection.HTTP_SEE_OTHER) {
573                        // Redirect!
574                        String newUrl = conn.getHeaderField("Location");
575
576                        if(newUrl.equals(url.toString())) {
577                                throw new IOException("Cyclic redirect detected at "+newUrl);
578                        }
579
580                        // Preserve cookies
581                        String cookies = conn.getHeaderField("Set-Cookie");
582
583                        // open the new connection again
584                        url = new URL(newUrl);
585                        conn.disconnect();
586                        conn = (HttpURLConnection) url.openConnection();
587                        if(cookies != null) {
588                                conn.setRequestProperty("Cookie", cookies);
589                        }
590                        conn.addRequestProperty("User-Agent", useragent);
591                        conn.setInstanceFollowRedirects(true);
592                        conn.setConnectTimeout(timeout);
593                        conn.setReadTimeout(timeout);
594                        conn.connect();
595
596                        status = conn.getResponseCode();
597
598                        logger.info("Redirecting from {} to {}", url, newUrl);
599                }
600                conn.connect();
601
602                return conn;
603        }
604
605        private StringBuilder fetchUniprotXML(String uniprotURL)
606                        throws IOException, CompoundNotFoundException {
607
608                StringBuilder sb = new StringBuilder();
609                URL uniprot = new URL(uniprotURL);
610                int attempt = 5;
611                List<String> errorCodes = new ArrayList<>();
612                while(attempt > 0) {
613                        HttpURLConnection uniprotConnection = openURLConnection(uniprot);
614                        int statusCode = uniprotConnection.getResponseCode();
615                        if (statusCode == HttpURLConnection.HTTP_OK) {
616                                BufferedReader in = new BufferedReader(
617                                                new InputStreamReader(
618                                                uniprotConnection.getInputStream()));
619                                String inputLine;
620
621                                while ((inputLine = in.readLine()) != null) {
622                                        sb.append(inputLine);
623                                }
624                                in.close();
625                                return sb;
626                        }
627                        attempt--;
628                        errorCodes.add(String.valueOf(statusCode));
629                }
630                throw new RemoteException("Couldn't fetch accession from the url " + uniprotURL + " error codes on 5 attempts are " + errorCodes.toString());
631        }
632
633        /**
634         * @param key
635         * @return A string containing the contents of entry specified by key and if not found returns an empty string
636         * @throws FileNotFoundException
637         * @throws IOException
638         */
639        private StringBuilder fetchFromCache(String key)
640                        throws IOException {
641                int index;
642                File f = new File(uniprotDirectoryCache + File.separatorChar + key + ".xml");
643                StringBuilder sb = new StringBuilder();
644                if (f.exists()) {
645                        char[] data;
646                        try (FileReader fr = new FileReader(f)) {
647                                int size = (int) f.length();
648                                data = new char[size];
649                                fr.read(data);
650                        }
651                        sb.append(data);
652                        index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath
653                        if (index != -1) {
654                                int lastIndex = sb.indexOf(">", index);
655                                sb.replace(index, lastIndex, "");
656                        }
657                }
658                return sb;
659        }
660
661        /**
662         *
663         * @param uniprotDoc
664         * @return
665         */
666        private String getSequence(Document uniprotDoc)  {
667
668                try {
669                        Element uniprotElement = uniprotDoc.getDocumentElement();
670                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
671                        Element sequenceElement = XMLHelper.selectSingleElement(entryElement, "sequence");
672
673                        String seqdata = sequenceElement.getTextContent();
674
675                        return seqdata;
676                } catch (XPathExpressionException e) {
677                        logger.error("Problems while parsing sequence in UniProt XML: {}. Sequence will be blank.", e.getMessage());
678                        return "";
679                }
680        }
681
682        /**
683         * The current UniProt URL to deal with caching issues. www.uniprot.org is load balanced
684         * but you can access pir.uniprot.org directly.
685         * @return the uniprotbaseURL
686         */
687        public static String getUniprotbaseURL() {
688                return uniprotbaseURL;
689        }
690
691        /**
692         * @param aUniprotbaseURL the uniprotbaseURL to set
693         */
694        public static void setUniprotbaseURL(String aUniprotbaseURL) {
695                uniprotbaseURL = aUniprotbaseURL;
696        }
697
698        /**
699         * Local directory cache of XML that can be downloaded
700         * @return the uniprotDirectoryCache
701         */
702        public static String getUniprotDirectoryCache() {
703                return uniprotDirectoryCache;
704        }
705
706        /**
707         * @param aUniprotDirectoryCache the uniprotDirectoryCache to set
708         */
709        public static void setUniprotDirectoryCache(String aUniprotDirectoryCache) {
710                File f = new File(aUniprotDirectoryCache);
711                if (!f.exists()) {
712                        f.mkdirs();
713                }
714                uniprotDirectoryCache = aUniprotDirectoryCache;
715        }
716
717
718        /**
719         * Get the gene name associated with this sequence.
720         * @return
721         */
722        public String getGeneName() {
723                if (uniprotDoc == null) {
724                        return "";
725                }
726                try {
727                        Element uniprotElement = uniprotDoc.getDocumentElement();
728                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
729                        Element geneElement = XMLHelper.selectSingleElement(entryElement, "gene");
730                        if (geneElement == null) {
731                                return "";
732                        }
733                        Element nameElement = XMLHelper.selectSingleElement(geneElement, "name");
734                        if (nameElement == null) {
735                                return "";
736                        }
737                        return nameElement.getTextContent();
738                } catch (XPathExpressionException e) {
739                        logger.error("Problems while parsing gene name in UniProt XML: {}. Gene name will be blank.",e.getMessage());
740                        return "";
741                }
742        }
743
744        /**
745         * Get the organism name assigned to this sequence
746         * @return
747         */
748        public String getOrganismName() {
749                if (uniprotDoc == null) {
750                        return "";
751                }
752                try {
753                        Element uniprotElement = uniprotDoc.getDocumentElement();
754                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
755                        Element organismElement = XMLHelper.selectSingleElement(entryElement, "organism");
756                        if (organismElement == null) {
757                                return "";
758                        }
759                        Element nameElement = XMLHelper.selectSingleElement(organismElement, "name");
760                        if (nameElement == null) {
761                                return "";
762                        }
763                        return nameElement.getTextContent();
764                } catch (XPathExpressionException e) {
765                        logger.error("Problems while parsing organism name in UniProt XML: {}. Organism name will be blank.",e.getMessage());
766                        return "";
767                }
768
769        }
770
771        /**
772         * Pull UniProt key words which is a mixed bag of words associated with this sequence
773         * @return
774         */
775        @Override
776        public List<String> getKeyWords() {
777                List<String> keyWordsList = new ArrayList<>();
778                if (uniprotDoc == null) {
779                        return keyWordsList;
780                }
781                try {
782                        Element uniprotElement = uniprotDoc.getDocumentElement();
783
784                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
785                        List<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "keyword");
786                        for (Element element : keyWordElementList) {
787                                keyWordsList.add(element.getTextContent());
788                        }
789                } catch (XPathExpressionException e) {
790                        logger.error("Problems while parsing keywords in UniProt XML: {}. No keywords will be available.",e.getMessage());
791                        return new ArrayList<>();
792                }
793
794                return keyWordsList;
795        }
796
797        /**
798         * The Uniprot mappings to other database identifiers for this sequence
799         * @return
800         */
801        @Override
802        public Map<String, List<DBReferenceInfo>> getDatabaseReferences()  {
803                Map<String, List<DBReferenceInfo>> databaseReferencesHashMap = new LinkedHashMap<>();
804                if (uniprotDoc == null) {
805                        return databaseReferencesHashMap;
806                }
807
808                try {
809                        Element uniprotElement = uniprotDoc.getDocumentElement();
810                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
811                        List<Element> dbreferenceElementList = XMLHelper.selectElements(entryElement, "dbReference");
812                        for (Element element : dbreferenceElementList) {
813                                String type = element.getAttribute("type");
814                                String id = element.getAttribute("id");
815                                List<DBReferenceInfo> idlist = databaseReferencesHashMap.get(type);
816                                if (idlist == null) {
817                                        idlist = new ArrayList<>();
818                                        databaseReferencesHashMap.put(type, idlist);
819                                }
820                                DBReferenceInfo dbreferenceInfo = new DBReferenceInfo(type, id);
821                                List<Element> propertyElementList = XMLHelper.selectElements(element, "property");
822                                for (Element propertyElement : propertyElementList) {
823                                        String propertyType = propertyElement.getAttribute("type");
824                                        String propertyValue = propertyElement.getAttribute("value");
825                                        dbreferenceInfo.addProperty(propertyType, propertyValue);
826                                }
827
828                                idlist.add(dbreferenceInfo);
829                        }
830                } catch (XPathExpressionException e) {
831                        logger.error("Problems while parsing db references in UniProt XML: {}. No db references will be available.",e.getMessage());
832                        return new LinkedHashMap<>();
833                }
834
835                return databaseReferencesHashMap;
836        }
837}