001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 *
 * @author Scooter Willis
023 *
024 */
025package org.biojava.nbio.core.sequence.loader;
026
027import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
028import org.biojava.nbio.core.sequence.AccessionID;
029import org.biojava.nbio.core.sequence.DataSource;
030import org.biojava.nbio.core.sequence.ProteinSequence;
031import org.biojava.nbio.core.sequence.Strand;
032import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
033import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
034import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
035import org.biojava.nbio.core.sequence.features.DatabaseReferenceInterface;
036import org.biojava.nbio.core.sequence.features.FeaturesKeyWordInterface;
037import org.biojava.nbio.core.sequence.storage.SequenceAsStringHelper;
038import org.biojava.nbio.core.sequence.template.*;
039import org.biojava.nbio.core.util.Equals;
040import org.biojava.nbio.core.util.XMLHelper;
041import org.slf4j.Logger;
042import org.slf4j.LoggerFactory;
043import org.w3c.dom.Document;
044import org.w3c.dom.Element;
045import org.xml.sax.SAXException;
046
047import javax.xml.parsers.ParserConfigurationException;
048import javax.xml.xpath.XPathExpressionException;
049import java.io.*;
050import java.net.HttpURLConnection;
051import java.net.URL;
052import java.rmi.RemoteException;
053import java.util.*;
054import java.util.regex.Pattern;
055
/**
 *
 * A ProxySequenceReader that is given a UniProt ID and, when passed to a ProteinSequence, supplies the sequence data
 * and the other data elements that UniProt associates with that protein. This is an example of how to map external
 * databases of proteins and features to the BioJava3 ProteinSequence.
 * It is important to call {@link #setUniprotDirectoryCache(String)} so that the XML files are cached and do not need
 * to be reloaded each time. The cache itself is not managed by this class.
 * @param <C> the compound type of the sequence
 */
065public class UniprotProxySequenceReader<C extends Compound> implements ProxySequenceReader<C>, FeaturesKeyWordInterface, DatabaseReferenceInterface {
066
067        private final static Logger logger = LoggerFactory.getLogger(UniprotProxySequenceReader.class);
068
069        /*
070         * Taken from http://www.uniprot.org/help/accession_numbers
071         */
072        private static final String SPID_PATTERN = "[OPQ][0-9][A-Z0-9]{3}[0-9]";
073        private static final String TREMBLID_PATTERN = "[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}";
074        public static final Pattern UP_AC_PATTERN = Pattern.compile("(" + SPID_PATTERN + "|" + TREMBLID_PATTERN + ")");
075
076        public static final String DEFAULT_UNIPROT_BASE_URL = "https://www.uniprot.org";
077
078        private static String uniprotbaseURL = DEFAULT_UNIPROT_BASE_URL;
079        private static String uniprotDirectoryCache = null;
080        private String sequence;
081        private CompoundSet<C> compoundSet;
082        private List<C> parsedCompounds = new ArrayList<C>();
083        Document uniprotDoc;
084
085        /**
086         * The UniProt id is used to retrieve the UniProt XML which is then parsed as a DOM object
087         * so we know everything about the protein. If an error occurs throw an exception. We could
088         * have a bad uniprot id or network error
089         * @param accession
090         * @param compoundSet
091         * @throws CompoundNotFoundException
092         * @throws IOException if problems while reading the UniProt XML
093         */
094        public UniprotProxySequenceReader(String accession, CompoundSet<C> compoundSet) throws CompoundNotFoundException, IOException {
095                if (!UP_AC_PATTERN.matcher(accession.toUpperCase()).matches()) {
096                        throw new IllegalArgumentException("Accession provided " + accession + " doesn't comply with the uniprot acession pattern.");
097                }
098                setCompoundSet(compoundSet);
099                uniprotDoc = this.getUniprotXML(accession);
100                String seq = this.getSequence(uniprotDoc);
101                setContents(seq);
102        }
103
104        /**
105         * The xml is passed in as a DOM object so we know everything about the protein.
106         *  If an error occurs throw an exception. We could have a bad uniprot id
107         * @param document
108         * @param compoundSet
109         * @throws CompoundNotFoundException
110         */
111        public UniprotProxySequenceReader(Document document, CompoundSet<C> compoundSet) throws CompoundNotFoundException {
112                setCompoundSet(compoundSet);
113                uniprotDoc = document;
114                String seq = this.getSequence(uniprotDoc);
115                setContents(seq);
116        }
117        /**
118         * The passed in xml is parsed as a DOM object so we know everything about the protein.
119         *  If an error occurs throw an exception. We could have a bad uniprot id
120         * @param xml
121         * @param compoundSet
122         * @return UniprotProxySequenceReader
123         * @throws Exception
124         */
125        public static <C extends Compound> UniprotProxySequenceReader<C> parseUniprotXMLString(String xml, CompoundSet<C> compoundSet) {
126                try {
127                        Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(xml.getBytes()));
128                        return new UniprotProxySequenceReader<C>(document, compoundSet);
129                } catch (Exception e) {
130                        logger.error("Exception on xml parse of: {}", xml);
131                }
132                return null;
133        }
134
135        @Override
136        public void setCompoundSet(CompoundSet<C> compoundSet) {
137                this.compoundSet = compoundSet;
138        }
139
140        /**
141         * Once the sequence is retrieved set the contents and make sure everything this is valid
142         * Some uniprot records contain white space in the sequence. We must strip it out so setContents doesn't fail.
143         * @param sequence
144         * @throws CompoundNotFoundException
145         */
146        @Override
147        public void setContents(String sequence) throws CompoundNotFoundException {
148                // Horrendously inefficient - pretty much the way the old BJ did things.
149                // TODO Should be optimised.
150                // NOTE This chokes on whitespace in the sequence, so whitespace is stripped
151                this.sequence = sequence.replaceAll("\\s", "").trim();
152                this.parsedCompounds.clear();
153                for (int i = 0; i < this.sequence.length();) {
154                        String compoundStr = null;
155                        C compound = null;
156                        for (int compoundStrLength = 1; compound == null && compoundStrLength <= compoundSet.getMaxSingleCompoundStringLength(); compoundStrLength++) {
157                                compoundStr = this.sequence.substring(i, i + compoundStrLength);
158                                compound = compoundSet.getCompoundForString(compoundStr);
159                        }
160                        if (compound == null) {
161                                throw new CompoundNotFoundException("Compound "+compoundStr+" not found");
162                        } else {
163                                i += compoundStr.length();
164                        }
165                        this.parsedCompounds.add(compound);
166                }
167        }
168
169        /**
170         * The sequence length
171         * @return
172         */
173        @Override
174        public int getLength() {
175                return this.parsedCompounds.size();
176        }
177
178        /**
179         *
180         * @param position
181         * @return
182         */
183        @Override
184        public C getCompoundAt(int position) {
185                return this.parsedCompounds.get(position - 1);
186        }
187
188        /**
189         *
190         * @param compound
191         * @return
192         */
193        @Override
194        public int getIndexOf(C compound) {
195                return this.parsedCompounds.indexOf(compound) + 1;
196        }
197
198        /**
199         *
200         * @param compound
201         * @return
202         */
203        @Override
204        public int getLastIndexOf(C compound) {
205                return this.parsedCompounds.lastIndexOf(compound) + 1;
206        }
207
208        /**
209         *
210         * @return
211         */
212        @Override
213        public String toString() {
214                return getSequenceAsString();
215        }
216
217        /**
218         *
219         * @return
220         */
221        @Override
222        public String getSequenceAsString() {
223                return sequence;
224        }
225
226        /**
227         *
228         * @return
229         */
230        @Override
231        public List<C> getAsList() {
232                return this.parsedCompounds;
233        }
234
235        @Override
236        public boolean equals(Object o){
237
238                if(! Equals.classEqual(this, o)) {
239                        return false;
240                }
241                @SuppressWarnings("unchecked")
242                Sequence<C> other = (Sequence<C>)o;
243                if ( other.getCompoundSet() != getCompoundSet())
244                        return false;
245
246                List<C> rawCompounds = getAsList();
247                List<C> otherCompounds = other.getAsList();
248
249                if ( rawCompounds.size() != otherCompounds.size())
250                        return false;
251
252                for (int i = 0 ; i < rawCompounds.size() ; i++){
253                        Compound myCompound = rawCompounds.get(i);
254                        Compound otherCompound = otherCompounds.get(i);
255                        if ( ! myCompound.equalsIgnoreCase(otherCompound))
256                                return false;
257                }
258                return true;
259        }
260
261        @Override
262        public int hashCode(){
263                String s = getSequenceAsString();
264                return s.hashCode();
265        }
266
267        /**
268         *
269         * @return
270         */
271        @Override
272        public SequenceView<C> getInverse() {
273                return SequenceMixin.inverse(this);
274        }
275
276        /**
277         *
278         * @param bioBegin
279         * @param bioEnd
280         * @param strand
281         * @return
282         */
283        public String getSequenceAsString(Integer bioBegin, Integer bioEnd, Strand strand) {
284                SequenceAsStringHelper<C> sequenceAsStringHelper = new SequenceAsStringHelper<C>();
285                return sequenceAsStringHelper.getSequenceAsString(this.parsedCompounds, compoundSet, bioBegin, bioEnd, strand);
286        }
287
288        /**
289         *
290         * @param bioBegin
291         * @param bioEnd
292         * @return
293         */
294        @Override
295        public SequenceView<C> getSubSequence(final Integer bioBegin, final Integer bioEnd) {
296                return new SequenceProxyView<C>(UniprotProxySequenceReader.this, bioBegin, bioEnd);
297        }
298
299        /**
300         *
301         * @return
302         */
303        @Override
304        public Iterator<C> iterator() {
305                return this.parsedCompounds.iterator();
306        }
307
308        /**
309         *
310         * @return
311         */
312        @Override
313        public CompoundSet<C> getCompoundSet() {
314                return compoundSet;
315        }
316
317        /**
318         *
319         * @return
320         */
321        @Override
322        public AccessionID getAccession() {
323                AccessionID accessionID = new AccessionID();
324                if (uniprotDoc == null) {
325                        return accessionID;
326                }
327                try {
328                        Element uniprotElement = uniprotDoc.getDocumentElement();
329                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
330                        Element nameElement = XMLHelper.selectSingleElement(entryElement, "name");
331                        accessionID = new AccessionID(nameElement.getTextContent(), DataSource.UNIPROT);
332                } catch (XPathExpressionException e) {
333                        logger.error("Exception: ", e);
334                }
335                return accessionID;
336        }
337
338        /**
339         * Pull uniprot accessions associated with this sequence
340         * @return
341         * @throws XPathExpressionException
342         */
343        public ArrayList<AccessionID> getAccessions() throws XPathExpressionException {
344                ArrayList<AccessionID> accessionList = new ArrayList<AccessionID>();
345                if (uniprotDoc == null) {
346                        return accessionList;
347                }
348                Element uniprotElement = uniprotDoc.getDocumentElement();
349                Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
350                ArrayList<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "accession");
351                for (Element element : keyWordElementList) {
352                        AccessionID accessionID = new AccessionID(element.getTextContent(), DataSource.UNIPROT);
353                        accessionList.add(accessionID);
354                }
355
356                return accessionList;
357        }
358
359        /**
360         * Pull uniprot protein aliases associated with this sequence
361         * Provided for backwards compatibility now that we support both
362         * gene and protein aliases via separate methods.
363         * @return
364         * @throws XPathExpressionException
365         */
366        public ArrayList<String> getAliases() throws XPathExpressionException {
367
368                return getProteinAliases();
369        }
370        /**
371         * Pull uniprot protein aliases associated with this sequence
372         * @return
373         * @throws XPathExpressionException
374         */
375        public ArrayList<String> getProteinAliases() throws XPathExpressionException {
376                ArrayList<String> aliasList = new ArrayList<String>();
377                if (uniprotDoc == null) {
378                        return aliasList;
379                }
380                Element uniprotElement = uniprotDoc.getDocumentElement();
381                Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
382                Element proteinElement = XMLHelper.selectSingleElement(entryElement, "protein");
383                
384                ArrayList<Element> keyWordElementList;
385                getProteinAliasesFromNameGroup(aliasList, proteinElement);
386                
387                keyWordElementList = XMLHelper.selectElements(proteinElement, "component");
388                for (Element element : keyWordElementList) {
389                        getProteinAliasesFromNameGroup(aliasList, element);
390                }
391
392                keyWordElementList = XMLHelper.selectElements(proteinElement, "domain");
393                for (Element element : keyWordElementList) {
394                        getProteinAliasesFromNameGroup(aliasList, element);
395                }
396
397                keyWordElementList = XMLHelper.selectElements(proteinElement, "submittedName");
398                for (Element element : keyWordElementList) {
399                        getProteinAliasesFromNameGroup(aliasList, element);
400                }
401
402                keyWordElementList = XMLHelper.selectElements(proteinElement, "cdAntigenName");
403                for (Element element : keyWordElementList) {
404                        String cdAntigenName = element.getTextContent();
405                        if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) {
406                                aliasList.add(cdAntigenName);
407                        }
408                }
409                        
410                keyWordElementList = XMLHelper.selectElements(proteinElement, "innName");
411                for (Element element : keyWordElementList) {
412                        String cdAntigenName = element.getTextContent();
413                        if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) {
414                                aliasList.add(cdAntigenName);
415                        }
416                }
417
418                keyWordElementList = XMLHelper.selectElements(proteinElement, "biotechName");
419                for (Element element : keyWordElementList) {
420                        String cdAntigenName = element.getTextContent();
421                        if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) {
422                                aliasList.add(cdAntigenName);
423                        }
424                }
425
426                keyWordElementList = XMLHelper.selectElements(proteinElement, "allergenName");
427                for (Element element : keyWordElementList) {
428                        String cdAntigenName = element.getTextContent();
429                        if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) {
430                                aliasList.add(cdAntigenName);
431                        }
432                }
433
434                return aliasList;
435        }
436
437        /**
438         * @param aliasList
439         * @param proteinElement
440         * @throws XPathExpressionException
441         */
442        private void getProteinAliasesFromNameGroup(ArrayList<String> aliasList, Element proteinElement)
443                        throws XPathExpressionException {
444                ArrayList<Element> keyWordElementList = XMLHelper.selectElements(proteinElement, "alternativeName");
445                for (Element element : keyWordElementList) {
446                        getProteinAliasesFromElement(aliasList, element);
447                }
448                
449                keyWordElementList = XMLHelper.selectElements(proteinElement, "recommendedName");
450                for (Element element : keyWordElementList) {
451                        getProteinAliasesFromElement(aliasList, element);
452                }
453        }
454
455        /**
456         * @param aliasList
457         * @param element
458         * @throws XPathExpressionException
459         */
460        private void getProteinAliasesFromElement(ArrayList<String> aliasList, Element element)
461                        throws XPathExpressionException {
462                Element fullNameElement = XMLHelper.selectSingleElement(element, "fullName");
463                aliasList.add(fullNameElement.getTextContent());
464                Element shortNameElement = XMLHelper.selectSingleElement(element, "shortName");
465                if(null != shortNameElement) {
466                        String shortName = shortNameElement.getTextContent();
467                        if(null != shortName && !shortName.trim().isEmpty()) {
468                                aliasList.add(shortName);
469                        }
470                }
471        }
472
473        /**
474         * Pull uniprot gene aliases associated with this sequence
475         * @return
476         * @throws XPathExpressionException
477         */
478        public ArrayList<String> getGeneAliases() throws XPathExpressionException {
479                ArrayList<String> aliasList = new ArrayList<String>();
480                if (uniprotDoc == null) {
481                        return aliasList;
482                }
483                Element uniprotElement = uniprotDoc.getDocumentElement();
484                Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
485                ArrayList<Element> proteinElements = XMLHelper.selectElements(entryElement, "gene");
486                for(Element proteinElement : proteinElements) {
487                        ArrayList<Element> keyWordElementList = XMLHelper.selectElements(proteinElement, "name");
488                        for (Element element : keyWordElementList) {
489                                aliasList.add(element.getTextContent());
490                        }
491                }
492                return aliasList;
493        }
494
495        /**
496         *
497         * @param compounds
498         * @return
499         */
500        @Override
501        public int countCompounds(C... compounds) {
502                throw new UnsupportedOperationException("Not supported yet.");
503        }
504
505        /**
506         *
507         * @param accession
508         * @return
509         * @throws IOException
510         */
511        private Document getUniprotXML(String accession) throws IOException, CompoundNotFoundException {
512                StringBuilder sb = new StringBuilder();
513                // try in cache
514                if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0) {
515                        sb = fetchFromCache(accession);
516                }
517
518                // http://www.uniprot.org/uniprot/?query=SORBIDRAFT_03g027040&format=xml
519                if (sb.length() == 0) {
520                        String uniprotURL = getUniprotbaseURL() + "/uniprot/" + accession.toUpperCase() + ".xml";
521                        logger.info("Loading: {}", uniprotURL);
522                        sb = fetchUniprotXML(uniprotURL);
523
524                        int index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath
525                        if (index != -1) {
526                                int lastIndex = sb.indexOf(">", index);
527                                sb.replace(index, lastIndex, "");
528                        }
529                        if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0)
530                                writeCache(sb,accession);
531                }
532
533                logger.info("Load complete");
534                try {
535                        //       logger.debug(sb.toString());
536                        Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(sb.toString().getBytes()));
537                        return document;
538                } catch (SAXException | ParserConfigurationException e) {
539                        logger.error("Exception on xml parse of: {}", sb.toString());
540                }
541                return null;
542        }
543
544        private void writeCache(StringBuilder sb, String accession) throws IOException {
545                File f = new File(uniprotDirectoryCache + File.separatorChar + accession + ".xml");
546                try (FileWriter fw = new FileWriter(f)) {
547                        fw.write(sb.toString());
548                }
549        }
550
551        /**
552         * Open a URL connection.
553         *
554         * Follows redirects.
555         * @param url
556         * @throws IOException
557         */
558        private static HttpURLConnection openURLConnection(URL url) throws IOException {
559                // This method should be moved to a utility class in BioJava 5.0
560
561                final int timeout = 5000;
562                final String useragent = "BioJava";
563
564                HttpURLConnection conn = (HttpURLConnection) url.openConnection();
565                conn.setRequestProperty("User-Agent", useragent);
566                conn.setInstanceFollowRedirects(true);
567                conn.setConnectTimeout(timeout);
568                conn.setReadTimeout(timeout);
569
570                int status = conn.getResponseCode();
571                while (status == HttpURLConnection.HTTP_MOVED_TEMP
572                                || status == HttpURLConnection.HTTP_MOVED_PERM
573                                || status == HttpURLConnection.HTTP_SEE_OTHER) {
574                        // Redirect!
575                        String newUrl = conn.getHeaderField("Location");
576
577                        if(newUrl.equals(url.toString())) {
578                                throw new IOException("Cyclic redirect detected at "+newUrl);
579                        }
580
581                        // Preserve cookies
582                        String cookies = conn.getHeaderField("Set-Cookie");
583
584                        // open the new connection again
585                        url = new URL(newUrl);
586                        conn.disconnect();
587                        conn = (HttpURLConnection) url.openConnection();
588                        if(cookies != null) {
589                                conn.setRequestProperty("Cookie", cookies);
590                        }
591                        conn.addRequestProperty("User-Agent", useragent);
592                        conn.setInstanceFollowRedirects(true);
593                        conn.setConnectTimeout(timeout);
594                        conn.setReadTimeout(timeout);
595                        conn.connect();
596
597                        status = conn.getResponseCode();
598
599                        logger.info("Redirecting from {} to {}", url, newUrl);
600                }
601                conn.connect();
602
603                return conn;
604        }
605
606        private StringBuilder fetchUniprotXML(String uniprotURL)
607                        throws IOException, CompoundNotFoundException {
608
609                StringBuilder sb = new StringBuilder();
610                URL uniprot = new URL(uniprotURL);
611                int attempt = 5;
612                List<String> errorCodes = new ArrayList<String>();
613                while(attempt > 0) {
614                        HttpURLConnection uniprotConnection = openURLConnection(uniprot);
615                        int statusCode = uniprotConnection.getResponseCode();
616                        if (statusCode == HttpURLConnection.HTTP_OK) {
617                                BufferedReader in = new BufferedReader(
618                                                new InputStreamReader(
619                                                uniprotConnection.getInputStream()));
620                                String inputLine;
621
622                                while ((inputLine = in.readLine()) != null) {
623                                        sb.append(inputLine);
624                                }
625                                in.close();
626                                return sb;
627                        }
628                        attempt--;
629                        errorCodes.add(String.valueOf(statusCode));
630                }
631                throw new RemoteException("Couldn't fetch accession from the url " + uniprotURL + " error codes on 5 attempts are " + errorCodes.toString());
632        }
633
634        /**
635         * @param key
636         * @return A string containing the contents of entry specified by key and if not found returns an empty string
637         * @throws FileNotFoundException
638         * @throws IOException
639         */
640        private StringBuilder fetchFromCache(String key)
641                        throws FileNotFoundException, IOException {
642                int index;
643                File f = new File(uniprotDirectoryCache + File.separatorChar + key + ".xml");
644                StringBuilder sb = new StringBuilder();
645                if (f.exists()) {
646                        char[] data;
647                        try (FileReader fr = new FileReader(f)) {
648                                int size = (int) f.length();
649                                data = new char[size];
650                                fr.read(data);
651                        }
652                        sb.append(data);
653                        index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath
654                        if (index != -1) {
655                                int lastIndex = sb.indexOf(">", index);
656                                sb.replace(index, lastIndex, "");
657                        }
658                }
659                return sb;
660        }
661
662        /**
663         *
664         * @param uniprotDoc
665         * @return
666         */
667        private String getSequence(Document uniprotDoc)  {
668
669                try {
670                        Element uniprotElement = uniprotDoc.getDocumentElement();
671                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
672                        Element sequenceElement = XMLHelper.selectSingleElement(entryElement, "sequence");
673
674                        String seqdata = sequenceElement.getTextContent();
675
676                        return seqdata;
677                } catch (XPathExpressionException e) {
678                        logger.error("Problems while parsing sequence in UniProt XML: {}. Sequence will be blank.", e.getMessage());
679                        return "";
680                }
681        }
682
683        /**
684         * The current UniProt URL to deal with caching issues. www.uniprot.org is load balanced
685         * but you can access pir.uniprot.org directly.
686         * @return the uniprotbaseURL
687         */
688        public static String getUniprotbaseURL() {
689                return uniprotbaseURL;
690        }
691
692        /**
693         * @param aUniprotbaseURL the uniprotbaseURL to set
694         */
695        public static void setUniprotbaseURL(String aUniprotbaseURL) {
696                uniprotbaseURL = aUniprotbaseURL;
697        }
698
699        /**
700         * Local directory cache of XML that can be downloaded
701         * @return the uniprotDirectoryCache
702         */
703        public static String getUniprotDirectoryCache() {
704                return uniprotDirectoryCache;
705        }
706
707        /**
708         * @param aUniprotDirectoryCache the uniprotDirectoryCache to set
709         */
710        public static void setUniprotDirectoryCache(String aUniprotDirectoryCache) {
711                File f = new File(aUniprotDirectoryCache);
712                if (!f.exists()) {
713                        f.mkdirs();
714                }
715                uniprotDirectoryCache = aUniprotDirectoryCache;
716        }
717
718
719        /**
720         * Get the gene name associated with this sequence.
721         * @return
722         */
723        public String getGeneName() {
724                if (uniprotDoc == null) {
725                        return "";
726                }
727                try {
728                        Element uniprotElement = uniprotDoc.getDocumentElement();
729                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
730                        Element geneElement = XMLHelper.selectSingleElement(entryElement, "gene");
731                        if (geneElement == null) {
732                                return "";
733                        }
734                        Element nameElement = XMLHelper.selectSingleElement(geneElement, "name");
735                        if (nameElement == null) {
736                                return "";
737                        }
738                        return nameElement.getTextContent();
739                } catch (XPathExpressionException e) {
740                        logger.error("Problems while parsing gene name in UniProt XML: {}. Gene name will be blank.",e.getMessage());
741                        return "";
742                }
743        }
744
745        /**
746         * Get the organism name assigned to this sequence
747         * @return
748         */
749        public String getOrganismName() {
750                if (uniprotDoc == null) {
751                        return "";
752                }
753                try {
754                        Element uniprotElement = uniprotDoc.getDocumentElement();
755                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
756                        Element organismElement = XMLHelper.selectSingleElement(entryElement, "organism");
757                        if (organismElement == null) {
758                                return "";
759                        }
760                        Element nameElement = XMLHelper.selectSingleElement(organismElement, "name");
761                        if (nameElement == null) {
762                                return "";
763                        }
764                        return nameElement.getTextContent();
765                } catch (XPathExpressionException e) {
766                        logger.error("Problems while parsing organism name in UniProt XML: {}. Organism name will be blank.",e.getMessage());
767                        return "";
768                }
769
770        }
771
772        /**
773         * Pull UniProt key words which is a mixed bag of words associated with this sequence
774         * @return
775         */
776        @Override
777        public ArrayList<String> getKeyWords() {
778                ArrayList<String> keyWordsList = new ArrayList<String>();
779                if (uniprotDoc == null) {
780                        return keyWordsList;
781                }
782                try {
783                        Element uniprotElement = uniprotDoc.getDocumentElement();
784
785                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
786                        ArrayList<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "keyword");
787                        for (Element element : keyWordElementList) {
788                                keyWordsList.add(element.getTextContent());
789                        }
790                } catch (XPathExpressionException e) {
791                        logger.error("Problems while parsing keywords in UniProt XML: {}. No keywords will be available.",e.getMessage());
792                        return new ArrayList<String>();
793                }
794
795                return keyWordsList;
796        }
797
798        /**
799         * The Uniprot mappings to other database identifiers for this sequence
800         * @return
801         */
802        @Override
803        public Map<String, List<DBReferenceInfo>> getDatabaseReferences()  {
804                Map<String, List<DBReferenceInfo>> databaseReferencesHashMap = new LinkedHashMap<>();
805                if (uniprotDoc == null) {
806                        return databaseReferencesHashMap;
807                }
808
809                try {
810                        Element uniprotElement = uniprotDoc.getDocumentElement();
811                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
812                        ArrayList<Element> dbreferenceElementList = XMLHelper.selectElements(entryElement, "dbReference");
813                        for (Element element : dbreferenceElementList) {
814                                String type = element.getAttribute("type");
815                                String id = element.getAttribute("id");
816                                List<DBReferenceInfo> idlist = databaseReferencesHashMap.get(type);
817                                if (idlist == null) {
818                                        idlist = new ArrayList<DBReferenceInfo>();
819                                        databaseReferencesHashMap.put(type, idlist);
820                                }
821                                DBReferenceInfo dbreferenceInfo = new DBReferenceInfo(type, id);
822                                ArrayList<Element> propertyElementList = XMLHelper.selectElements(element, "property");
823                                for (Element propertyElement : propertyElementList) {
824                                        String propertyType = propertyElement.getAttribute("type");
825                                        String propertyValue = propertyElement.getAttribute("value");
826                                        dbreferenceInfo.addProperty(propertyType, propertyValue);
827                                }
828
829                                idlist.add(dbreferenceInfo);
830                        }
831                } catch (XPathExpressionException e) {
832                        logger.error("Problems while parsing db references in UniProt XML: {}. No db references will be available.",e.getMessage());
833                        return new LinkedHashMap<>();
834                }
835
836                return databaseReferencesHashMap;
837        }
838}