001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 *
 * @author Scooter Willis
023 *
024 */
025package org.biojava.nbio.core.sequence.loader;
026
027import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
028import org.biojava.nbio.core.sequence.AccessionID;
029import org.biojava.nbio.core.sequence.DataSource;
030import org.biojava.nbio.core.sequence.ProteinSequence;
031import org.biojava.nbio.core.sequence.Strand;
032import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
033import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
034import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
035import org.biojava.nbio.core.sequence.features.DatabaseReferenceInterface;
036import org.biojava.nbio.core.sequence.features.FeaturesKeyWordInterface;
037import org.biojava.nbio.core.sequence.storage.SequenceAsStringHelper;
038import org.biojava.nbio.core.sequence.template.*;
039import org.biojava.nbio.core.util.Equals;
040import org.biojava.nbio.core.util.XMLHelper;
041import org.slf4j.Logger;
042import org.slf4j.LoggerFactory;
043import org.w3c.dom.Document;
044import org.w3c.dom.Element;
045import org.xml.sax.SAXException;
046
047import javax.xml.parsers.ParserConfigurationException;
048import javax.xml.xpath.XPathExpressionException;
049import java.io.*;
050import java.net.HttpURLConnection;
051import java.net.URL;
052import java.rmi.RemoteException;
053import java.util.*;
054import java.util.regex.Pattern;
055
056/**
057 *
 * Pass in a UniProt ID and this ProxySequenceReader, when passed to a ProteinSequence, will get the sequence data and other data elements
 * associated with the ProteinSequence by UniProt. This is an example of how to map external databases of proteins and features to the BioJava3
 * ProteinSequence.
 * It is important to call {@link #setUniprotDirectoryCache(String)} to allow caching of XML files so they don't need to be reloaded each time.
 * This class does not manage the cache.
 * @param <C> the compound type of the sequence
064 */
065public class UniprotProxySequenceReader<C extends Compound> implements ProxySequenceReader<C>, FeaturesKeyWordInterface, DatabaseReferenceInterface {
066
067        private final static Logger logger = LoggerFactory.getLogger(UniprotProxySequenceReader.class);
068
069        /*
070         * Taken from http://www.uniprot.org/help/accession_numbers
071         */
072        private static final String SPID_PATTERN = "[OPQ][0-9][A-Z0-9]{3}[0-9]";
073        private static final String TREMBLID_PATTERN = "[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}";
074        public static final Pattern UP_AC_PATTERN = Pattern.compile("(" + SPID_PATTERN + "|" + TREMBLID_PATTERN + ")");
075
076        public static final String DEFAULT_UNIPROT_BASE_URL = "https://www.uniprot.org";
077
078        private static String uniprotbaseURL = DEFAULT_UNIPROT_BASE_URL;
079        private static String uniprotDirectoryCache = null;
080        private String sequence;
081        private CompoundSet<C> compoundSet;
082        private List<C> parsedCompounds = new ArrayList<C>();
083        Document uniprotDoc;
084
085        /**
086         * The UniProt id is used to retrieve the UniProt XML which is then parsed as a DOM object
087         * so we know everything about the protein. If an error occurs throw an exception. We could
088         * have a bad uniprot id or network error
089         * @param accession
090         * @param compoundSet
091         * @throws CompoundNotFoundException
092         * @throws IOException if problems while reading the UniProt XML
093         */
094        public UniprotProxySequenceReader(String accession, CompoundSet<C> compoundSet) throws CompoundNotFoundException, IOException {
095                if (!UP_AC_PATTERN.matcher(accession.toUpperCase()).matches()) {
096                        throw new IllegalArgumentException("Accession provided " + accession + " doesn't comply with the uniprot acession pattern.");
097                }
098                setCompoundSet(compoundSet);
099                uniprotDoc = this.getUniprotXML(accession);
100                String seq = this.getSequence(uniprotDoc);
101                setContents(seq);
102        }
103
104        /**
105         * The xml is passed in as a DOM object so we know everything about the protein.
106         *  If an error occurs throw an exception. We could have a bad uniprot id
107         * @param document
108         * @param compoundSet
109         * @throws CompoundNotFoundException
110         */
111        public UniprotProxySequenceReader(Document document, CompoundSet<C> compoundSet) throws CompoundNotFoundException {
112                setCompoundSet(compoundSet);
113                uniprotDoc = document;
114                String seq = this.getSequence(uniprotDoc);
115                setContents(seq);
116        }
117        /**
118         * The passed in xml is parsed as a DOM object so we know everything about the protein.
119         *  If an error occurs throw an exception. We could have a bad uniprot id
120         * @param xml
121         * @param compoundSet
122         * @return UniprotProxySequenceReader
123         * @throws Exception
124         */
125        public static <C extends Compound> UniprotProxySequenceReader<C> parseUniprotXMLString(String xml, CompoundSet<C> compoundSet) {
126                try {
127                        Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(xml.getBytes()));
128                        return new UniprotProxySequenceReader<C>(document, compoundSet);
129                } catch (Exception e) {
130                        logger.error("Exception on xml parse of: {}", xml);
131                }
132                return null;
133        }
134
135        @Override
136        public void setCompoundSet(CompoundSet<C> compoundSet) {
137                this.compoundSet = compoundSet;
138        }
139
140        /**
141         * Once the sequence is retrieved set the contents and make sure everything this is valid
142         * Some uniprot records contain white space in the sequence. We must strip it out so setContents doesn't fail.
143         * @param sequence
144         * @throws CompoundNotFoundException
145         */
146        @Override
147        public void setContents(String sequence) throws CompoundNotFoundException {
148                // Horrendously inefficient - pretty much the way the old BJ did things.
149                // TODO Should be optimised.
150                // NOTE This chokes on whitespace in the sequence, so whitespace is stripped
151                this.sequence = sequence.replaceAll("\\s", "").trim();
152                this.parsedCompounds.clear();
153                for (int i = 0; i < this.sequence.length();) {
154                        String compoundStr = null;
155                        C compound = null;
156                        for (int compoundStrLength = 1; compound == null && compoundStrLength <= compoundSet.getMaxSingleCompoundStringLength(); compoundStrLength++) {
157                                compoundStr = this.sequence.substring(i, i + compoundStrLength);
158                                compound = compoundSet.getCompoundForString(compoundStr);
159                        }
160                        if (compound == null) {
161                                throw new CompoundNotFoundException("Compound "+compoundStr+" not found");
162                        } else {
163                                i += compoundStr.length();
164                        }
165                        this.parsedCompounds.add(compound);
166                }
167        }
168
169        /**
170         * The sequence length
171         * @return
172         */
173        @Override
174        public int getLength() {
175                return this.parsedCompounds.size();
176        }
177
178        /**
179         *
180         * @param position
181         * @return
182         */
183        @Override
184        public C getCompoundAt(int position) {
185                return this.parsedCompounds.get(position - 1);
186        }
187
188        /**
189         *
190         * @param compound
191         * @return
192         */
193        @Override
194        public int getIndexOf(C compound) {
195                return this.parsedCompounds.indexOf(compound) + 1;
196        }
197
198        /**
199         *
200         * @param compound
201         * @return
202         */
203        @Override
204        public int getLastIndexOf(C compound) {
205                return this.parsedCompounds.lastIndexOf(compound) + 1;
206        }
207
208        /**
209         *
210         * @return
211         */
212        @Override
213        public String toString() {
214                return getSequenceAsString();
215        }
216
217        /**
218         *
219         * @return
220         */
221        @Override
222        public String getSequenceAsString() {
223                return sequence;
224        }
225
226        /**
227         *
228         * @return
229         */
230        @Override
231        public List<C> getAsList() {
232                return this.parsedCompounds;
233        }
234
235        @Override
236        public boolean equals(Object o){
237
238                if(! Equals.classEqual(this, o)) {
239                        return false;
240                }
241                @SuppressWarnings("unchecked")
242                Sequence<C> other = (Sequence<C>)o;
243                if ( other.getCompoundSet() != getCompoundSet())
244                        return false;
245
246                List<C> rawCompounds = getAsList();
247                List<C> otherCompounds = other.getAsList();
248
249                if ( rawCompounds.size() != otherCompounds.size())
250                        return false;
251
252                for (int i = 0 ; i < rawCompounds.size() ; i++){
253                        Compound myCompound = rawCompounds.get(i);
254                        Compound otherCompound = otherCompounds.get(i);
255                        if ( ! myCompound.equalsIgnoreCase(otherCompound))
256                                return false;
257                }
258                return true;
259        }
260
261        @Override
262        public int hashCode(){
263                String s = getSequenceAsString();
264                return s.hashCode();
265        }
266
267        /**
268         *
269         * @return
270         */
271        @Override
272        public SequenceView<C> getInverse() {
273                return SequenceMixin.inverse(this);
274        }
275
276        /**
277         *
278         * @param bioBegin
279         * @param bioEnd
280         * @param strand
281         * @return
282         */
283        public String getSequenceAsString(Integer bioBegin, Integer bioEnd, Strand strand) {
284                SequenceAsStringHelper<C> sequenceAsStringHelper = new SequenceAsStringHelper<C>();
285                return sequenceAsStringHelper.getSequenceAsString(this.parsedCompounds, compoundSet, bioBegin, bioEnd, strand);
286        }
287
288        /**
289         *
290         * @param bioBegin
291         * @param bioEnd
292         * @return
293         */
294        @Override
295        public SequenceView<C> getSubSequence(final Integer bioBegin, final Integer bioEnd) {
296                return new SequenceProxyView<C>(UniprotProxySequenceReader.this, bioBegin, bioEnd);
297        }
298
299        /**
300         *
301         * @return
302         */
303        @Override
304        public Iterator<C> iterator() {
305                return this.parsedCompounds.iterator();
306        }
307
308        /**
309         *
310         * @return
311         */
312        @Override
313        public CompoundSet<C> getCompoundSet() {
314                return compoundSet;
315        }
316
317        /**
318         *
319         * @return
320         */
321        @Override
322        public AccessionID getAccession() {
323                AccessionID accessionID = new AccessionID();
324                if (uniprotDoc == null) {
325                        return accessionID;
326                }
327                try {
328                        Element uniprotElement = uniprotDoc.getDocumentElement();
329                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
330                        Element nameElement = XMLHelper.selectSingleElement(entryElement, "name");
331                        accessionID = new AccessionID(nameElement.getTextContent(), DataSource.UNIPROT);
332                } catch (XPathExpressionException e) {
333                        logger.error("Exception: ", e);
334                }
335                return accessionID;
336        }
337
338        /**
339         * Pull uniprot accessions associated with this sequence
340         * @return
341         * @throws XPathExpressionException
342         */
343        public ArrayList<AccessionID> getAccessions() throws XPathExpressionException {
344                ArrayList<AccessionID> accessionList = new ArrayList<AccessionID>();
345                if (uniprotDoc == null) {
346                        return accessionList;
347                }
348                Element uniprotElement = uniprotDoc.getDocumentElement();
349                Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
350                ArrayList<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "accession");
351                for (Element element : keyWordElementList) {
352                        AccessionID accessionID = new AccessionID(element.getTextContent(), DataSource.UNIPROT);
353                        accessionList.add(accessionID);
354                }
355
356                return accessionList;
357        }
358
359        /**
360         * Pull uniprot protein aliases associated with this sequence
361         * Provided for backwards compatibility now that we support both
362         * gene and protein aliases via separate methods.
363         * @return
364         * @throws XPathExpressionException
365         */
366        public ArrayList<String> getAliases() throws XPathExpressionException {
367
368                return getProteinAliases();
369        }
370        /**
371         * Pull uniprot protein aliases associated with this sequence
372         * @return
373         * @throws XPathExpressionException
374         */
375        public ArrayList<String> getProteinAliases() throws XPathExpressionException {
376                ArrayList<String> aliasList = new ArrayList<String>();
377                if (uniprotDoc == null) {
378                        return aliasList;
379                }
380                Element uniprotElement = uniprotDoc.getDocumentElement();
381                Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
382                Element proteinElement = XMLHelper.selectSingleElement(entryElement, "protein");
383                
384                ArrayList<Element> keyWordElementList;
385                getProteinAliasesFromNameGroup(aliasList, proteinElement);
386                
387                keyWordElementList = XMLHelper.selectElements(proteinElement, "component");
388                for (Element element : keyWordElementList) {
389                        getProteinAliasesFromNameGroup(aliasList, element);
390                }
391
392                keyWordElementList = XMLHelper.selectElements(proteinElement, "domain");
393                for (Element element : keyWordElementList) {
394                        getProteinAliasesFromNameGroup(aliasList, element);
395                }
396
397                keyWordElementList = XMLHelper.selectElements(proteinElement, "submittedName");
398                for (Element element : keyWordElementList) {
399                        getProteinAliasesFromNameGroup(aliasList, element);
400                }
401
402                keyWordElementList = XMLHelper.selectElements(proteinElement, "cdAntigenName");
403                for (Element element : keyWordElementList) {
404                        String cdAntigenName = element.getTextContent();
405                        if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) {
406                                aliasList.add(cdAntigenName);
407                        }
408                }
409                        
410                keyWordElementList = XMLHelper.selectElements(proteinElement, "innName");
411                for (Element element : keyWordElementList) {
412                        String cdAntigenName = element.getTextContent();
413                        if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) {
414                                aliasList.add(cdAntigenName);
415                        }
416                }
417
418                keyWordElementList = XMLHelper.selectElements(proteinElement, "biotechName");
419                for (Element element : keyWordElementList) {
420                        String cdAntigenName = element.getTextContent();
421                        if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) {
422                                aliasList.add(cdAntigenName);
423                        }
424                }
425
426                keyWordElementList = XMLHelper.selectElements(proteinElement, "allergenName");
427                for (Element element : keyWordElementList) {
428                        String cdAntigenName = element.getTextContent();
429                        if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) {
430                                aliasList.add(cdAntigenName);
431                        }
432                }
433
434                return aliasList;
435        }
436
437        /**
438         * @param aliasList
439         * @param proteinElement
440         * @throws XPathExpressionException
441         */
442        private void getProteinAliasesFromNameGroup(ArrayList<String> aliasList, Element proteinElement)
443                        throws XPathExpressionException {
444                ArrayList<Element> keyWordElementList = XMLHelper.selectElements(proteinElement, "alternativeName");
445                for (Element element : keyWordElementList) {
446                        getProteinAliasesFromElement(aliasList, element);
447                }
448                
449                keyWordElementList = XMLHelper.selectElements(proteinElement, "recommendedName");
450                for (Element element : keyWordElementList) {
451                        getProteinAliasesFromElement(aliasList, element);
452                }
453        }
454
455        /**
456         * @param aliasList
457         * @param element
458         * @throws XPathExpressionException
459         */
460        private void getProteinAliasesFromElement(ArrayList<String> aliasList, Element element)
461                        throws XPathExpressionException {
462                Element fullNameElement = XMLHelper.selectSingleElement(element, "fullName");
463                aliasList.add(fullNameElement.getTextContent());
464                Element shortNameElement = XMLHelper.selectSingleElement(element, "shortName");
465                if(null != shortNameElement) {
466                        String shortName = shortNameElement.getTextContent();
467                        if(null != shortName && !shortName.trim().isEmpty()) {
468                                aliasList.add(shortName);
469                        }
470                }
471        }
472
473        /**
474         * Pull uniprot gene aliases associated with this sequence
475         * @return
476         * @throws XPathExpressionException
477         */
478        public ArrayList<String> getGeneAliases() throws XPathExpressionException {
479                ArrayList<String> aliasList = new ArrayList<String>();
480                if (uniprotDoc == null) {
481                        return aliasList;
482                }
483                Element uniprotElement = uniprotDoc.getDocumentElement();
484                Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
485                ArrayList<Element> proteinElements = XMLHelper.selectElements(entryElement, "gene");
486                for(Element proteinElement : proteinElements) {
487                        ArrayList<Element> keyWordElementList = XMLHelper.selectElements(proteinElement, "name");
488                        for (Element element : keyWordElementList) {
489                                aliasList.add(element.getTextContent());
490                        }
491                }
492                return aliasList;
493        }
494
495        /**
496         *
497         * @param compounds
498         * @return
499         */
500        @Override
501        public int countCompounds(C... compounds) {
502                throw new UnsupportedOperationException("Not supported yet.");
503        }
504
505        /**
506         *
507         * @param accession
508         * @return
509         * @throws IOException
510         */
511        private Document getUniprotXML(String accession) throws IOException, CompoundNotFoundException {
512                StringBuilder sb = new StringBuilder();
513                // try in cache
514                if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0) {
515                        sb = fetchFromCache(accession);
516                }
517
518                // http://www.uniprot.org/uniprot/?query=SORBIDRAFT_03g027040&format=xml
519                if (sb.length() == 0) {
520                        String uniprotURL = getUniprotbaseURL() + "/uniprot/" + accession.toUpperCase() + ".xml";
521                        logger.info("Loading: {}", uniprotURL);
522                        sb = fetchUniprotXML(uniprotURL);
523
524                        int index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath
525                        if (index != -1) {
526                                int lastIndex = sb.indexOf(">", index);
527                                sb.replace(index, lastIndex, "");
528                        }
529                        if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0)
530                                writeCache(sb,accession);
531                }
532
533                logger.info("Load complete");
534                try {
535                        //       logger.debug(sb.toString());
536                        Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(sb.toString().getBytes()));
537                        return document;
538                } catch (SAXException e) {
539                        logger.error("Exception on xml parse of: {}", sb.toString());
540                } catch (ParserConfigurationException e) {
541                        logger.error("Exception on xml parse of: {}", sb.toString());
542                }
543                return null;
544        }
545
546        private void writeCache(StringBuilder sb, String accession) throws IOException {
547                File f = new File(uniprotDirectoryCache + File.separatorChar + accession + ".xml");
548                FileWriter fw = new FileWriter(f);
549                fw.write(sb.toString());
550                fw.close();
551        }
552
553        /**
554         * Open a URL connection.
555         *
556         * Follows redirects.
557         * @param url
558         * @throws IOException
559         */
560        private static HttpURLConnection openURLConnection(URL url) throws IOException {
561                // This method should be moved to a utility class in BioJava 5.0
562
563                final int timeout = 5000;
564                final String useragent = "BioJava";
565
566                HttpURLConnection conn = (HttpURLConnection) url.openConnection();
567                conn.setRequestProperty("User-Agent", useragent);
568                conn.setInstanceFollowRedirects(true);
569                conn.setConnectTimeout(timeout);
570                conn.setReadTimeout(timeout);
571
572                int status = conn.getResponseCode();
573                while (status == HttpURLConnection.HTTP_MOVED_TEMP
574                                || status == HttpURLConnection.HTTP_MOVED_PERM
575                                || status == HttpURLConnection.HTTP_SEE_OTHER) {
576                        // Redirect!
577                        String newUrl = conn.getHeaderField("Location");
578
579                        if(newUrl.equals(url.toString())) {
580                                throw new IOException("Cyclic redirect detected at "+newUrl);
581                        }
582
583                        // Preserve cookies
584                        String cookies = conn.getHeaderField("Set-Cookie");
585
586                        // open the new connection again
587                        url = new URL(newUrl);
588                        conn.disconnect();
589                        conn = (HttpURLConnection) url.openConnection();
590                        if(cookies != null) {
591                                conn.setRequestProperty("Cookie", cookies);
592                        }
593                        conn.addRequestProperty("User-Agent", useragent);
594                        conn.setInstanceFollowRedirects(true);
595                        conn.setConnectTimeout(timeout);
596                        conn.setReadTimeout(timeout);
597                        conn.connect();
598
599                        status = conn.getResponseCode();
600
601                        logger.info("Redirecting from {} to {}", url, newUrl);
602                }
603                conn.connect();
604
605                return conn;
606        }
607
608        private StringBuilder fetchUniprotXML(String uniprotURL)
609                        throws IOException, CompoundNotFoundException {
610
611                StringBuilder sb = new StringBuilder();
612                URL uniprot = new URL(uniprotURL);
613                int attempt = 5;
614                List<String> errorCodes = new ArrayList<String>();
615                while(attempt > 0) {
616                        HttpURLConnection uniprotConnection = openURLConnection(uniprot);
617                        int statusCode = uniprotConnection.getResponseCode();
618                        if (statusCode == HttpURLConnection.HTTP_OK) {
619                                BufferedReader in = new BufferedReader(
620                                                new InputStreamReader(
621                                                uniprotConnection.getInputStream()));
622                                String inputLine;
623
624                                while ((inputLine = in.readLine()) != null) {
625                                        sb.append(inputLine);
626                                }
627                                in.close();
628                                return sb;
629                        }
630                        attempt--;
631                        errorCodes.add(String.valueOf(statusCode));
632                }
633                throw new RemoteException("Couldn't fetch accession from the url " + uniprotURL + " error codes on 5 attempts are " + errorCodes.toString());
634        }
635
636        /**
637         * @param key
638         * @return A string containing the contents of entry specified by key and if not found returns an empty string
639         * @throws FileNotFoundException
640         * @throws IOException
641         */
642        private StringBuilder fetchFromCache(String key)
643                        throws FileNotFoundException, IOException {
644                int index;
645                File f = new File(uniprotDirectoryCache + File.separatorChar + key + ".xml");
646                StringBuilder sb = new StringBuilder();
647                if (f.exists()) {
648                        FileReader fr = new FileReader(f);
649                        int size = (int) f.length();
650                        char[] data = new char[size];
651                        fr.read(data);
652                        fr.close();
653                        sb.append(data);
654                        index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath
655                        if (index != -1) {
656                                int lastIndex = sb.indexOf(">", index);
657                                sb.replace(index, lastIndex, "");
658                        }
659                }
660                return sb;
661        }
662
663        /**
664         *
665         * @param uniprotDoc
666         * @return
667         */
668        private String getSequence(Document uniprotDoc)  {
669
670                try {
671                        Element uniprotElement = uniprotDoc.getDocumentElement();
672                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
673                        Element sequenceElement = XMLHelper.selectSingleElement(entryElement, "sequence");
674
675                        String seqdata = sequenceElement.getTextContent();
676
677                        return seqdata;
678                } catch (XPathExpressionException e) {
679                        logger.error("Problems while parsing sequence in UniProt XML: {}. Sequence will be blank.", e.getMessage());
680                        return "";
681                }
682        }
683
684        /**
685         * The current UniProt URL to deal with caching issues. www.uniprot.org is load balanced
686         * but you can access pir.uniprot.org directly.
687         * @return the uniprotbaseURL
688         */
689        public static String getUniprotbaseURL() {
690                return uniprotbaseURL;
691        }
692
693        /**
694         * @param aUniprotbaseURL the uniprotbaseURL to set
695         */
696        public static void setUniprotbaseURL(String aUniprotbaseURL) {
697                uniprotbaseURL = aUniprotbaseURL;
698        }
699
700        /**
701         * Local directory cache of XML that can be downloaded
702         * @return the uniprotDirectoryCache
703         */
704        public static String getUniprotDirectoryCache() {
705                return uniprotDirectoryCache;
706        }
707
708        /**
709         * @param aUniprotDirectoryCache the uniprotDirectoryCache to set
710         */
711        public static void setUniprotDirectoryCache(String aUniprotDirectoryCache) {
712                File f = new File(aUniprotDirectoryCache);
713                if (!f.exists()) {
714                        f.mkdirs();
715                }
716                uniprotDirectoryCache = aUniprotDirectoryCache;
717        }
718
719        public static void main(String[] args) {
720
721                try {
722                        UniprotProxySequenceReader<AminoAcidCompound> uniprotSequence = new UniprotProxySequenceReader<AminoAcidCompound>("YA745_GIBZE", AminoAcidCompoundSet.getAminoAcidCompoundSet());
723                        ProteinSequence proteinSequence = new ProteinSequence(uniprotSequence);
724                        logger.info("Accession: {}", proteinSequence.getAccession().getID());
725                        logger.info("Sequence: {}", proteinSequence.getSequenceAsString());
726                } catch (Exception e) {
727                        logger.error("Exception: ", e);
728                }
729
730        }
731
732        /**
733         * Get the gene name associated with this sequence.
734         * @return
735         */
736        public String getGeneName() {
737                if (uniprotDoc == null) {
738                        return "";
739                }
740                try {
741                        Element uniprotElement = uniprotDoc.getDocumentElement();
742                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
743                        Element geneElement = XMLHelper.selectSingleElement(entryElement, "gene");
744                        if (geneElement == null) {
745                                return "";
746                        }
747                        Element nameElement = XMLHelper.selectSingleElement(geneElement, "name");
748                        if (nameElement == null) {
749                                return "";
750                        }
751                        return nameElement.getTextContent();
752                } catch (XPathExpressionException e) {
753                        logger.error("Problems while parsing gene name in UniProt XML: {}. Gene name will be blank.",e.getMessage());
754                        return "";
755                }
756        }
757
758        /**
759         * Get the organism name assigned to this sequence
760         * @return
761         */
762        public String getOrganismName() {
763                if (uniprotDoc == null) {
764                        return "";
765                }
766                try {
767                        Element uniprotElement = uniprotDoc.getDocumentElement();
768                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
769                        Element organismElement = XMLHelper.selectSingleElement(entryElement, "organism");
770                        if (organismElement == null) {
771                                return "";
772                        }
773                        Element nameElement = XMLHelper.selectSingleElement(organismElement, "name");
774                        if (nameElement == null) {
775                                return "";
776                        }
777                        return nameElement.getTextContent();
778                } catch (XPathExpressionException e) {
779                        logger.error("Problems while parsing organism name in UniProt XML: {}. Organism name will be blank.",e.getMessage());
780                        return "";
781                }
782
783        }
784
785        /**
786         * Pull UniProt key words which is a mixed bag of words associated with this sequence
787         * @return
788         */
789        @Override
790        public ArrayList<String> getKeyWords() {
791                ArrayList<String> keyWordsList = new ArrayList<String>();
792                if (uniprotDoc == null) {
793                        return keyWordsList;
794                }
795                try {
796                        Element uniprotElement = uniprotDoc.getDocumentElement();
797
798                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
799                        ArrayList<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "keyword");
800                        for (Element element : keyWordElementList) {
801                                keyWordsList.add(element.getTextContent());
802                        }
803                } catch (XPathExpressionException e) {
804                        logger.error("Problems while parsing keywords in UniProt XML: {}. No keywords will be available.",e.getMessage());
805                        return new ArrayList<String>();
806                }
807
808                return keyWordsList;
809        }
810
811        /**
812         * The Uniprot mappings to other database identifiers for this sequence
813         * @return
814         */
815        @Override
816        public Map<String, List<DBReferenceInfo>> getDatabaseReferences()  {
817                Map<String, List<DBReferenceInfo>> databaseReferencesHashMap = new LinkedHashMap<>();
818                if (uniprotDoc == null) {
819                        return databaseReferencesHashMap;
820                }
821
822                try {
823                        Element uniprotElement = uniprotDoc.getDocumentElement();
824                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
825                        ArrayList<Element> dbreferenceElementList = XMLHelper.selectElements(entryElement, "dbReference");
826                        for (Element element : dbreferenceElementList) {
827                                String type = element.getAttribute("type");
828                                String id = element.getAttribute("id");
829                                List<DBReferenceInfo> idlist = databaseReferencesHashMap.get(type);
830                                if (idlist == null) {
831                                        idlist = new ArrayList<DBReferenceInfo>();
832                                        databaseReferencesHashMap.put(type, idlist);
833                                }
834                                DBReferenceInfo dbreferenceInfo = new DBReferenceInfo(type, id);
835                                ArrayList<Element> propertyElementList = XMLHelper.selectElements(element, "property");
836                                for (Element propertyElement : propertyElementList) {
837                                        String propertyType = propertyElement.getAttribute("type");
838                                        String propertyValue = propertyElement.getAttribute("value");
839                                        dbreferenceInfo.addProperty(propertyType, propertyValue);
840                                }
841
842                                idlist.add(dbreferenceInfo);
843                        }
844                } catch (XPathExpressionException e) {
845                        logger.error("Problems while parsing db references in UniProt XML: {}. No db references will be available.",e.getMessage());
846                        return new LinkedHashMap<>();
847                }
848
849                return databaseReferencesHashMap;
850        }
851}