001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 *
 * @author Scooter Willis
023 *
024 */
025package org.biojava.nbio.core.sequence.loader;
026
027import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
028import org.biojava.nbio.core.sequence.AccessionID;
029import org.biojava.nbio.core.sequence.DataSource;
030import org.biojava.nbio.core.sequence.ProteinSequence;
031import org.biojava.nbio.core.sequence.Strand;
032import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
033import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
034import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
035import org.biojava.nbio.core.sequence.features.DatabaseReferenceInterface;
036import org.biojava.nbio.core.sequence.features.FeaturesKeyWordInterface;
037import org.biojava.nbio.core.sequence.storage.SequenceAsStringHelper;
038import org.biojava.nbio.core.sequence.template.*;
039import org.biojava.nbio.core.util.Equals;
040import org.biojava.nbio.core.util.XMLHelper;
041import org.slf4j.Logger;
042import org.slf4j.LoggerFactory;
043import org.w3c.dom.Document;
044import org.w3c.dom.Element;
045import org.xml.sax.SAXException;
046
047import javax.xml.parsers.ParserConfigurationException;
048import javax.xml.xpath.XPathExpressionException;
049import java.io.*;
050import java.net.HttpURLConnection;
051import java.net.URL;
052import java.rmi.RemoteException;
053import java.util.ArrayList;
054import java.util.Iterator;
055import java.util.LinkedHashMap;
056import java.util.List;
057import java.util.regex.Pattern;
058
/**
 *
 * Pass in a UniProt ID and this ProxySequenceReader, when passed to a ProteinSequence, will get the sequence data and other data elements
 * associated with the ProteinSequence by UniProt. This is an example of how to map external databases of proteins and features to the BioJava3
 * ProteinSequence.
 * It is important to call {@link #setUniprotDirectoryCache(String)} to allow caching of XML files so they don't need to be reloaded each time.
 * This class does not manage the cache.
 * @param <C> the compound type of the sequence
 */
068public class UniprotProxySequenceReader<C extends Compound> implements ProxySequenceReader<C>, FeaturesKeyWordInterface, DatabaseReferenceInterface {
069
070        private final static Logger logger = LoggerFactory.getLogger(UniprotProxySequenceReader.class);
071
072        /*
073         * Taken from http://www.uniprot.org/help/accession_numbers
074         */
075        private static final String SPID_PATTERN = "[OPQ][0-9][A-Z0-9]{3}[0-9]";
076        private static final String TREMBLID_PATTERN = "[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}";
077        public static final Pattern UP_AC_PATTERN = Pattern.compile("(" + SPID_PATTERN + "|" + TREMBLID_PATTERN + ")");
078
079        public static final String DEFAULT_UNIPROT_BASE_URL = "https://www.uniprot.org";
080
081        private static String uniprotbaseURL = DEFAULT_UNIPROT_BASE_URL;
082        private static String uniprotDirectoryCache = null;
083        private String sequence;
084        private CompoundSet<C> compoundSet;
085        private List<C> parsedCompounds = new ArrayList<C>();
086        Document uniprotDoc;
087
088        /**
089         * The UniProt id is used to retrieve the UniProt XML which is then parsed as a DOM object
090         * so we know everything about the protein. If an error occurs throw an exception. We could
091         * have a bad uniprot id or network error
092         * @param accession
093         * @param compoundSet
094         * @throws CompoundNotFoundException
095         * @throws IOException if problems while reading the UniProt XML
096         */
097        public UniprotProxySequenceReader(String accession, CompoundSet<C> compoundSet) throws CompoundNotFoundException, IOException {
098                if (!UP_AC_PATTERN.matcher(accession.toUpperCase()).matches()) {
099                        throw new IllegalArgumentException("Accession provided " + accession + " doesn't comply with the uniprot acession pattern.");
100                }
101                setCompoundSet(compoundSet);
102                uniprotDoc = this.getUniprotXML(accession);
103                String seq = this.getSequence(uniprotDoc);
104                setContents(seq);
105        }
106
107        /**
108         * The xml is passed in as a DOM object so we know everything about the protein.
109         *  If an error occurs throw an exception. We could have a bad uniprot id
110         * @param document
111         * @param compoundSet
112         * @throws CompoundNotFoundException
113         */
114        public UniprotProxySequenceReader(Document document, CompoundSet<C> compoundSet) throws CompoundNotFoundException {
115                setCompoundSet(compoundSet);
116                uniprotDoc = document;
117                String seq = this.getSequence(uniprotDoc);
118                setContents(seq);
119        }
120        /**
121         * The passed in xml is parsed as a DOM object so we know everything about the protein.
122         *  If an error occurs throw an exception. We could have a bad uniprot id
123         * @param xml
124         * @param compoundSet
125         * @return UniprotProxySequenceReader
126         * @throws Exception
127         */
128        public static <C extends Compound> UniprotProxySequenceReader<C> parseUniprotXMLString(String xml, CompoundSet<C> compoundSet) {
129                try {
130                        Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(xml.getBytes()));
131                        return new UniprotProxySequenceReader<C>(document, compoundSet);
132                } catch (Exception e) {
133                        logger.error("Exception on xml parse of: {}", xml);
134                }
135                return null;
136        }
137
138        @Override
139        public void setCompoundSet(CompoundSet<C> compoundSet) {
140                this.compoundSet = compoundSet;
141        }
142
143        /**
144         * Once the sequence is retrieved set the contents and make sure everything this is valid
145         * Some uniprot records contain white space in the sequence. We must strip it out so setContents doesn't fail.
146         * @param sequence
147         * @throws CompoundNotFoundException
148         */
149        @Override
150        public void setContents(String sequence) throws CompoundNotFoundException {
151                // Horrendously inefficient - pretty much the way the old BJ did things.
152                // TODO Should be optimised.
153                // NOTE This chokes on whitespace in the sequence, so whitespace is stripped
154                this.sequence = sequence.replaceAll("\\s", "").trim();
155                this.parsedCompounds.clear();
156                for (int i = 0; i < this.sequence.length();) {
157                        String compoundStr = null;
158                        C compound = null;
159                        for (int compoundStrLength = 1; compound == null && compoundStrLength <= compoundSet.getMaxSingleCompoundStringLength(); compoundStrLength++) {
160                                compoundStr = this.sequence.substring(i, i + compoundStrLength);
161                                compound = compoundSet.getCompoundForString(compoundStr);
162                        }
163                        if (compound == null) {
164                                throw new CompoundNotFoundException("Compound "+compoundStr+" not found");
165                        } else {
166                                i += compoundStr.length();
167                        }
168                        this.parsedCompounds.add(compound);
169                }
170        }
171
172        /**
173         * The sequence length
174         * @return
175         */
176        @Override
177        public int getLength() {
178                return this.parsedCompounds.size();
179        }
180
181        /**
182         *
183         * @param position
184         * @return
185         */
186        @Override
187        public C getCompoundAt(int position) {
188                return this.parsedCompounds.get(position - 1);
189        }
190
191        /**
192         *
193         * @param compound
194         * @return
195         */
196        @Override
197        public int getIndexOf(C compound) {
198                return this.parsedCompounds.indexOf(compound) + 1;
199        }
200
201        /**
202         *
203         * @param compound
204         * @return
205         */
206        @Override
207        public int getLastIndexOf(C compound) {
208                return this.parsedCompounds.lastIndexOf(compound) + 1;
209        }
210
211        /**
212         *
213         * @return
214         */
215        @Override
216        public String toString() {
217                return getSequenceAsString();
218        }
219
220        /**
221         *
222         * @return
223         */
224        @Override
225        public String getSequenceAsString() {
226                return sequence;
227        }
228
229        /**
230         *
231         * @return
232         */
233        @Override
234        public List<C> getAsList() {
235                return this.parsedCompounds;
236        }
237
238        @Override
239        public boolean equals(Object o){
240
241                if(! Equals.classEqual(this, o)) {
242                        return false;
243                }
244
245                Sequence<C> other = (Sequence<C>)o;
246                if ( other.getCompoundSet() != getCompoundSet())
247                        return false;
248
249                List<C> rawCompounds = getAsList();
250                List<C> otherCompounds = other.getAsList();
251
252                if ( rawCompounds.size() != otherCompounds.size())
253                        return false;
254
255                for (int i = 0 ; i < rawCompounds.size() ; i++){
256                        Compound myCompound = rawCompounds.get(i);
257                        Compound otherCompound = otherCompounds.get(i);
258                        if ( ! myCompound.equalsIgnoreCase(otherCompound))
259                                return false;
260                }
261                return true;
262        }
263
264        @Override
265        public int hashCode(){
266                String s = getSequenceAsString();
267                return s.hashCode();
268        }
269
270        /**
271         *
272         * @return
273         */
274        @Override
275        public SequenceView<C> getInverse() {
276                return SequenceMixin.inverse(this);
277        }
278
279        /**
280         *
281         * @param bioBegin
282         * @param bioEnd
283         * @param strand
284         * @return
285         */
286        public String getSequenceAsString(Integer bioBegin, Integer bioEnd, Strand strand) {
287                SequenceAsStringHelper<C> sequenceAsStringHelper = new SequenceAsStringHelper<C>();
288                return sequenceAsStringHelper.getSequenceAsString(this.parsedCompounds, compoundSet, bioBegin, bioEnd, strand);
289        }
290
291        /**
292         *
293         * @param bioBegin
294         * @param bioEnd
295         * @return
296         */
297        @Override
298        public SequenceView<C> getSubSequence(final Integer bioBegin, final Integer bioEnd) {
299                return new SequenceProxyView<C>(UniprotProxySequenceReader.this, bioBegin, bioEnd);
300        }
301
302        /**
303         *
304         * @return
305         */
306        @Override
307        public Iterator<C> iterator() {
308                return this.parsedCompounds.iterator();
309        }
310
311        /**
312         *
313         * @return
314         */
315        @Override
316        public CompoundSet<C> getCompoundSet() {
317                return compoundSet;
318        }
319
320        /**
321         *
322         * @return
323         */
324        @Override
325        public AccessionID getAccession() {
326                AccessionID accessionID = new AccessionID();
327                if (uniprotDoc == null) {
328                        return accessionID;
329                }
330                try {
331                        Element uniprotElement = uniprotDoc.getDocumentElement();
332                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
333                        Element nameElement = XMLHelper.selectSingleElement(entryElement, "name");
334                        accessionID = new AccessionID(nameElement.getTextContent(), DataSource.UNIPROT);
335                } catch (XPathExpressionException e) {
336                        logger.error("Exception: ", e);
337                }
338                return accessionID;
339        }
340
341        /**
342         * Pull uniprot accessions associated with this sequence
343         * @return
344         * @throws XPathExpressionException
345         */
346        public ArrayList<AccessionID> getAccessions() throws XPathExpressionException {
347                ArrayList<AccessionID> accessionList = new ArrayList<AccessionID>();
348                if (uniprotDoc == null) {
349                        return accessionList;
350                }
351                Element uniprotElement = uniprotDoc.getDocumentElement();
352                Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
353                ArrayList<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "accession");
354                for (Element element : keyWordElementList) {
355                        AccessionID accessionID = new AccessionID(element.getTextContent(), DataSource.UNIPROT);
356                        accessionList.add(accessionID);
357                }
358
359                return accessionList;
360        }
361
362        /**
363         * Pull uniprot protein aliases associated with this sequence
364         * Provided for backwards compatibility now that we support both
365         * gene and protein aliases via separate methods.
366         * @return
367         * @throws XPathExpressionException
368         */
369        public ArrayList<String> getAliases() throws XPathExpressionException {
370
371                return getProteinAliases();
372        }
373        /**
374         * Pull uniprot protein aliases associated with this sequence
375         * @return
376         * @throws XPathExpressionException
377         */
378        public ArrayList<String> getProteinAliases() throws XPathExpressionException {
379                ArrayList<String> aliasList = new ArrayList<String>();
380                if (uniprotDoc == null) {
381                        return aliasList;
382                }
383                Element uniprotElement = uniprotDoc.getDocumentElement();
384                Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
385                Element proteinElement = XMLHelper.selectSingleElement(entryElement, "protein");
386                
387                ArrayList<Element> keyWordElementList;
388                getProteinAliasesFromNameGroup(aliasList, proteinElement);
389                
390                keyWordElementList = XMLHelper.selectElements(proteinElement, "component");
391                for (Element element : keyWordElementList) {
392                        getProteinAliasesFromNameGroup(aliasList, element);
393                }
394
395                keyWordElementList = XMLHelper.selectElements(proteinElement, "domain");
396                for (Element element : keyWordElementList) {
397                        getProteinAliasesFromNameGroup(aliasList, element);
398                }
399
400                keyWordElementList = XMLHelper.selectElements(proteinElement, "submittedName");
401                for (Element element : keyWordElementList) {
402                        getProteinAliasesFromNameGroup(aliasList, element);
403                }
404
405                keyWordElementList = XMLHelper.selectElements(proteinElement, "cdAntigenName");
406                for (Element element : keyWordElementList) {
407                        String cdAntigenName = element.getTextContent();
408                        if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) {
409                                aliasList.add(cdAntigenName);
410                        }
411                }
412                        
413                keyWordElementList = XMLHelper.selectElements(proteinElement, "innName");
414                for (Element element : keyWordElementList) {
415                        String cdAntigenName = element.getTextContent();
416                        if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) {
417                                aliasList.add(cdAntigenName);
418                        }
419                }
420
421                keyWordElementList = XMLHelper.selectElements(proteinElement, "biotechName");
422                for (Element element : keyWordElementList) {
423                        String cdAntigenName = element.getTextContent();
424                        if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) {
425                                aliasList.add(cdAntigenName);
426                        }
427                }
428
429                keyWordElementList = XMLHelper.selectElements(proteinElement, "allergenName");
430                for (Element element : keyWordElementList) {
431                        String cdAntigenName = element.getTextContent();
432                        if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) {
433                                aliasList.add(cdAntigenName);
434                        }
435                }
436
437                return aliasList;
438        }
439
440        /**
441         * @param aliasList
442         * @param proteinElement
443         * @throws XPathExpressionException
444         */
445        private void getProteinAliasesFromNameGroup(ArrayList<String> aliasList, Element proteinElement)
446                        throws XPathExpressionException {
447                ArrayList<Element> keyWordElementList = XMLHelper.selectElements(proteinElement, "alternativeName");
448                for (Element element : keyWordElementList) {
449                        getProteinAliasesFromElement(aliasList, element);
450                }
451                
452                keyWordElementList = XMLHelper.selectElements(proteinElement, "recommendedName");
453                for (Element element : keyWordElementList) {
454                        getProteinAliasesFromElement(aliasList, element);
455                }
456        }
457
458        /**
459         * @param aliasList
460         * @param element
461         * @throws XPathExpressionException
462         */
463        private void getProteinAliasesFromElement(ArrayList<String> aliasList, Element element)
464                        throws XPathExpressionException {
465                Element fullNameElement = XMLHelper.selectSingleElement(element, "fullName");
466                aliasList.add(fullNameElement.getTextContent());
467                Element shortNameElement = XMLHelper.selectSingleElement(element, "shortName");
468                if(null != shortNameElement) {
469                        String shortName = shortNameElement.getTextContent();
470                        if(null != shortName && !shortName.trim().isEmpty()) {
471                                aliasList.add(shortName);
472                        }
473                }
474        }
475
476        /**
477         * Pull uniprot gene aliases associated with this sequence
478         * @return
479         * @throws XPathExpressionException
480         */
481        public ArrayList<String> getGeneAliases() throws XPathExpressionException {
482                ArrayList<String> aliasList = new ArrayList<String>();
483                if (uniprotDoc == null) {
484                        return aliasList;
485                }
486                Element uniprotElement = uniprotDoc.getDocumentElement();
487                Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
488                ArrayList<Element> proteinElements = XMLHelper.selectElements(entryElement, "gene");
489                for(Element proteinElement : proteinElements) {
490                        ArrayList<Element> keyWordElementList = XMLHelper.selectElements(proteinElement, "name");
491                        for (Element element : keyWordElementList) {
492                                aliasList.add(element.getTextContent());
493                        }
494                }
495                return aliasList;
496        }
497
498        /**
499         *
500         * @param compounds
501         * @return
502         */
503        @Override
504        public int countCompounds(C... compounds) {
505                throw new UnsupportedOperationException("Not supported yet.");
506        }
507
508        /**
509         *
510         * @param accession
511         * @return
512         * @throws IOException
513         */
514        private Document getUniprotXML(String accession) throws IOException, CompoundNotFoundException {
515                StringBuilder sb = new StringBuilder();
516                // try in cache
517                if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0) {
518                        sb = fetchFromCache(accession);
519                }
520
521                // http://www.uniprot.org/uniprot/?query=SORBIDRAFT_03g027040&format=xml
522                if (sb.length() == 0) {
523                        String uniprotURL = getUniprotbaseURL() + "/uniprot/" + accession.toUpperCase() + ".xml";
524                        logger.info("Loading: {}", uniprotURL);
525                        sb = fetchUniprotXML(uniprotURL);
526
527                        int index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath
528                        if (index != -1) {
529                                int lastIndex = sb.indexOf(">", index);
530                                sb.replace(index, lastIndex, "");
531                        }
532                        if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0)
533                                writeCache(sb,accession);
534                }
535
536                logger.info("Load complete");
537                try {
538                        //       logger.debug(sb.toString());
539                        Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(sb.toString().getBytes()));
540                        return document;
541                } catch (SAXException e) {
542                        logger.error("Exception on xml parse of: {}", sb.toString());
543                } catch (ParserConfigurationException e) {
544                        logger.error("Exception on xml parse of: {}", sb.toString());
545                }
546                return null;
547        }
548
549        private void writeCache(StringBuilder sb, String accession) throws IOException {
550                File f = new File(uniprotDirectoryCache + File.separatorChar + accession + ".xml");
551                FileWriter fw = new FileWriter(f);
552                fw.write(sb.toString());
553                fw.close();
554        }
555
556        /**
557         * Open a URL connection.
558         *
559         * Follows redirects.
560         * @param url
561         * @throws IOException
562         */
563        private static HttpURLConnection openURLConnection(URL url) throws IOException {
564                // This method should be moved to a utility class in BioJava 5.0
565
566                final int timeout = 5000;
567                final String useragent = "BioJava";
568
569                HttpURLConnection conn = (HttpURLConnection) url.openConnection();
570                conn.setRequestProperty("User-Agent", useragent);
571                conn.setInstanceFollowRedirects(true);
572                conn.setConnectTimeout(timeout);
573                conn.setReadTimeout(timeout);
574
575                int status = conn.getResponseCode();
576                while (status == HttpURLConnection.HTTP_MOVED_TEMP
577                                || status == HttpURLConnection.HTTP_MOVED_PERM
578                                || status == HttpURLConnection.HTTP_SEE_OTHER) {
579                        // Redirect!
580                        String newUrl = conn.getHeaderField("Location");
581
582                        if(newUrl.equals(url.toString())) {
583                                throw new IOException("Cyclic redirect detected at "+newUrl);
584                        }
585
586                        // Preserve cookies
587                        String cookies = conn.getHeaderField("Set-Cookie");
588
589                        // open the new connection again
590                        url = new URL(newUrl);
591                        conn.disconnect();
592                        conn = (HttpURLConnection) url.openConnection();
593                        if(cookies != null) {
594                                conn.setRequestProperty("Cookie", cookies);
595                        }
596                        conn.addRequestProperty("User-Agent", useragent);
597                        conn.setInstanceFollowRedirects(true);
598                        conn.setConnectTimeout(timeout);
599                        conn.setReadTimeout(timeout);
600                        conn.connect();
601
602                        status = conn.getResponseCode();
603
604                        logger.info("Redirecting from {} to {}", url, newUrl);
605                }
606                conn.connect();
607
608                return conn;
609        }
610
611        private StringBuilder fetchUniprotXML(String uniprotURL)
612                        throws IOException, CompoundNotFoundException {
613
614                StringBuilder sb = new StringBuilder();
615                URL uniprot = new URL(uniprotURL);
616                int attempt = 5;
617                List<String> errorCodes = new ArrayList<String>();
618                while(attempt > 0) {
619                        HttpURLConnection uniprotConnection = openURLConnection(uniprot);
620                        int statusCode = uniprotConnection.getResponseCode();
621                        if (statusCode == HttpURLConnection.HTTP_OK) {
622                                BufferedReader in = new BufferedReader(
623                                                new InputStreamReader(
624                                                uniprotConnection.getInputStream()));
625                                String inputLine;
626
627                                while ((inputLine = in.readLine()) != null) {
628                                        sb.append(inputLine);
629                                }
630                                in.close();
631                                return sb;
632                        }
633                        attempt--;
634                        errorCodes.add(String.valueOf(statusCode));
635                }
636                throw new RemoteException("Couldn't fetch accession from the url " + uniprotURL + " error codes on 5 attempts are " + errorCodes.toString());
637        }
638
639        /**
640         * @param key
641         * @return A string containing the contents of entry specified by key and if not found returns an empty string
642         * @throws FileNotFoundException
643         * @throws IOException
644         */
645        private StringBuilder fetchFromCache(String key)
646                        throws FileNotFoundException, IOException {
647                int index;
648                File f = new File(uniprotDirectoryCache + File.separatorChar + key + ".xml");
649                StringBuilder sb = new StringBuilder();
650                if (f.exists()) {
651                        FileReader fr = new FileReader(f);
652                        int size = (int) f.length();
653                        char[] data = new char[size];
654                        fr.read(data);
655                        fr.close();
656                        sb.append(data);
657                        index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath
658                        if (index != -1) {
659                                int lastIndex = sb.indexOf(">", index);
660                                sb.replace(index, lastIndex, "");
661                        }
662                }
663                return sb;
664        }
665
666        /**
667         *
668         * @param uniprotDoc
669         * @return
670         */
671        private String getSequence(Document uniprotDoc)  {
672
673                try {
674                        Element uniprotElement = uniprotDoc.getDocumentElement();
675                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
676                        Element sequenceElement = XMLHelper.selectSingleElement(entryElement, "sequence");
677
678                        String seqdata = sequenceElement.getTextContent();
679
680                        return seqdata;
681                } catch (XPathExpressionException e) {
682                        logger.error("Problems while parsing sequence in UniProt XML: {}. Sequence will be blank.", e.getMessage());
683                        return "";
684                }
685        }
686
687        /**
688         * The current UniProt URL to deal with caching issues. www.uniprot.org is load balanced
689         * but you can access pir.uniprot.org directly.
690         * @return the uniprotbaseURL
691         */
692        public static String getUniprotbaseURL() {
693                return uniprotbaseURL;
694        }
695
696        /**
697         * @param aUniprotbaseURL the uniprotbaseURL to set
698         */
699        public static void setUniprotbaseURL(String aUniprotbaseURL) {
700                uniprotbaseURL = aUniprotbaseURL;
701        }
702
703        /**
704         * Local directory cache of XML that can be downloaded
705         * @return the uniprotDirectoryCache
706         */
707        public static String getUniprotDirectoryCache() {
708                return uniprotDirectoryCache;
709        }
710
711        /**
712         * @param aUniprotDirectoryCache the uniprotDirectoryCache to set
713         */
714        public static void setUniprotDirectoryCache(String aUniprotDirectoryCache) {
715                File f = new File(aUniprotDirectoryCache);
716                if (!f.exists()) {
717                        f.mkdirs();
718                }
719                uniprotDirectoryCache = aUniprotDirectoryCache;
720        }
721
722        public static void main(String[] args) {
723
724                try {
725                        UniprotProxySequenceReader<AminoAcidCompound> uniprotSequence = new UniprotProxySequenceReader<AminoAcidCompound>("YA745_GIBZE", AminoAcidCompoundSet.getAminoAcidCompoundSet());
726                        ProteinSequence proteinSequence = new ProteinSequence(uniprotSequence);
727                        logger.info("Accession: {}", proteinSequence.getAccession().getID());
728                        logger.info("Sequence: {}", proteinSequence.getSequenceAsString());
729                } catch (Exception e) {
730                        logger.error("Exception: ", e);
731                }
732
733        }
734
735        /**
736         * Get the gene name associated with this sequence.
737         * @return
738         */
739        public String getGeneName() {
740                if (uniprotDoc == null) {
741                        return "";
742                }
743                try {
744                        Element uniprotElement = uniprotDoc.getDocumentElement();
745                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
746                        Element geneElement = XMLHelper.selectSingleElement(entryElement, "gene");
747                        if (geneElement == null) {
748                                return "";
749                        }
750                        Element nameElement = XMLHelper.selectSingleElement(geneElement, "name");
751                        if (nameElement == null) {
752                                return "";
753                        }
754                        return nameElement.getTextContent();
755                } catch (XPathExpressionException e) {
756                        logger.error("Problems while parsing gene name in UniProt XML: {}. Gene name will be blank.",e.getMessage());
757                        return "";
758                }
759        }
760
761        /**
762         * Get the organism name assigned to this sequence
763         * @return
764         */
765        public String getOrganismName() {
766                if (uniprotDoc == null) {
767                        return "";
768                }
769                try {
770                        Element uniprotElement = uniprotDoc.getDocumentElement();
771                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
772                        Element organismElement = XMLHelper.selectSingleElement(entryElement, "organism");
773                        if (organismElement == null) {
774                                return "";
775                        }
776                        Element nameElement = XMLHelper.selectSingleElement(organismElement, "name");
777                        if (nameElement == null) {
778                                return "";
779                        }
780                        return nameElement.getTextContent();
781                } catch (XPathExpressionException e) {
782                        logger.error("Problems while parsing organism name in UniProt XML: {}. Organism name will be blank.",e.getMessage());
783                        return "";
784                }
785
786        }
787
788        /**
789         * Pull UniProt key words which is a mixed bag of words associated with this sequence
790         * @return
791         */
792        @Override
793        public ArrayList<String> getKeyWords() {
794                ArrayList<String> keyWordsList = new ArrayList<String>();
795                if (uniprotDoc == null) {
796                        return keyWordsList;
797                }
798                try {
799                        Element uniprotElement = uniprotDoc.getDocumentElement();
800
801                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
802                        ArrayList<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "keyword");
803                        for (Element element : keyWordElementList) {
804                                keyWordsList.add(element.getTextContent());
805                        }
806                } catch (XPathExpressionException e) {
807                        logger.error("Problems while parsing keywords in UniProt XML: {}. No keywords will be available.",e.getMessage());
808                        return new ArrayList<String>();
809                }
810
811                return keyWordsList;
812        }
813
814        /**
815         * The Uniprot mappings to other database identifiers for this sequence
816         * @return
817         */
818        @Override
819        public LinkedHashMap<String, ArrayList<DBReferenceInfo>> getDatabaseReferences()  {
820                LinkedHashMap<String, ArrayList<DBReferenceInfo>> databaseReferencesHashMap = new LinkedHashMap<String, ArrayList<DBReferenceInfo>>();
821                if (uniprotDoc == null) {
822                        return databaseReferencesHashMap;
823                }
824
825                try {
826                        Element uniprotElement = uniprotDoc.getDocumentElement();
827                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
828                        ArrayList<Element> dbreferenceElementList = XMLHelper.selectElements(entryElement, "dbReference");
829                        for (Element element : dbreferenceElementList) {
830                                String type = element.getAttribute("type");
831                                String id = element.getAttribute("id");
832                                ArrayList<DBReferenceInfo> idlist = databaseReferencesHashMap.get(type);
833                                if (idlist == null) {
834                                        idlist = new ArrayList<DBReferenceInfo>();
835                                        databaseReferencesHashMap.put(type, idlist);
836                                }
837                                DBReferenceInfo dbreferenceInfo = new DBReferenceInfo(type, id);
838                                ArrayList<Element> propertyElementList = XMLHelper.selectElements(element, "property");
839                                for (Element propertyElement : propertyElementList) {
840                                        String propertyType = propertyElement.getAttribute("type");
841                                        String propertyValue = propertyElement.getAttribute("value");
842                                        dbreferenceInfo.addProperty(propertyType, propertyValue);
843                                }
844
845                                idlist.add(dbreferenceInfo);
846                        }
847                } catch (XPathExpressionException e) {
848                        logger.error("Problems while parsing db references in UniProt XML: {}. No db references will be available.",e.getMessage());
849                        return new LinkedHashMap<String, ArrayList<DBReferenceInfo>>();
850                }
851
852                return databaseReferencesHashMap;
853        }
854}