Source code

001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 *
022 * @author Scooter Willis
023 *
024 */
025package org.biojava.nbio.core.sequence.loader;
026
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.net.HttpURLConnection;
import java.net.URL;
import java.rmi.RemoteException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathExpressionException;

import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.AccessionID;
import org.biojava.nbio.core.sequence.DataSource;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.Strand;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
import org.biojava.nbio.core.sequence.features.DatabaseReferenceInterface;
import org.biojava.nbio.core.sequence.features.FeaturesKeyWordInterface;
import org.biojava.nbio.core.sequence.storage.SequenceAsStringHelper;
import org.biojava.nbio.core.sequence.template.Compound;
import org.biojava.nbio.core.sequence.template.CompoundSet;
import org.biojava.nbio.core.sequence.template.ProxySequenceReader;
import org.biojava.nbio.core.sequence.template.SequenceMixin;
import org.biojava.nbio.core.sequence.template.SequenceProxyView;
import org.biojava.nbio.core.sequence.template.SequenceView;
import org.biojava.nbio.core.util.XMLHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xml.sax.SAXException;
070
071/**
072 *
073 * Pass in a Uniprot ID and this ProxySequenceReader when passed to a ProteinSequence will get the sequence data and other data elements
074 * associated with the ProteinSequence by Uniprot. This is an example of how to map external databases of proteins and features to the BioJava3
075 * ProteinSequence.
076 * Important to call @see setUniprotDirectoryCache to allow caching of XML files so they don't need to be reloaded each time. Does
077 * not manage cache.
078 * @param <C>
079 */
080public class UniprotProxySequenceReader<C extends Compound> implements ProxySequenceReader<C>, FeaturesKeyWordInterface, DatabaseReferenceInterface {
081
082        private final static Logger logger = LoggerFactory.getLogger(UniprotProxySequenceReader.class);
083
084        /*
085         * Taken from http://www.uniprot.org/help/accession_numbers
086         */
087        private static final String SPID_PATTERN = "[OPQ][0-9][A-Z0-9]{3}[0-9]";
088        private static final String TREMBLID_PATTERN = "[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}";
089        public static final Pattern UP_AC_PATTERN = Pattern.compile("(" + SPID_PATTERN + "|" + TREMBLID_PATTERN + ")");
090
091        private static String uniprotbaseURL = "http://www.uniprot.org"; //"http://pir.uniprot.org";
092        private static String uniprotDirectoryCache = null;
093        private String sequence;
094        private CompoundSet<C> compoundSet;
095        private List<C> parsedCompounds = new ArrayList<C>();
096        Document uniprotDoc;
097
098        /**
099         * The UniProt id is used to retrieve the UniProt XML which is then parsed as a DOM object
100         * so we know everything about the protein. If an error occurs throw an exception. We could
101         * have a bad uniprot id or network error
102         * @param accession
103         * @param compoundSet
104         * @throws CompoundNotFoundException
105         * @throws IOException if problems while reading the UniProt XML
106         */
107        public UniprotProxySequenceReader(String accession, CompoundSet<C> compoundSet) throws CompoundNotFoundException, IOException {
108                if (!UP_AC_PATTERN.matcher(accession.toUpperCase()).matches()) {
109                        throw new IllegalArgumentException("Accession provided " + accession + " doesn't comply with the uniprot acession pattern.");
110                }
111                setCompoundSet(compoundSet);
112                uniprotDoc = this.getUniprotXML(accession);
113                String seq = this.getSequence(uniprotDoc);
114                setContents(seq);
115        }
116
117        /**
118         * The xml is passed in as a DOM object so we know everything about the protein.
119         *  If an error occurs throw an exception. We could have a bad uniprot id
120         * @param document
121         * @param compoundSet
122         * @throws CompoundNotFoundException
123         */
124        public UniprotProxySequenceReader(Document document, CompoundSet<C> compoundSet) throws CompoundNotFoundException {
125                setCompoundSet(compoundSet);
126                uniprotDoc = document;
127                String seq = this.getSequence(uniprotDoc);
128                setContents(seq);
129        }
130        /**
131         * The passed in xml is parsed as a DOM object so we know everything about the protein.
132         *  If an error occurs throw an exception. We could have a bad uniprot id
133         * @param xml
134         * @param compoundSet
135         * @return UniprotProxySequenceReader
136         * @throws Exception
137         */
138        public static <C extends Compound> UniprotProxySequenceReader<C> parseUniprotXMLString(String xml, CompoundSet<C> compoundSet) {
139                try {
140                        Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(xml.getBytes()));
141                        return new UniprotProxySequenceReader<C>(document, compoundSet);
142                } catch (Exception e) {
143                        logger.error("Exception on xml parse of: {}", xml);
144                }
145                return null;
146        }
147
148        @Override
149        public void setCompoundSet(CompoundSet<C> compoundSet) {
150                this.compoundSet = compoundSet;
151        }
152
153        /**
154         * Once the sequence is retrieved set the contents and make sure everything this is valid
155         * @param sequence
156         * @throws CompoundNotFoundException
157         */
158        @Override
159        public void setContents(String sequence) throws CompoundNotFoundException {
160                // Horrendously inefficient - pretty much the way the old BJ did things.
161                // TODO Should be optimised.
162                this.sequence = sequence;
163                this.parsedCompounds.clear();
164                for (int i = 0; i < sequence.length();) {
165                        String compoundStr = null;
166                        C compound = null;
167                        for (int compoundStrLength = 1; compound == null && compoundStrLength <= compoundSet.getMaxSingleCompoundStringLength(); compoundStrLength++) {
168                                compoundStr = sequence.substring(i, i + compoundStrLength);
169                                compound = compoundSet.getCompoundForString(compoundStr);
170                        }
171                        if (compound == null) {
172                                throw new CompoundNotFoundException("Compound "+compoundStr+" not found");
173                        } else {
174                                i += compoundStr.length();
175                        }
176                        this.parsedCompounds.add(compound);
177                }
178        }
179
180        /**
181         * The sequence length
182         * @return
183         */
184        @Override
185        public int getLength() {
186                return this.parsedCompounds.size();
187        }
188
189        /**
190         *
191         * @param position
192         * @return
193         */
194        @Override
195        public C getCompoundAt(int position) {
196                return this.parsedCompounds.get(position - 1);
197        }
198
199        /**
200         *
201         * @param compound
202         * @return
203         */
204        @Override
205        public int getIndexOf(C compound) {
206                return this.parsedCompounds.indexOf(compound) + 1;
207        }
208
209        /**
210         *
211         * @param compound
212         * @return
213         */
214        @Override
215        public int getLastIndexOf(C compound) {
216                return this.parsedCompounds.lastIndexOf(compound) + 1;
217        }
218
219        /**
220         *
221         * @return
222         */
223        @Override
224        public String toString() {
225                return getSequenceAsString();
226        }
227
228        /**
229         *
230         * @return
231         */
232        @Override
233        public String getSequenceAsString() {
234                return sequence;
235        }
236
237        /**
238         *
239         * @return
240         */
241        @Override
242        public List<C> getAsList() {
243                return this.parsedCompounds;
244        }
245
246        /**
247         *
248         * @return
249         */
250        @Override
251        public SequenceView<C> getInverse() {
252                return SequenceMixin.inverse(this);
253        }
254
255        /**
256         *
257         * @param bioBegin
258         * @param bioEnd
259         * @param strand
260         * @return
261         */
262        public String getSequenceAsString(Integer bioBegin, Integer bioEnd, Strand strand) {
263                SequenceAsStringHelper<C> sequenceAsStringHelper = new SequenceAsStringHelper<C>();
264                return sequenceAsStringHelper.getSequenceAsString(this.parsedCompounds, compoundSet, bioBegin, bioEnd, strand);
265        }
266
267        /**
268         *
269         * @param bioBegin
270         * @param bioEnd
271         * @return
272         */
273        @Override
274        public SequenceView<C> getSubSequence(final Integer bioBegin, final Integer bioEnd) {
275                return new SequenceProxyView<C>(UniprotProxySequenceReader.this, bioBegin, bioEnd);
276        }
277
278        /**
279         *
280         * @return
281         */
282        @Override
283        public Iterator<C> iterator() {
284                return this.parsedCompounds.iterator();
285        }
286
287        /**
288         *
289         * @return
290         */
291        @Override
292        public CompoundSet<C> getCompoundSet() {
293                return compoundSet;
294        }
295
296        /**
297         *
298         * @return
299         */
300        @Override
301        public AccessionID getAccession() {
302                AccessionID accessionID = new AccessionID();
303                if (uniprotDoc == null) {
304                        return accessionID;
305                }
306                try {
307                        Element uniprotElement = uniprotDoc.getDocumentElement();
308                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
309                        Element nameElement = XMLHelper.selectSingleElement(entryElement, "name");
310                        accessionID = new AccessionID(nameElement.getTextContent(), DataSource.UNIPROT);
311                } catch (XPathExpressionException e) {
312                        logger.error("Exception: ", e);
313                }
314                return accessionID;
315        }
316
317        /**
318         * Pull uniprot accessions associated with this sequence
319         * @return
320         * @throws XPathExpressionException
321         */
322        public ArrayList<AccessionID> getAccessions() throws XPathExpressionException {
323                ArrayList<AccessionID> accessionList = new ArrayList<AccessionID>();
324                if (uniprotDoc == null) {
325                        return accessionList;
326                }
327                Element uniprotElement = uniprotDoc.getDocumentElement();
328                Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
329                ArrayList<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "accession");
330                for (Element element : keyWordElementList) {
331                        AccessionID accessionID = new AccessionID(element.getTextContent(), DataSource.UNIPROT);
332                        accessionList.add(accessionID);
333                }
334
335                return accessionList;
336        }
337
338        /**
339         * Pull uniprot protein aliases associated with this sequence
340         * @return
341         * @throws XPathExpressionException
342         */
343        public ArrayList<String> getAliases() throws XPathExpressionException {
344                ArrayList<String> aliasList = new ArrayList<String>();
345                if (uniprotDoc == null) {
346                        return aliasList;
347                }
348                Element uniprotElement = uniprotDoc.getDocumentElement();
349                Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
350                Element proteinElement = XMLHelper.selectSingleElement(entryElement, "protein");
351                ArrayList<Element> keyWordElementList = XMLHelper.selectElements(proteinElement, "alternativeName");
352                for (Element element : keyWordElementList) {
353                        Element fullNameElement = XMLHelper.selectSingleElement(element, "fullName");
354                        aliasList.add(fullNameElement.getTextContent());
355                }
356
357                return aliasList;
358        }
359
360        /**
361         *
362         * @param compounds
363         * @return
364         */
365        @Override
366        public int countCompounds(C... compounds) {
367                throw new UnsupportedOperationException("Not supported yet.");
368        }
369
370        /**
371         *
372         * @param accession
373         * @return
374         * @throws IOException
375         */
376        private Document getUniprotXML(String accession) throws IOException, CompoundNotFoundException {
377                StringBuilder sb = new StringBuilder();
378                // try in cache
379                if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0) {
380                        sb = fetchFromCache(accession);
381                }
382
383                // http://www.uniprot.org/uniprot/?query=SORBIDRAFT_03g027040&format=xml
384                if (sb.length() == 0) {
385                        String uniprotURL = getUniprotbaseURL() + "/uniprot/" + accession.toUpperCase() + ".xml";
386                        logger.info("Loading: {}", uniprotURL);
387                        sb = fetchUniprotXML(uniprotURL);
388
389                        int index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath
390                        if (index != -1) {
391                                int lastIndex = sb.indexOf(">", index);
392                                sb.replace(index, lastIndex, "");
393                        }
394                        if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0)
395                                writeCache(sb,accession);
396                }
397
398                logger.info("Load complete");
399                try {
400                        //       logger.debug(sb.toString());
401                        Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(sb.toString().getBytes()));
402                        return document;
403                } catch (SAXException e) {
404                        logger.error("Exception on xml parse of: {}", sb.toString());
405                } catch (ParserConfigurationException e) {
406                        logger.error("Exception on xml parse of: {}", sb.toString());
407                }
408                return null;
409        }
410
411        private void writeCache(StringBuilder sb, String accession) throws IOException {
412                File f = new File(uniprotDirectoryCache + File.separatorChar + accession + ".xml");
413                FileWriter fw = new FileWriter(f);
414                fw.write(sb.toString());
415                fw.close();
416        }
417
418        private StringBuilder fetchUniprotXML(String uniprotURL)
419                        throws IOException, CompoundNotFoundException {
420
421                StringBuilder sb = new StringBuilder();
422                URL uniprot = new URL(uniprotURL);
423                int attempt = 5;
424                List<String> errorCodes = new ArrayList<String>();
425                while(attempt > 0) {
426                        HttpURLConnection uniprotConnection = (HttpURLConnection) uniprot.openConnection();
427                        uniprotConnection.setRequestProperty("User-Agent", "BioJava");
428                        uniprotConnection.connect();
429                        int statusCode = uniprotConnection.getResponseCode();
430                        if (statusCode == 200) {
431                                BufferedReader in = new BufferedReader(
432                                                new InputStreamReader(
433                                                uniprotConnection.getInputStream()));
434                                String inputLine;
435
436                                while ((inputLine = in.readLine()) != null) {
437                                        sb.append(inputLine);
438                                }
439                                in.close();
440                                return sb;
441                        }
442                        attempt--;
443                        errorCodes.add(String.valueOf(statusCode));
444                }
445                throw new RemoteException("Couldn't fetch accession from the url " + uniprotURL + " error codes on 5 attempts are " + errorCodes.toString());
446        }
447
448        /**
449         * @param key
450         * @return A string containing the contents of entry specified by key and if not found returns an empty string
451         * @throws FileNotFoundException
452         * @throws IOException
453         */
454        private StringBuilder fetchFromCache(String key)
455                        throws FileNotFoundException, IOException {
456                int index;
457                File f = new File(uniprotDirectoryCache + File.separatorChar + key + ".xml");
458                StringBuilder sb = new StringBuilder();
459                if (f.exists()) {
460                        FileReader fr = new FileReader(f);
461                        int size = (int) f.length();
462                        char[] data = new char[size];
463                        fr.read(data);
464                        fr.close();
465                        sb.append(data);
466                        index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath
467                        if (index != -1) {
468                                int lastIndex = sb.indexOf(">", index);
469                                sb.replace(index, lastIndex, "");
470                        }
471                }
472                return sb;
473        }
474
475        /**
476         *
477         * @param uniprotDoc
478         * @return
479         */
480        private String getSequence(Document uniprotDoc)  {
481
482                try {
483                        Element uniprotElement = uniprotDoc.getDocumentElement();
484                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
485                        Element sequenceElement = XMLHelper.selectSingleElement(entryElement, "sequence");
486
487                        String seqdata = sequenceElement.getTextContent();
488
489                        return seqdata;
490                } catch (XPathExpressionException e) {
491                        logger.error("Problems while parsing sequence in UniProt XML: {}. Sequence will be blank.", e.getMessage());
492                        return "";
493                }
494        }
495
496        /**
497         * The current UniProt URL to deal with caching issues. www.uniprot.org is load balanced
498         * but you can access pir.uniprot.org directly.
499         * @return the uniprotbaseURL
500         */
501        public static String getUniprotbaseURL() {
502                return uniprotbaseURL;
503        }
504
505        /**
506         * @param aUniprotbaseURL the uniprotbaseURL to set
507         */
508        public static void setUniprotbaseURL(String aUniprotbaseURL) {
509                uniprotbaseURL = aUniprotbaseURL;
510        }
511
512        /**
513         * Local directory cache of XML that can be downloaded
514         * @return the uniprotDirectoryCache
515         */
516        public static String getUniprotDirectoryCache() {
517                return uniprotDirectoryCache;
518        }
519
520        /**
521         * @param aUniprotDirectoryCache the uniprotDirectoryCache to set
522         */
523        public static void setUniprotDirectoryCache(String aUniprotDirectoryCache) {
524                File f = new File(aUniprotDirectoryCache);
525                if (!f.exists()) {
526                        f.mkdirs();
527                }
528                uniprotDirectoryCache = aUniprotDirectoryCache;
529        }
530
531        public static void main(String[] args) {
532
533                try {
534                        UniprotProxySequenceReader<AminoAcidCompound> uniprotSequence = new UniprotProxySequenceReader<AminoAcidCompound>("YA745_GIBZE", AminoAcidCompoundSet.getAminoAcidCompoundSet());
535                        ProteinSequence proteinSequence = new ProteinSequence(uniprotSequence);
536                        logger.info("Accession: {}", proteinSequence.getAccession().getID());
537                        logger.info("Sequence: {}", proteinSequence.getSequenceAsString());
538                } catch (Exception e) {
539                        logger.error("Exception: ", e);
540                }
541
542        }
543
544        /**
545         * Get the gene name associated with this sequence.
546         * @return
547         */
548        public String getGeneName() {
549                if (uniprotDoc == null) {
550                        return "";
551                }
552                try {
553                        Element uniprotElement = uniprotDoc.getDocumentElement();
554                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
555                        Element geneElement = XMLHelper.selectSingleElement(entryElement, "gene");
556                        if (geneElement == null) {
557                                return "";
558                        }
559                        Element nameElement = XMLHelper.selectSingleElement(geneElement, "name");
560                        if (nameElement == null) {
561                                return "";
562                        }
563                        return nameElement.getTextContent();
564                } catch (XPathExpressionException e) {
565                        logger.error("Problems while parsing gene name in UniProt XML: {}. Gene name will be blank.",e.getMessage());
566                        return "";
567                }
568        }
569
570        /**
571         * Get the organism name assigned to this sequence
572         * @return
573         */
574        public String getOrganismName() {
575                if (uniprotDoc == null) {
576                        return "";
577                }
578                try {
579                        Element uniprotElement = uniprotDoc.getDocumentElement();
580                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
581                        Element organismElement = XMLHelper.selectSingleElement(entryElement, "organism");
582                        if (organismElement == null) {
583                                return "";
584                        }
585                        Element nameElement = XMLHelper.selectSingleElement(organismElement, "name");
586                        if (nameElement == null) {
587                                return "";
588                        }
589                        return nameElement.getTextContent();
590                } catch (XPathExpressionException e) {
591                        logger.error("Problems while parsing organism name in UniProt XML: {}. Organism name will be blank.",e.getMessage());
592                        return "";
593                }
594
595        }
596
597        /**
598         * Pull UniProt key words which is a mixed bag of words associated with this sequence
599         * @return
600         */
601        @Override
602        public ArrayList<String> getKeyWords() {
603                ArrayList<String> keyWordsList = new ArrayList<String>();
604                if (uniprotDoc == null) {
605                        return keyWordsList;
606                }
607                try {
608                        Element uniprotElement = uniprotDoc.getDocumentElement();
609
610                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
611                        ArrayList<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "keyword");
612                        for (Element element : keyWordElementList) {
613                                keyWordsList.add(element.getTextContent());
614                        }
615                } catch (XPathExpressionException e) {
616                        logger.error("Problems while parsing keywords in UniProt XML: {}. No keywords will be available.",e.getMessage());
617                        return new ArrayList<String>();
618                }
619
620                return keyWordsList;
621        }
622
623        /**
624         * The Uniprot mappings to other database identifiers for this sequence
625         * @return
626         */
627        @Override
628        public LinkedHashMap<String, ArrayList<DBReferenceInfo>> getDatabaseReferences()  {
629                LinkedHashMap<String, ArrayList<DBReferenceInfo>> databaseReferencesHashMap = new LinkedHashMap<String, ArrayList<DBReferenceInfo>>();
630                if (uniprotDoc == null) {
631                        return databaseReferencesHashMap;
632                }
633
634                try {
635                        Element uniprotElement = uniprotDoc.getDocumentElement();
636                        Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
637                        ArrayList<Element> dbreferenceElementList = XMLHelper.selectElements(entryElement, "dbReference");
638                        for (Element element : dbreferenceElementList) {
639                                String type = element.getAttribute("type");
640                                String id = element.getAttribute("id");
641                                ArrayList<DBReferenceInfo> idlist = databaseReferencesHashMap.get(type);
642                                if (idlist == null) {
643                                        idlist = new ArrayList<DBReferenceInfo>();
644                                        databaseReferencesHashMap.put(type, idlist);
645                                }
646                                DBReferenceInfo dbreferenceInfo = new DBReferenceInfo(type, id);
647                                ArrayList<Element> propertyElementList = XMLHelper.selectElements(element, "property");
648                                for (Element propertyElement : propertyElementList) {
649                                        String propertyType = propertyElement.getAttribute("type");
650                                        String propertyValue = propertyElement.getAttribute("value");
651                                        dbreferenceInfo.addProperty(propertyType, propertyValue);
652                                }
653
654                                idlist.add(dbreferenceInfo);
655                        }
656                } catch (XPathExpressionException e) {
657                        logger.error("Problems while parsing db references in UniProt XML: {}. No db references will be available.",e.getMessage());
658                        return new LinkedHashMap<String, ArrayList<DBReferenceInfo>>();
659                }
660
661                return databaseReferencesHashMap;
662        }
663}