001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 01-21-2010 021 * 022 * @auther Scooter Willis 023 * 024 */ 025package org.biojava.nbio.core.sequence.loader; 026 027import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 028import org.biojava.nbio.core.sequence.AccessionID; 029import org.biojava.nbio.core.sequence.DataSource; 030import org.biojava.nbio.core.sequence.ProteinSequence; 031import org.biojava.nbio.core.sequence.Strand; 032import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 033import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 034import org.biojava.nbio.core.sequence.features.DBReferenceInfo; 035import org.biojava.nbio.core.sequence.features.DatabaseReferenceInterface; 036import org.biojava.nbio.core.sequence.features.FeaturesKeyWordInterface; 037import org.biojava.nbio.core.sequence.storage.SequenceAsStringHelper; 038import org.biojava.nbio.core.sequence.template.*; 039import org.biojava.nbio.core.util.Equals; 040import org.biojava.nbio.core.util.XMLHelper; 041import org.slf4j.Logger; 042import org.slf4j.LoggerFactory; 043import org.w3c.dom.Document; 044import org.w3c.dom.Element; 045import org.xml.sax.SAXException; 046 047import javax.xml.parsers.ParserConfigurationException; 048import javax.xml.xpath.XPathExpressionException; 049import java.io.*; 050import java.net.HttpURLConnection; 051import java.net.URL; 052import java.rmi.RemoteException; 053import java.util.ArrayList; 054import java.util.Iterator; 055import java.util.LinkedHashMap; 056import java.util.List; 057import java.util.regex.Pattern; 058 059/** 060 * 061 * Pass in a Uniprot ID and this ProxySequenceReader when passed to a ProteinSequence will get the sequence data and other data elements 062 * associated with the ProteinSequence by Uniprot. This is an example of how to map external databases of proteins and features to the BioJava3 063 * ProteinSequence. 064 * Important to call @see setUniprotDirectoryCache to allow caching of XML files so they don't need to be reloaded each time. Does 065 * not manage cache. 066 * @param <C> 067 */ 068public class UniprotProxySequenceReader<C extends Compound> implements ProxySequenceReader<C>, FeaturesKeyWordInterface, DatabaseReferenceInterface { 069 070 private final static Logger logger = LoggerFactory.getLogger(UniprotProxySequenceReader.class); 071 072 /* 073 * Taken from http://www.uniprot.org/help/accession_numbers 074 */ 075 private static final String SPID_PATTERN = "[OPQ][0-9][A-Z0-9]{3}[0-9]"; 076 private static final String TREMBLID_PATTERN = "[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"; 077 public static final Pattern UP_AC_PATTERN = Pattern.compile("(" + SPID_PATTERN + "|" + TREMBLID_PATTERN + ")"); 078 079 private static String uniprotbaseURL = "http://www.uniprot.org"; //"http://pir.uniprot.org"; 080 private static String uniprotDirectoryCache = null; 081 private String sequence; 082 private CompoundSet<C> compoundSet; 083 private List<C> parsedCompounds = new ArrayList<C>(); 084 Document uniprotDoc; 085 086 /** 087 * The UniProt id is used to retrieve the UniProt XML which is then parsed as a DOM object 088 * so we know everything about the protein. If an error occurs throw an exception. We could 089 * have a bad uniprot id or network error 090 * @param accession 091 * @param compoundSet 092 * @throws CompoundNotFoundException 093 * @throws IOException if problems while reading the UniProt XML 094 */ 095 public UniprotProxySequenceReader(String accession, CompoundSet<C> compoundSet) throws CompoundNotFoundException, IOException { 096 if (!UP_AC_PATTERN.matcher(accession.toUpperCase()).matches()) { 097 throw new IllegalArgumentException("Accession provided " + accession + " doesn't comply with the uniprot acession pattern."); 098 } 099 setCompoundSet(compoundSet); 100 uniprotDoc = this.getUniprotXML(accession); 101 String seq = this.getSequence(uniprotDoc); 102 setContents(seq); 103 } 104 105 /** 106 * The xml is passed in as a DOM object so we know everything about the protein. 107 * If an error occurs throw an exception. We could have a bad uniprot id 108 * @param document 109 * @param compoundSet 110 * @throws CompoundNotFoundException 111 */ 112 public UniprotProxySequenceReader(Document document, CompoundSet<C> compoundSet) throws CompoundNotFoundException { 113 setCompoundSet(compoundSet); 114 uniprotDoc = document; 115 String seq = this.getSequence(uniprotDoc); 116 setContents(seq); 117 } 118 /** 119 * The passed in xml is parsed as a DOM object so we know everything about the protein. 120 * If an error occurs throw an exception. We could have a bad uniprot id 121 * @param xml 122 * @param compoundSet 123 * @return UniprotProxySequenceReader 124 * @throws Exception 125 */ 126 public static <C extends Compound> UniprotProxySequenceReader<C> parseUniprotXMLString(String xml, CompoundSet<C> compoundSet) { 127 try { 128 Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(xml.getBytes())); 129 return new UniprotProxySequenceReader<C>(document, compoundSet); 130 } catch (Exception e) { 131 logger.error("Exception on xml parse of: {}", xml); 132 } 133 return null; 134 } 135 136 @Override 137 public void setCompoundSet(CompoundSet<C> compoundSet) { 138 this.compoundSet = compoundSet; 139 } 140 141 /** 142 * Once the sequence is retrieved set the contents and make sure everything this is valid 143 * @param sequence 144 * @throws CompoundNotFoundException 145 */ 146 @Override 147 public void setContents(String sequence) throws CompoundNotFoundException { 148 // Horrendously inefficient - pretty much the way the old BJ did things. 149 // TODO Should be optimised. 150 this.sequence = sequence; 151 this.parsedCompounds.clear(); 152 for (int i = 0; i < sequence.length();) { 153 String compoundStr = null; 154 C compound = null; 155 for (int compoundStrLength = 1; compound == null && compoundStrLength <= compoundSet.getMaxSingleCompoundStringLength(); compoundStrLength++) { 156 compoundStr = sequence.substring(i, i + compoundStrLength); 157 compound = compoundSet.getCompoundForString(compoundStr); 158 } 159 if (compound == null) { 160 throw new CompoundNotFoundException("Compound "+compoundStr+" not found"); 161 } else { 162 i += compoundStr.length(); 163 } 164 this.parsedCompounds.add(compound); 165 } 166 } 167 168 /** 169 * The sequence length 170 * @return 171 */ 172 @Override 173 public int getLength() { 174 return this.parsedCompounds.size(); 175 } 176 177 /** 178 * 179 * @param position 180 * @return 181 */ 182 @Override 183 public C getCompoundAt(int position) { 184 return this.parsedCompounds.get(position - 1); 185 } 186 187 /** 188 * 189 * @param compound 190 * @return 191 */ 192 @Override 193 public int getIndexOf(C compound) { 194 return this.parsedCompounds.indexOf(compound) + 1; 195 } 196 197 /** 198 * 199 * @param compound 200 * @return 201 */ 202 @Override 203 public int getLastIndexOf(C compound) { 204 return this.parsedCompounds.lastIndexOf(compound) + 1; 205 } 206 207 /** 208 * 209 * @return 210 */ 211 @Override 212 public String toString() { 213 return getSequenceAsString(); 214 } 215 216 /** 217 * 218 * @return 219 */ 220 @Override 221 public String getSequenceAsString() { 222 return sequence; 223 } 224 225 /** 226 * 227 * @return 228 */ 229 @Override 230 public List<C> getAsList() { 231 return this.parsedCompounds; 232 } 233 234 @Override 235 public boolean equals(Object o){ 236 237 if(! Equals.classEqual(this, o)) { 238 return false; 239 } 240 241 Sequence<C> other = (Sequence<C>)o; 242 if ( other.getCompoundSet() != getCompoundSet()) 243 return false; 244 245 List<C> rawCompounds = getAsList(); 246 List<C> otherCompounds = other.getAsList(); 247 248 if ( rawCompounds.size() != otherCompounds.size()) 249 return false; 250 251 for (int i = 0 ; i < rawCompounds.size() ; i++){ 252 Compound myCompound = rawCompounds.get(i); 253 Compound otherCompound = otherCompounds.get(i); 254 if ( ! myCompound.equalsIgnoreCase(otherCompound)) 255 return false; 256 } 257 return true; 258 } 259 260 @Override 261 public int hashCode(){ 262 String s = getSequenceAsString(); 263 return s.hashCode(); 264 } 265 266 /** 267 * 268 * @return 269 */ 270 @Override 271 public SequenceView<C> getInverse() { 272 return SequenceMixin.inverse(this); 273 } 274 275 /** 276 * 277 * @param bioBegin 278 * @param bioEnd 279 * @param strand 280 * @return 281 */ 282 public String getSequenceAsString(Integer bioBegin, Integer bioEnd, Strand strand) { 283 SequenceAsStringHelper<C> sequenceAsStringHelper = new SequenceAsStringHelper<C>(); 284 return sequenceAsStringHelper.getSequenceAsString(this.parsedCompounds, compoundSet, bioBegin, bioEnd, strand); 285 } 286 287 /** 288 * 289 * @param bioBegin 290 * @param bioEnd 291 * @return 292 */ 293 @Override 294 public SequenceView<C> getSubSequence(final Integer bioBegin, final Integer bioEnd) { 295 return new SequenceProxyView<C>(UniprotProxySequenceReader.this, bioBegin, bioEnd); 296 } 297 298 /** 299 * 300 * @return 301 */ 302 @Override 303 public Iterator<C> iterator() { 304 return this.parsedCompounds.iterator(); 305 } 306 307 /** 308 * 309 * @return 310 */ 311 @Override 312 public CompoundSet<C> getCompoundSet() { 313 return compoundSet; 314 } 315 316 /** 317 * 318 * @return 319 */ 320 @Override 321 public AccessionID getAccession() { 322 AccessionID accessionID = new AccessionID(); 323 if (uniprotDoc == null) { 324 return accessionID; 325 } 326 try { 327 Element uniprotElement = uniprotDoc.getDocumentElement(); 328 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 329 Element nameElement = XMLHelper.selectSingleElement(entryElement, "name"); 330 accessionID = new AccessionID(nameElement.getTextContent(), DataSource.UNIPROT); 331 } catch (XPathExpressionException e) { 332 logger.error("Exception: ", e); 333 } 334 return accessionID; 335 } 336 337 /** 338 * Pull uniprot accessions associated with this sequence 339 * @return 340 * @throws XPathExpressionException 341 */ 342 public ArrayList<AccessionID> getAccessions() throws XPathExpressionException { 343 ArrayList<AccessionID> accessionList = new ArrayList<AccessionID>(); 344 if (uniprotDoc == null) { 345 return accessionList; 346 } 347 Element uniprotElement = uniprotDoc.getDocumentElement(); 348 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 349 ArrayList<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "accession"); 350 for (Element element : keyWordElementList) { 351 AccessionID accessionID = new AccessionID(element.getTextContent(), DataSource.UNIPROT); 352 accessionList.add(accessionID); 353 } 354 355 return accessionList; 356 } 357 358 /** 359 * Pull uniprot protein aliases associated with this sequence 360 * Provided for backwards compatibility now that we support both 361 * gene and protein aliases via separate methods. 362 * @return 363 * @throws XPathExpressionException 364 */ 365 public ArrayList<String> getAliases() throws XPathExpressionException { 366 367 return getProteinAliases(); 368 } 369 /** 370 * Pull uniprot protein aliases associated with this sequence 371 * @return 372 * @throws XPathExpressionException 373 */ 374 public ArrayList<String> getProteinAliases() throws XPathExpressionException { 375 ArrayList<String> aliasList = new ArrayList<String>(); 376 if (uniprotDoc == null) { 377 return aliasList; 378 } 379 Element uniprotElement = uniprotDoc.getDocumentElement(); 380 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 381 Element proteinElement = XMLHelper.selectSingleElement(entryElement, "protein"); 382 ArrayList<Element> keyWordElementList = XMLHelper.selectElements(proteinElement, "alternativeName"); 383 for (Element element : keyWordElementList) { 384 Element fullNameElement = XMLHelper.selectSingleElement(element, "fullName"); 385 aliasList.add(fullNameElement.getTextContent()); 386 Element shortNameElement = XMLHelper.selectSingleElement(element, "shortName"); 387 if(null != shortNameElement) { 388 String shortName = shortNameElement.getTextContent(); 389 if(null != shortName && !shortName.trim().isEmpty()) { 390 aliasList.add(shortName); 391 } 392 } 393 } 394 keyWordElementList = XMLHelper.selectElements(proteinElement, "recommendedName"); 395 for (Element element : keyWordElementList) { 396 Element fullNameElement = XMLHelper.selectSingleElement(element, "fullName"); 397 aliasList.add(fullNameElement.getTextContent()); 398 Element shortNameElement = XMLHelper.selectSingleElement(element, "shortName"); 399 if(null != shortNameElement) { 400 String shortName = shortNameElement.getTextContent(); 401 if(null != shortName && !shortName.trim().isEmpty()) { 402 aliasList.add(shortName); 403 } 404 } 405 } 406 Element cdAntigen = XMLHelper.selectSingleElement(proteinElement, "cdAntigenName"); 407 if(null != cdAntigen) { 408 String cdAntigenName = cdAntigen.getTextContent(); 409 if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) { 410 aliasList.add(cdAntigenName); 411 } 412 } 413 414 return aliasList; 415 } 416 417 /** 418 * Pull uniprot gene aliases associated with this sequence 419 * @return 420 * @throws XPathExpressionException 421 */ 422 public ArrayList<String> getGeneAliases() throws XPathExpressionException { 423 ArrayList<String> aliasList = new ArrayList<String>(); 424 if (uniprotDoc == null) { 425 return aliasList; 426 } 427 Element uniprotElement = uniprotDoc.getDocumentElement(); 428 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 429 ArrayList<Element> proteinElements = XMLHelper.selectElements(entryElement, "gene"); 430 for(Element proteinElement : proteinElements) { 431 ArrayList<Element> keyWordElementList = XMLHelper.selectElements(proteinElement, "name"); 432 for (Element element : keyWordElementList) { 433 aliasList.add(element.getTextContent()); 434 } 435 } 436 return aliasList; 437 } 438 439 /** 440 * 441 * @param compounds 442 * @return 443 */ 444 @Override 445 public int countCompounds(C... compounds) { 446 throw new UnsupportedOperationException("Not supported yet."); 447 } 448 449 /** 450 * 451 * @param accession 452 * @return 453 * @throws IOException 454 */ 455 private Document getUniprotXML(String accession) throws IOException, CompoundNotFoundException { 456 StringBuilder sb = new StringBuilder(); 457 // try in cache 458 if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0) { 459 sb = fetchFromCache(accession); 460 } 461 462 // http://www.uniprot.org/uniprot/?query=SORBIDRAFT_03g027040&format=xml 463 if (sb.length() == 0) { 464 String uniprotURL = getUniprotbaseURL() + "/uniprot/" + accession.toUpperCase() + ".xml"; 465 logger.info("Loading: {}", uniprotURL); 466 sb = fetchUniprotXML(uniprotURL); 467 468 int index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath 469 if (index != -1) { 470 int lastIndex = sb.indexOf(">", index); 471 sb.replace(index, lastIndex, ""); 472 } 473 if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0) 474 writeCache(sb,accession); 475 } 476 477 logger.info("Load complete"); 478 try { 479 // logger.debug(sb.toString()); 480 Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(sb.toString().getBytes())); 481 return document; 482 } catch (SAXException e) { 483 logger.error("Exception on xml parse of: {}", sb.toString()); 484 } catch (ParserConfigurationException e) { 485 logger.error("Exception on xml parse of: {}", sb.toString()); 486 } 487 return null; 488 } 489 490 private void writeCache(StringBuilder sb, String accession) throws IOException { 491 File f = new File(uniprotDirectoryCache + File.separatorChar + accession + ".xml"); 492 FileWriter fw = new FileWriter(f); 493 fw.write(sb.toString()); 494 fw.close(); 495 } 496 497 private StringBuilder fetchUniprotXML(String uniprotURL) 498 throws IOException, CompoundNotFoundException { 499 500 StringBuilder sb = new StringBuilder(); 501 URL uniprot = new URL(uniprotURL); 502 int attempt = 5; 503 List<String> errorCodes = new ArrayList<String>(); 504 while(attempt > 0) { 505 HttpURLConnection uniprotConnection = (HttpURLConnection) uniprot.openConnection(); 506 uniprotConnection.setRequestProperty("User-Agent", "BioJava"); 507 uniprotConnection.connect(); 508 int statusCode = uniprotConnection.getResponseCode(); 509 if (statusCode == 200) { 510 BufferedReader in = new BufferedReader( 511 new InputStreamReader( 512 uniprotConnection.getInputStream())); 513 String inputLine; 514 515 while ((inputLine = in.readLine()) != null) { 516 sb.append(inputLine); 517 } 518 in.close(); 519 return sb; 520 } 521 attempt--; 522 errorCodes.add(String.valueOf(statusCode)); 523 } 524 throw new RemoteException("Couldn't fetch accession from the url " + uniprotURL + " error codes on 5 attempts are " + errorCodes.toString()); 525 } 526 527 /** 528 * @param key 529 * @return A string containing the contents of entry specified by key and if not found returns an empty string 530 * @throws FileNotFoundException 531 * @throws IOException 532 */ 533 private StringBuilder fetchFromCache(String key) 534 throws FileNotFoundException, IOException { 535 int index; 536 File f = new File(uniprotDirectoryCache + File.separatorChar + key + ".xml"); 537 StringBuilder sb = new StringBuilder(); 538 if (f.exists()) { 539 FileReader fr = new FileReader(f); 540 int size = (int) f.length(); 541 char[] data = new char[size]; 542 fr.read(data); 543 fr.close(); 544 sb.append(data); 545 index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath 546 if (index != -1) { 547 int lastIndex = sb.indexOf(">", index); 548 sb.replace(index, lastIndex, ""); 549 } 550 } 551 return sb; 552 } 553 554 /** 555 * 556 * @param uniprotDoc 557 * @return 558 */ 559 private String getSequence(Document uniprotDoc) { 560 561 try { 562 Element uniprotElement = uniprotDoc.getDocumentElement(); 563 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 564 Element sequenceElement = XMLHelper.selectSingleElement(entryElement, "sequence"); 565 566 String seqdata = sequenceElement.getTextContent(); 567 568 return seqdata; 569 } catch (XPathExpressionException e) { 570 logger.error("Problems while parsing sequence in UniProt XML: {}. Sequence will be blank.", e.getMessage()); 571 return ""; 572 } 573 } 574 575 /** 576 * The current UniProt URL to deal with caching issues. www.uniprot.org is load balanced 577 * but you can access pir.uniprot.org directly. 578 * @return the uniprotbaseURL 579 */ 580 public static String getUniprotbaseURL() { 581 return uniprotbaseURL; 582 } 583 584 /** 585 * @param aUniprotbaseURL the uniprotbaseURL to set 586 */ 587 public static void setUniprotbaseURL(String aUniprotbaseURL) { 588 uniprotbaseURL = aUniprotbaseURL; 589 } 590 591 /** 592 * Local directory cache of XML that can be downloaded 593 * @return the uniprotDirectoryCache 594 */ 595 public static String getUniprotDirectoryCache() { 596 return uniprotDirectoryCache; 597 } 598 599 /** 600 * @param aUniprotDirectoryCache the uniprotDirectoryCache to set 601 */ 602 public static void setUniprotDirectoryCache(String aUniprotDirectoryCache) { 603 File f = new File(aUniprotDirectoryCache); 604 if (!f.exists()) { 605 f.mkdirs(); 606 } 607 uniprotDirectoryCache = aUniprotDirectoryCache; 608 } 609 610 public static void main(String[] args) { 611 612 try { 613 UniprotProxySequenceReader<AminoAcidCompound> uniprotSequence = new UniprotProxySequenceReader<AminoAcidCompound>("YA745_GIBZE", AminoAcidCompoundSet.getAminoAcidCompoundSet()); 614 ProteinSequence proteinSequence = new ProteinSequence(uniprotSequence); 615 logger.info("Accession: {}", proteinSequence.getAccession().getID()); 616 logger.info("Sequence: {}", proteinSequence.getSequenceAsString()); 617 } catch (Exception e) { 618 logger.error("Exception: ", e); 619 } 620 621 } 622 623 /** 624 * Get the gene name associated with this sequence. 625 * @return 626 */ 627 public String getGeneName() { 628 if (uniprotDoc == null) { 629 return ""; 630 } 631 try { 632 Element uniprotElement = uniprotDoc.getDocumentElement(); 633 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 634 Element geneElement = XMLHelper.selectSingleElement(entryElement, "gene"); 635 if (geneElement == null) { 636 return ""; 637 } 638 Element nameElement = XMLHelper.selectSingleElement(geneElement, "name"); 639 if (nameElement == null) { 640 return ""; 641 } 642 return nameElement.getTextContent(); 643 } catch (XPathExpressionException e) { 644 logger.error("Problems while parsing gene name in UniProt XML: {}. Gene name will be blank.",e.getMessage()); 645 return ""; 646 } 647 } 648 649 /** 650 * Get the organism name assigned to this sequence 651 * @return 652 */ 653 public String getOrganismName() { 654 if (uniprotDoc == null) { 655 return ""; 656 } 657 try { 658 Element uniprotElement = uniprotDoc.getDocumentElement(); 659 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 660 Element organismElement = XMLHelper.selectSingleElement(entryElement, "organism"); 661 if (organismElement == null) { 662 return ""; 663 } 664 Element nameElement = XMLHelper.selectSingleElement(organismElement, "name"); 665 if (nameElement == null) { 666 return ""; 667 } 668 return nameElement.getTextContent(); 669 } catch (XPathExpressionException e) { 670 logger.error("Problems while parsing organism name in UniProt XML: {}. Organism name will be blank.",e.getMessage()); 671 return ""; 672 } 673 674 } 675 676 /** 677 * Pull UniProt key words which is a mixed bag of words associated with this sequence 678 * @return 679 */ 680 @Override 681 public ArrayList<String> getKeyWords() { 682 ArrayList<String> keyWordsList = new ArrayList<String>(); 683 if (uniprotDoc == null) { 684 return keyWordsList; 685 } 686 try { 687 Element uniprotElement = uniprotDoc.getDocumentElement(); 688 689 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 690 ArrayList<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "keyword"); 691 for (Element element : keyWordElementList) { 692 keyWordsList.add(element.getTextContent()); 693 } 694 } catch (XPathExpressionException e) { 695 logger.error("Problems while parsing keywords in UniProt XML: {}. No keywords will be available.",e.getMessage()); 696 return new ArrayList<String>(); 697 } 698 699 return keyWordsList; 700 } 701 702 /** 703 * The Uniprot mappings to other database identifiers for this sequence 704 * @return 705 */ 706 @Override 707 public LinkedHashMap<String, ArrayList<DBReferenceInfo>> getDatabaseReferences() { 708 LinkedHashMap<String, ArrayList<DBReferenceInfo>> databaseReferencesHashMap = new LinkedHashMap<String, ArrayList<DBReferenceInfo>>(); 709 if (uniprotDoc == null) { 710 return databaseReferencesHashMap; 711 } 712 713 try { 714 Element uniprotElement = uniprotDoc.getDocumentElement(); 715 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 716 ArrayList<Element> dbreferenceElementList = XMLHelper.selectElements(entryElement, "dbReference"); 717 for (Element element : dbreferenceElementList) { 718 String type = element.getAttribute("type"); 719 String id = element.getAttribute("id"); 720 ArrayList<DBReferenceInfo> idlist = databaseReferencesHashMap.get(type); 721 if (idlist == null) { 722 idlist = new ArrayList<DBReferenceInfo>(); 723 databaseReferencesHashMap.put(type, idlist); 724 } 725 DBReferenceInfo dbreferenceInfo = new DBReferenceInfo(type, id); 726 ArrayList<Element> propertyElementList = XMLHelper.selectElements(element, "property"); 727 for (Element propertyElement : propertyElementList) { 728 String propertyType = propertyElement.getAttribute("type"); 729 String propertyValue = propertyElement.getAttribute("value"); 730 dbreferenceInfo.addProperty(propertyType, propertyValue); 731 } 732 733 idlist.add(dbreferenceInfo); 734 } 735 } catch (XPathExpressionException e) { 736 logger.error("Problems while parsing db references in UniProt XML: {}. No db references will be available.",e.getMessage()); 737 return new LinkedHashMap<String, ArrayList<DBReferenceInfo>>(); 738 } 739 740 return databaseReferencesHashMap; 741 } 742}