001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 01-21-2010 021 * 022 * @auther Scooter Willis 023 * 024 */ 025package org.biojava.nbio.core.sequence.loader; 026 027import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 028import org.biojava.nbio.core.sequence.AccessionID; 029import org.biojava.nbio.core.sequence.DataSource; 030import org.biojava.nbio.core.sequence.ProteinSequence; 031import org.biojava.nbio.core.sequence.Strand; 032import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 033import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 034import org.biojava.nbio.core.sequence.features.DBReferenceInfo; 035import org.biojava.nbio.core.sequence.features.DatabaseReferenceInterface; 036import org.biojava.nbio.core.sequence.features.FeaturesKeyWordInterface; 037import org.biojava.nbio.core.sequence.storage.SequenceAsStringHelper; 038import org.biojava.nbio.core.sequence.template.*; 039import org.biojava.nbio.core.util.Equals; 040import org.biojava.nbio.core.util.XMLHelper; 041import org.slf4j.Logger; 042import org.slf4j.LoggerFactory; 043import org.w3c.dom.Document; 044import org.w3c.dom.Element; 045import org.xml.sax.SAXException; 046 047import javax.xml.parsers.ParserConfigurationException; 048import javax.xml.xpath.XPathExpressionException; 049import java.io.*; 050import java.net.HttpURLConnection; 051import java.net.URL; 052import java.rmi.RemoteException; 053import java.util.ArrayList; 054import java.util.Iterator; 055import java.util.LinkedHashMap; 056import java.util.List; 057import java.util.regex.Pattern; 058 059/** 060 * 061 * Pass in a Uniprot ID and this ProxySequenceReader when passed to a ProteinSequence will get the sequence data and other data elements 062 * associated with the ProteinSequence by Uniprot. This is an example of how to map external databases of proteins and features to the BioJava3 063 * ProteinSequence. 064 * Important to call @see setUniprotDirectoryCache to allow caching of XML files so they don't need to be reloaded each time. Does 065 * not manage cache. 066 * @param <C> 067 */ 068public class UniprotProxySequenceReader<C extends Compound> implements ProxySequenceReader<C>, FeaturesKeyWordInterface, DatabaseReferenceInterface { 069 070 private final static Logger logger = LoggerFactory.getLogger(UniprotProxySequenceReader.class); 071 072 /* 073 * Taken from http://www.uniprot.org/help/accession_numbers 074 */ 075 private static final String SPID_PATTERN = "[OPQ][0-9][A-Z0-9]{3}[0-9]"; 076 private static final String TREMBLID_PATTERN = "[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"; 077 public static final Pattern UP_AC_PATTERN = Pattern.compile("(" + SPID_PATTERN + "|" + TREMBLID_PATTERN + ")"); 078 079 public static final String DEFAULT_UNIPROT_BASE_URL = "https://www.uniprot.org"; 080 081 private static String uniprotbaseURL = DEFAULT_UNIPROT_BASE_URL; 082 private static String uniprotDirectoryCache = null; 083 private String sequence; 084 private CompoundSet<C> compoundSet; 085 private List<C> parsedCompounds = new ArrayList<C>(); 086 Document uniprotDoc; 087 088 /** 089 * The UniProt id is used to retrieve the UniProt XML which is then parsed as a DOM object 090 * so we know everything about the protein. If an error occurs throw an exception. We could 091 * have a bad uniprot id or network error 092 * @param accession 093 * @param compoundSet 094 * @throws CompoundNotFoundException 095 * @throws IOException if problems while reading the UniProt XML 096 */ 097 public UniprotProxySequenceReader(String accession, CompoundSet<C> compoundSet) throws CompoundNotFoundException, IOException { 098 if (!UP_AC_PATTERN.matcher(accession.toUpperCase()).matches()) { 099 throw new IllegalArgumentException("Accession provided " + accession + " doesn't comply with the uniprot acession pattern."); 100 } 101 setCompoundSet(compoundSet); 102 uniprotDoc = this.getUniprotXML(accession); 103 String seq = this.getSequence(uniprotDoc); 104 setContents(seq); 105 } 106 107 /** 108 * The xml is passed in as a DOM object so we know everything about the protein. 109 * If an error occurs throw an exception. We could have a bad uniprot id 110 * @param document 111 * @param compoundSet 112 * @throws CompoundNotFoundException 113 */ 114 public UniprotProxySequenceReader(Document document, CompoundSet<C> compoundSet) throws CompoundNotFoundException { 115 setCompoundSet(compoundSet); 116 uniprotDoc = document; 117 String seq = this.getSequence(uniprotDoc); 118 setContents(seq); 119 } 120 /** 121 * The passed in xml is parsed as a DOM object so we know everything about the protein. 122 * If an error occurs throw an exception. We could have a bad uniprot id 123 * @param xml 124 * @param compoundSet 125 * @return UniprotProxySequenceReader 126 * @throws Exception 127 */ 128 public static <C extends Compound> UniprotProxySequenceReader<C> parseUniprotXMLString(String xml, CompoundSet<C> compoundSet) { 129 try { 130 Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(xml.getBytes())); 131 return new UniprotProxySequenceReader<C>(document, compoundSet); 132 } catch (Exception e) { 133 logger.error("Exception on xml parse of: {}", xml); 134 } 135 return null; 136 } 137 138 @Override 139 public void setCompoundSet(CompoundSet<C> compoundSet) { 140 this.compoundSet = compoundSet; 141 } 142 143 /** 144 * Once the sequence is retrieved set the contents and make sure everything this is valid 145 * @param sequence 146 * @throws CompoundNotFoundException 147 */ 148 @Override 149 public void setContents(String sequence) throws CompoundNotFoundException { 150 // Horrendously inefficient - pretty much the way the old BJ did things. 151 // TODO Should be optimised. 152 this.sequence = sequence; 153 this.parsedCompounds.clear(); 154 for (int i = 0; i < sequence.length();) { 155 String compoundStr = null; 156 C compound = null; 157 for (int compoundStrLength = 1; compound == null && compoundStrLength <= compoundSet.getMaxSingleCompoundStringLength(); compoundStrLength++) { 158 compoundStr = sequence.substring(i, i + compoundStrLength); 159 compound = compoundSet.getCompoundForString(compoundStr); 160 } 161 if (compound == null) { 162 throw new CompoundNotFoundException("Compound "+compoundStr+" not found"); 163 } else { 164 i += compoundStr.length(); 165 } 166 this.parsedCompounds.add(compound); 167 } 168 } 169 170 /** 171 * The sequence length 172 * @return 173 */ 174 @Override 175 public int getLength() { 176 return this.parsedCompounds.size(); 177 } 178 179 /** 180 * 181 * @param position 182 * @return 183 */ 184 @Override 185 public C getCompoundAt(int position) { 186 return this.parsedCompounds.get(position - 1); 187 } 188 189 /** 190 * 191 * @param compound 192 * @return 193 */ 194 @Override 195 public int getIndexOf(C compound) { 196 return this.parsedCompounds.indexOf(compound) + 1; 197 } 198 199 /** 200 * 201 * @param compound 202 * @return 203 */ 204 @Override 205 public int getLastIndexOf(C compound) { 206 return this.parsedCompounds.lastIndexOf(compound) + 1; 207 } 208 209 /** 210 * 211 * @return 212 */ 213 @Override 214 public String toString() { 215 return getSequenceAsString(); 216 } 217 218 /** 219 * 220 * @return 221 */ 222 @Override 223 public String getSequenceAsString() { 224 return sequence; 225 } 226 227 /** 228 * 229 * @return 230 */ 231 @Override 232 public List<C> getAsList() { 233 return this.parsedCompounds; 234 } 235 236 @Override 237 public boolean equals(Object o){ 238 239 if(! Equals.classEqual(this, o)) { 240 return false; 241 } 242 243 Sequence<C> other = (Sequence<C>)o; 244 if ( other.getCompoundSet() != getCompoundSet()) 245 return false; 246 247 List<C> rawCompounds = getAsList(); 248 List<C> otherCompounds = other.getAsList(); 249 250 if ( rawCompounds.size() != otherCompounds.size()) 251 return false; 252 253 for (int i = 0 ; i < rawCompounds.size() ; i++){ 254 Compound myCompound = rawCompounds.get(i); 255 Compound otherCompound = otherCompounds.get(i); 256 if ( ! myCompound.equalsIgnoreCase(otherCompound)) 257 return false; 258 } 259 return true; 260 } 261 262 @Override 263 public int hashCode(){ 264 String s = getSequenceAsString(); 265 return s.hashCode(); 266 } 267 268 /** 269 * 270 * @return 271 */ 272 @Override 273 public SequenceView<C> getInverse() { 274 return SequenceMixin.inverse(this); 275 } 276 277 /** 278 * 279 * @param bioBegin 280 * @param bioEnd 281 * @param strand 282 * @return 283 */ 284 public String getSequenceAsString(Integer bioBegin, Integer bioEnd, Strand strand) { 285 SequenceAsStringHelper<C> sequenceAsStringHelper = new SequenceAsStringHelper<C>(); 286 return sequenceAsStringHelper.getSequenceAsString(this.parsedCompounds, compoundSet, bioBegin, bioEnd, strand); 287 } 288 289 /** 290 * 291 * @param bioBegin 292 * @param bioEnd 293 * @return 294 */ 295 @Override 296 public SequenceView<C> getSubSequence(final Integer bioBegin, final Integer bioEnd) { 297 return new SequenceProxyView<C>(UniprotProxySequenceReader.this, bioBegin, bioEnd); 298 } 299 300 /** 301 * 302 * @return 303 */ 304 @Override 305 public Iterator<C> iterator() { 306 return this.parsedCompounds.iterator(); 307 } 308 309 /** 310 * 311 * @return 312 */ 313 @Override 314 public CompoundSet<C> getCompoundSet() { 315 return compoundSet; 316 } 317 318 /** 319 * 320 * @return 321 */ 322 @Override 323 public AccessionID getAccession() { 324 AccessionID accessionID = new AccessionID(); 325 if (uniprotDoc == null) { 326 return accessionID; 327 } 328 try { 329 Element uniprotElement = uniprotDoc.getDocumentElement(); 330 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 331 Element nameElement = XMLHelper.selectSingleElement(entryElement, "name"); 332 accessionID = new AccessionID(nameElement.getTextContent(), DataSource.UNIPROT); 333 } catch (XPathExpressionException e) { 334 logger.error("Exception: ", e); 335 } 336 return accessionID; 337 } 338 339 /** 340 * Pull uniprot accessions associated with this sequence 341 * @return 342 * @throws XPathExpressionException 343 */ 344 public ArrayList<AccessionID> getAccessions() throws XPathExpressionException { 345 ArrayList<AccessionID> accessionList = new ArrayList<AccessionID>(); 346 if (uniprotDoc == null) { 347 return accessionList; 348 } 349 Element uniprotElement = uniprotDoc.getDocumentElement(); 350 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 351 ArrayList<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "accession"); 352 for (Element element : keyWordElementList) { 353 AccessionID accessionID = new AccessionID(element.getTextContent(), DataSource.UNIPROT); 354 accessionList.add(accessionID); 355 } 356 357 return accessionList; 358 } 359 360 /** 361 * Pull uniprot protein aliases associated with this sequence 362 * Provided for backwards compatibility now that we support both 363 * gene and protein aliases via separate methods. 364 * @return 365 * @throws XPathExpressionException 366 */ 367 public ArrayList<String> getAliases() throws XPathExpressionException { 368 369 return getProteinAliases(); 370 } 371 /** 372 * Pull uniprot protein aliases associated with this sequence 373 * @return 374 * @throws XPathExpressionException 375 */ 376 public ArrayList<String> getProteinAliases() throws XPathExpressionException { 377 ArrayList<String> aliasList = new ArrayList<String>(); 378 if (uniprotDoc == null) { 379 return aliasList; 380 } 381 Element uniprotElement = uniprotDoc.getDocumentElement(); 382 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 383 Element proteinElement = XMLHelper.selectSingleElement(entryElement, "protein"); 384 ArrayList<Element> keyWordElementList = XMLHelper.selectElements(proteinElement, "alternativeName"); 385 for (Element element : keyWordElementList) { 386 Element fullNameElement = XMLHelper.selectSingleElement(element, "fullName"); 387 aliasList.add(fullNameElement.getTextContent()); 388 Element shortNameElement = XMLHelper.selectSingleElement(element, "shortName"); 389 if(null != shortNameElement) { 390 String shortName = shortNameElement.getTextContent(); 391 if(null != shortName && !shortName.trim().isEmpty()) { 392 aliasList.add(shortName); 393 } 394 } 395 } 396 keyWordElementList = XMLHelper.selectElements(proteinElement, "recommendedName"); 397 for (Element element : keyWordElementList) { 398 Element fullNameElement = XMLHelper.selectSingleElement(element, "fullName"); 399 aliasList.add(fullNameElement.getTextContent()); 400 Element shortNameElement = XMLHelper.selectSingleElement(element, "shortName"); 401 if(null != shortNameElement) { 402 String shortName = shortNameElement.getTextContent(); 403 if(null != shortName && !shortName.trim().isEmpty()) { 404 aliasList.add(shortName); 405 } 406 } 407 } 408 Element cdAntigen = XMLHelper.selectSingleElement(proteinElement, "cdAntigenName"); 409 if(null != cdAntigen) { 410 String cdAntigenName = cdAntigen.getTextContent(); 411 if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) { 412 aliasList.add(cdAntigenName); 413 } 414 } 415 416 return aliasList; 417 } 418 419 /** 420 * Pull uniprot gene aliases associated with this sequence 421 * @return 422 * @throws XPathExpressionException 423 */ 424 public ArrayList<String> getGeneAliases() throws XPathExpressionException { 425 ArrayList<String> aliasList = new ArrayList<String>(); 426 if (uniprotDoc == null) { 427 return aliasList; 428 } 429 Element uniprotElement = uniprotDoc.getDocumentElement(); 430 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 431 ArrayList<Element> proteinElements = XMLHelper.selectElements(entryElement, "gene"); 432 for(Element proteinElement : proteinElements) { 433 ArrayList<Element> keyWordElementList = XMLHelper.selectElements(proteinElement, "name"); 434 for (Element element : keyWordElementList) { 435 aliasList.add(element.getTextContent()); 436 } 437 } 438 return aliasList; 439 } 440 441 /** 442 * 443 * @param compounds 444 * @return 445 */ 446 @Override 447 public int countCompounds(C... compounds) { 448 throw new UnsupportedOperationException("Not supported yet."); 449 } 450 451 /** 452 * 453 * @param accession 454 * @return 455 * @throws IOException 456 */ 457 private Document getUniprotXML(String accession) throws IOException, CompoundNotFoundException { 458 StringBuilder sb = new StringBuilder(); 459 // try in cache 460 if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0) { 461 sb = fetchFromCache(accession); 462 } 463 464 // http://www.uniprot.org/uniprot/?query=SORBIDRAFT_03g027040&format=xml 465 if (sb.length() == 0) { 466 String uniprotURL = getUniprotbaseURL() + "/uniprot/" + accession.toUpperCase() + ".xml"; 467 logger.info("Loading: {}", uniprotURL); 468 sb = fetchUniprotXML(uniprotURL); 469 470 int index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath 471 if (index != -1) { 472 int lastIndex = sb.indexOf(">", index); 473 sb.replace(index, lastIndex, ""); 474 } 475 if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0) 476 writeCache(sb,accession); 477 } 478 479 logger.info("Load complete"); 480 try { 481 // logger.debug(sb.toString()); 482 Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(sb.toString().getBytes())); 483 return document; 484 } catch (SAXException e) { 485 logger.error("Exception on xml parse of: {}", sb.toString()); 486 } catch (ParserConfigurationException e) { 487 logger.error("Exception on xml parse of: {}", sb.toString()); 488 } 489 return null; 490 } 491 492 private void writeCache(StringBuilder sb, String accession) throws IOException { 493 File f = new File(uniprotDirectoryCache + File.separatorChar + accession + ".xml"); 494 FileWriter fw = new FileWriter(f); 495 fw.write(sb.toString()); 496 fw.close(); 497 } 498 499 /** 500 * Open a URL connection. 501 * 502 * Follows redirects. 503 * @param url 504 * @throws IOException 505 */ 506 private static HttpURLConnection openURLConnection(URL url) throws IOException { 507 // This method should be moved to a utility class in BioJava 5.0 508 509 final int timeout = 5000; 510 final String useragent = "BioJava"; 511 512 HttpURLConnection conn = (HttpURLConnection) url.openConnection(); 513 conn.setRequestProperty("User-Agent", useragent); 514 conn.setInstanceFollowRedirects(true); 515 conn.setConnectTimeout(timeout); 516 conn.setReadTimeout(timeout); 517 518 int status = conn.getResponseCode(); 519 while (status == HttpURLConnection.HTTP_MOVED_TEMP 520 || status == HttpURLConnection.HTTP_MOVED_PERM 521 || status == HttpURLConnection.HTTP_SEE_OTHER) { 522 // Redirect! 523 String newUrl = conn.getHeaderField("Location"); 524 525 if(newUrl.equals(url.toString())) { 526 throw new IOException("Cyclic redirect detected at "+newUrl); 527 } 528 529 // Preserve cookies 530 String cookies = conn.getHeaderField("Set-Cookie"); 531 532 // open the new connection again 533 url = new URL(newUrl); 534 conn.disconnect(); 535 conn = (HttpURLConnection) url.openConnection(); 536 if(cookies != null) { 537 conn.setRequestProperty("Cookie", cookies); 538 } 539 conn.addRequestProperty("User-Agent", useragent); 540 conn.setInstanceFollowRedirects(true); 541 conn.setConnectTimeout(timeout); 542 conn.setReadTimeout(timeout); 543 conn.connect(); 544 545 status = conn.getResponseCode(); 546 547 logger.info("Redirecting from {} to {}", url, newUrl); 548 } 549 conn.connect(); 550 551 return conn; 552 } 553 554 private StringBuilder fetchUniprotXML(String uniprotURL) 555 throws IOException, CompoundNotFoundException { 556 557 StringBuilder sb = new StringBuilder(); 558 URL uniprot = new URL(uniprotURL); 559 int attempt = 5; 560 List<String> errorCodes = new ArrayList<String>(); 561 while(attempt > 0) { 562 HttpURLConnection uniprotConnection = openURLConnection(uniprot); 563 int statusCode = uniprotConnection.getResponseCode(); 564 if (statusCode == HttpURLConnection.HTTP_OK) { 565 BufferedReader in = new BufferedReader( 566 new InputStreamReader( 567 uniprotConnection.getInputStream())); 568 String inputLine; 569 570 while ((inputLine = in.readLine()) != null) { 571 sb.append(inputLine); 572 } 573 in.close(); 574 return sb; 575 } 576 attempt--; 577 errorCodes.add(String.valueOf(statusCode)); 578 } 579 throw new RemoteException("Couldn't fetch accession from the url " + uniprotURL + " error codes on 5 attempts are " + errorCodes.toString()); 580 } 581 582 /** 583 * @param key 584 * @return A string containing the contents of entry specified by key and if not found returns an empty string 585 * @throws FileNotFoundException 586 * @throws IOException 587 */ 588 private StringBuilder fetchFromCache(String key) 589 throws FileNotFoundException, IOException { 590 int index; 591 File f = new File(uniprotDirectoryCache + File.separatorChar + key + ".xml"); 592 StringBuilder sb = new StringBuilder(); 593 if (f.exists()) { 594 FileReader fr = new FileReader(f); 595 int size = (int) f.length(); 596 char[] data = new char[size]; 597 fr.read(data); 598 fr.close(); 599 sb.append(data); 600 index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath 601 if (index != -1) { 602 int lastIndex = sb.indexOf(">", index); 603 sb.replace(index, lastIndex, ""); 604 } 605 } 606 return sb; 607 } 608 609 /** 610 * 611 * @param uniprotDoc 612 * @return 613 */ 614 private String getSequence(Document uniprotDoc) { 615 616 try { 617 Element uniprotElement = uniprotDoc.getDocumentElement(); 618 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 619 Element sequenceElement = XMLHelper.selectSingleElement(entryElement, "sequence"); 620 621 String seqdata = sequenceElement.getTextContent(); 622 623 return seqdata; 624 } catch (XPathExpressionException e) { 625 logger.error("Problems while parsing sequence in UniProt XML: {}. Sequence will be blank.", e.getMessage()); 626 return ""; 627 } 628 } 629 630 /** 631 * The current UniProt URL to deal with caching issues. www.uniprot.org is load balanced 632 * but you can access pir.uniprot.org directly. 633 * @return the uniprotbaseURL 634 */ 635 public static String getUniprotbaseURL() { 636 return uniprotbaseURL; 637 } 638 639 /** 640 * @param aUniprotbaseURL the uniprotbaseURL to set 641 */ 642 public static void setUniprotbaseURL(String aUniprotbaseURL) { 643 uniprotbaseURL = aUniprotbaseURL; 644 } 645 646 /** 647 * Local directory cache of XML that can be downloaded 648 * @return the uniprotDirectoryCache 649 */ 650 public static String getUniprotDirectoryCache() { 651 return uniprotDirectoryCache; 652 } 653 654 /** 655 * @param aUniprotDirectoryCache the uniprotDirectoryCache to set 656 */ 657 public static void setUniprotDirectoryCache(String aUniprotDirectoryCache) { 658 File f = new File(aUniprotDirectoryCache); 659 if (!f.exists()) { 660 f.mkdirs(); 661 } 662 uniprotDirectoryCache = aUniprotDirectoryCache; 663 } 664 665 public static void main(String[] args) { 666 667 try { 668 UniprotProxySequenceReader<AminoAcidCompound> uniprotSequence = new UniprotProxySequenceReader<AminoAcidCompound>("YA745_GIBZE", AminoAcidCompoundSet.getAminoAcidCompoundSet()); 669 ProteinSequence proteinSequence = new ProteinSequence(uniprotSequence); 670 logger.info("Accession: {}", proteinSequence.getAccession().getID()); 671 logger.info("Sequence: {}", proteinSequence.getSequenceAsString()); 672 } catch (Exception e) { 673 logger.error("Exception: ", e); 674 } 675 676 } 677 678 /** 679 * Get the gene name associated with this sequence. 680 * @return 681 */ 682 public String getGeneName() { 683 if (uniprotDoc == null) { 684 return ""; 685 } 686 try { 687 Element uniprotElement = uniprotDoc.getDocumentElement(); 688 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 689 Element geneElement = XMLHelper.selectSingleElement(entryElement, "gene"); 690 if (geneElement == null) { 691 return ""; 692 } 693 Element nameElement = XMLHelper.selectSingleElement(geneElement, "name"); 694 if (nameElement == null) { 695 return ""; 696 } 697 return nameElement.getTextContent(); 698 } catch (XPathExpressionException e) { 699 logger.error("Problems while parsing gene name in UniProt XML: {}. Gene name will be blank.",e.getMessage()); 700 return ""; 701 } 702 } 703 704 /** 705 * Get the organism name assigned to this sequence 706 * @return 707 */ 708 public String getOrganismName() { 709 if (uniprotDoc == null) { 710 return ""; 711 } 712 try { 713 Element uniprotElement = uniprotDoc.getDocumentElement(); 714 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 715 Element organismElement = XMLHelper.selectSingleElement(entryElement, "organism"); 716 if (organismElement == null) { 717 return ""; 718 } 719 Element nameElement = XMLHelper.selectSingleElement(organismElement, "name"); 720 if (nameElement == null) { 721 return ""; 722 } 723 return nameElement.getTextContent(); 724 } catch (XPathExpressionException e) { 725 logger.error("Problems while parsing organism name in UniProt XML: {}. Organism name will be blank.",e.getMessage()); 726 return ""; 727 } 728 729 } 730 731 /** 732 * Pull UniProt key words which is a mixed bag of words associated with this sequence 733 * @return 734 */ 735 @Override 736 public ArrayList<String> getKeyWords() { 737 ArrayList<String> keyWordsList = new ArrayList<String>(); 738 if (uniprotDoc == null) { 739 return keyWordsList; 740 } 741 try { 742 Element uniprotElement = uniprotDoc.getDocumentElement(); 743 744 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 745 ArrayList<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "keyword"); 746 for (Element element : keyWordElementList) { 747 keyWordsList.add(element.getTextContent()); 748 } 749 } catch (XPathExpressionException e) { 750 logger.error("Problems while parsing keywords in UniProt XML: {}. No keywords will be available.",e.getMessage()); 751 return new ArrayList<String>(); 752 } 753 754 return keyWordsList; 755 } 756 757 /** 758 * The Uniprot mappings to other database identifiers for this sequence 759 * @return 760 */ 761 @Override 762 public LinkedHashMap<String, ArrayList<DBReferenceInfo>> getDatabaseReferences() { 763 LinkedHashMap<String, ArrayList<DBReferenceInfo>> databaseReferencesHashMap = new LinkedHashMap<String, ArrayList<DBReferenceInfo>>(); 764 if (uniprotDoc == null) { 765 return databaseReferencesHashMap; 766 } 767 768 try { 769 Element uniprotElement = uniprotDoc.getDocumentElement(); 770 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 771 ArrayList<Element> dbreferenceElementList = XMLHelper.selectElements(entryElement, "dbReference"); 772 for (Element element : dbreferenceElementList) { 773 String type = element.getAttribute("type"); 774 String id = element.getAttribute("id"); 775 ArrayList<DBReferenceInfo> idlist = databaseReferencesHashMap.get(type); 776 if (idlist == null) { 777 idlist = new ArrayList<DBReferenceInfo>(); 778 databaseReferencesHashMap.put(type, idlist); 779 } 780 DBReferenceInfo dbreferenceInfo = new DBReferenceInfo(type, id); 781 ArrayList<Element> propertyElementList = XMLHelper.selectElements(element, "property"); 782 for (Element propertyElement : propertyElementList) { 783 String propertyType = propertyElement.getAttribute("type"); 784 String propertyValue = propertyElement.getAttribute("value"); 785 dbreferenceInfo.addProperty(propertyType, propertyValue); 786 } 787 788 idlist.add(dbreferenceInfo); 789 } 790 } catch (XPathExpressionException e) { 791 logger.error("Problems while parsing db references in UniProt XML: {}. No db references will be available.",e.getMessage()); 792 return new LinkedHashMap<String, ArrayList<DBReferenceInfo>>(); 793 } 794 795 return databaseReferencesHashMap; 796 } 797}