001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 01-21-2010 021 * 022 * @auther Scooter Willis 023 * 024 */ 025package org.biojava.nbio.core.sequence.loader; 026 027import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 028import org.biojava.nbio.core.sequence.AccessionID; 029import org.biojava.nbio.core.sequence.DataSource; 030import org.biojava.nbio.core.sequence.ProteinSequence; 031import org.biojava.nbio.core.sequence.Strand; 032import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 033import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 034import org.biojava.nbio.core.sequence.features.DBReferenceInfo; 035import org.biojava.nbio.core.sequence.features.DatabaseReferenceInterface; 036import org.biojava.nbio.core.sequence.features.FeaturesKeyWordInterface; 037import org.biojava.nbio.core.sequence.storage.SequenceAsStringHelper; 038import org.biojava.nbio.core.sequence.template.*; 039import org.biojava.nbio.core.util.Equals; 040import org.biojava.nbio.core.util.XMLHelper; 041import org.slf4j.Logger; 042import org.slf4j.LoggerFactory; 043import org.w3c.dom.Document; 044import org.w3c.dom.Element; 045import org.xml.sax.SAXException; 046 047import javax.xml.parsers.ParserConfigurationException; 048import javax.xml.xpath.XPathExpressionException; 049import java.io.*; 050import java.net.HttpURLConnection; 051import java.net.URL; 052import java.rmi.RemoteException; 053import java.util.*; 054import java.util.regex.Pattern; 055 056/** 057 * 058 * Pass in a Uniprot ID and this ProxySequenceReader when passed to a ProteinSequence will get the sequence data and other data elements 059 * associated with the ProteinSequence by Uniprot. This is an example of how to map external databases of proteins and features to the BioJava3 060 * ProteinSequence. 061 * Important to call @see setUniprotDirectoryCache to allow caching of XML files so they don't need to be reloaded each time. Does 062 * not manage cache. 063 * @param <C> 064 */ 065public class UniprotProxySequenceReader<C extends Compound> implements ProxySequenceReader<C>, FeaturesKeyWordInterface, DatabaseReferenceInterface { 066 067 private final static Logger logger = LoggerFactory.getLogger(UniprotProxySequenceReader.class); 068 069 /* 070 * Taken from http://www.uniprot.org/help/accession_numbers 071 */ 072 private static final String SPID_PATTERN = "[OPQ][0-9][A-Z0-9]{3}[0-9]"; 073 private static final String TREMBLID_PATTERN = "[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"; 074 public static final Pattern UP_AC_PATTERN = Pattern.compile("(" + SPID_PATTERN + "|" + TREMBLID_PATTERN + ")"); 075 076 public static final String DEFAULT_UNIPROT_BASE_URL = "https://www.uniprot.org"; 077 078 private static String uniprotbaseURL = DEFAULT_UNIPROT_BASE_URL; 079 private static String uniprotDirectoryCache = null; 080 private String sequence; 081 private CompoundSet<C> compoundSet; 082 private List<C> parsedCompounds = new ArrayList<>(); 083 Document uniprotDoc; 084 085 /** 086 * The UniProt id is used to retrieve the UniProt XML which is then parsed as a DOM object 087 * so we know everything about the protein. If an error occurs throw an exception. We could 088 * have a bad uniprot id or network error 089 * @param accession 090 * @param compoundSet 091 * @throws CompoundNotFoundException 092 * @throws IOException if problems while reading the UniProt XML 093 */ 094 public UniprotProxySequenceReader(String accession, CompoundSet<C> compoundSet) throws CompoundNotFoundException, IOException { 095 if (!UP_AC_PATTERN.matcher(accession.toUpperCase()).matches()) { 096 throw new IllegalArgumentException("Accession provided " + accession + " doesn't comply with the uniprot acession pattern."); 097 } 098 setCompoundSet(compoundSet); 099 uniprotDoc = this.getUniprotXML(accession); 100 String seq = this.getSequence(uniprotDoc); 101 setContents(seq); 102 } 103 104 /** 105 * The xml is passed in as a DOM object so we know everything about the protein. 106 * If an error occurs throw an exception. We could have a bad uniprot id 107 * @param document 108 * @param compoundSet 109 * @throws CompoundNotFoundException 110 */ 111 public UniprotProxySequenceReader(Document document, CompoundSet<C> compoundSet) throws CompoundNotFoundException { 112 setCompoundSet(compoundSet); 113 uniprotDoc = document; 114 String seq = this.getSequence(uniprotDoc); 115 setContents(seq); 116 } 117 /** 118 * The passed in xml is parsed as a DOM object so we know everything about the protein. 119 * If an error occurs throw an exception. We could have a bad uniprot id 120 * @param xml 121 * @param compoundSet 122 * @return UniprotProxySequenceReader 123 */ 124 public static <C extends Compound> UniprotProxySequenceReader<C> parseUniprotXMLString(String xml, CompoundSet<C> compoundSet) { 125 try { 126 Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(xml.getBytes())); 127 return new UniprotProxySequenceReader<>(document, compoundSet); 128 } catch (Exception e) { 129 logger.error("Exception on xml parse of: {}", xml); 130 } 131 return null; 132 } 133 134 @Override 135 public void setCompoundSet(CompoundSet<C> compoundSet) { 136 this.compoundSet = compoundSet; 137 } 138 139 /** 140 * Once the sequence is retrieved set the contents and make sure everything this is valid 141 * Some uniprot records contain white space in the sequence. We must strip it out so setContents doesn't fail. 142 * @param sequence 143 * @throws CompoundNotFoundException 144 */ 145 @Override 146 public void setContents(String sequence) throws CompoundNotFoundException { 147 // Horrendously inefficient - pretty much the way the old BJ did things. 148 // TODO Should be optimised. 149 // NOTE This chokes on whitespace in the sequence, so whitespace is stripped 150 this.sequence = sequence.replaceAll("\\s", "").trim(); 151 this.parsedCompounds.clear(); 152 for (int i = 0; i < this.sequence.length();) { 153 String compoundStr = null; 154 C compound = null; 155 for (int compoundStrLength = 1; compound == null && compoundStrLength <= compoundSet.getMaxSingleCompoundStringLength(); compoundStrLength++) { 156 compoundStr = this.sequence.substring(i, i + compoundStrLength); 157 compound = compoundSet.getCompoundForString(compoundStr); 158 } 159 if (compound == null) { 160 throw new CompoundNotFoundException("Compound "+compoundStr+" not found"); 161 } else { 162 i += compoundStr.length(); 163 } 164 this.parsedCompounds.add(compound); 165 } 166 } 167 168 /** 169 * The sequence length 170 * @return 171 */ 172 @Override 173 public int getLength() { 174 return this.parsedCompounds.size(); 175 } 176 177 /** 178 * 179 * @param position 180 * @return 181 */ 182 @Override 183 public C getCompoundAt(int position) { 184 return this.parsedCompounds.get(position - 1); 185 } 186 187 /** 188 * 189 * @param compound 190 * @return 191 */ 192 @Override 193 public int getIndexOf(C compound) { 194 return this.parsedCompounds.indexOf(compound) + 1; 195 } 196 197 /** 198 * 199 * @param compound 200 * @return 201 */ 202 @Override 203 public int getLastIndexOf(C compound) { 204 return this.parsedCompounds.lastIndexOf(compound) + 1; 205 } 206 207 /** 208 * 209 * @return 210 */ 211 @Override 212 public String toString() { 213 return getSequenceAsString(); 214 } 215 216 /** 217 * 218 * @return 219 */ 220 @Override 221 public String getSequenceAsString() { 222 return sequence; 223 } 224 225 /** 226 * 227 * @return 228 */ 229 @Override 230 public List<C> getAsList() { 231 return this.parsedCompounds; 232 } 233 234 @Override 235 public boolean equals(Object o){ 236 237 if(! Equals.classEqual(this, o)) { 238 return false; 239 } 240 @SuppressWarnings("unchecked") 241 Sequence<C> other = (Sequence<C>)o; 242 if ( other.getCompoundSet() != getCompoundSet()) 243 return false; 244 245 List<C> rawCompounds = getAsList(); 246 List<C> otherCompounds = other.getAsList(); 247 248 if ( rawCompounds.size() != otherCompounds.size()) 249 return false; 250 251 for (int i = 0 ; i < rawCompounds.size() ; i++){ 252 Compound myCompound = rawCompounds.get(i); 253 Compound otherCompound = otherCompounds.get(i); 254 if ( ! myCompound.equalsIgnoreCase(otherCompound)) 255 return false; 256 } 257 return true; 258 } 259 260 @Override 261 public int hashCode(){ 262 String s = getSequenceAsString(); 263 return s.hashCode(); 264 } 265 266 /** 267 * 268 * @return 269 */ 270 @Override 271 public SequenceView<C> getInverse() { 272 return SequenceMixin.inverse(this); 273 } 274 275 /** 276 * 277 * @param bioBegin 278 * @param bioEnd 279 * @param strand 280 * @return 281 */ 282 public String getSequenceAsString(Integer bioBegin, Integer bioEnd, Strand strand) { 283 SequenceAsStringHelper<C> sequenceAsStringHelper = new SequenceAsStringHelper<>(); 284 return sequenceAsStringHelper.getSequenceAsString(this.parsedCompounds, compoundSet, bioBegin, bioEnd, strand); 285 } 286 287 /** 288 * 289 * @param bioBegin 290 * @param bioEnd 291 * @return 292 */ 293 @Override 294 public SequenceView<C> getSubSequence(final Integer bioBegin, final Integer bioEnd) { 295 return new SequenceProxyView<>(UniprotProxySequenceReader.this, bioBegin, bioEnd); 296 } 297 298 /** 299 * 300 * @return 301 */ 302 @Override 303 public Iterator<C> iterator() { 304 return this.parsedCompounds.iterator(); 305 } 306 307 /** 308 * 309 * @return 310 */ 311 @Override 312 public CompoundSet<C> getCompoundSet() { 313 return compoundSet; 314 } 315 316 /** 317 * 318 * @return 319 */ 320 @Override 321 public AccessionID getAccession() { 322 AccessionID accessionID = new AccessionID(); 323 if (uniprotDoc == null) { 324 return accessionID; 325 } 326 try { 327 Element uniprotElement = uniprotDoc.getDocumentElement(); 328 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 329 Element nameElement = XMLHelper.selectSingleElement(entryElement, "name"); 330 accessionID = new AccessionID(nameElement.getTextContent(), DataSource.UNIPROT); 331 } catch (XPathExpressionException e) { 332 logger.error("Exception: ", e); 333 } 334 return accessionID; 335 } 336 337 /** 338 * Pull uniprot accessions associated with this sequence 339 * @return 340 * @throws XPathExpressionException 341 */ 342 public List<AccessionID> getAccessions() throws XPathExpressionException { 343 List<AccessionID> accessionList = new ArrayList<>(); 344 if (uniprotDoc == null) { 345 return accessionList; 346 } 347 Element uniprotElement = uniprotDoc.getDocumentElement(); 348 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 349 List<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "accession"); 350 for (Element element : keyWordElementList) { 351 AccessionID accessionID = new AccessionID(element.getTextContent(), DataSource.UNIPROT); 352 accessionList.add(accessionID); 353 } 354 355 return accessionList; 356 } 357 358 /** 359 * Pull uniprot protein aliases associated with this sequence 360 * Provided for backwards compatibility now that we support both 361 * gene and protein aliases via separate methods. 362 * @return 363 * @throws XPathExpressionException 364 */ 365 public List<String> getAliases() throws XPathExpressionException { 366 367 return getProteinAliases(); 368 } 369 /** 370 * Pull uniprot protein aliases associated with this sequence 371 * @return 372 * @throws XPathExpressionException 373 */ 374 public List<String> getProteinAliases() throws XPathExpressionException { 375 List<String> aliasList = new ArrayList<>(); 376 if (uniprotDoc == null) { 377 return aliasList; 378 } 379 Element uniprotElement = uniprotDoc.getDocumentElement(); 380 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 381 Element proteinElement = XMLHelper.selectSingleElement(entryElement, "protein"); 382 383 List<Element> keyWordElementList; 384 getProteinAliasesFromNameGroup(aliasList, proteinElement); 385 386 keyWordElementList = XMLHelper.selectElements(proteinElement, "component"); 387 for (Element element : keyWordElementList) { 388 getProteinAliasesFromNameGroup(aliasList, element); 389 } 390 391 keyWordElementList = XMLHelper.selectElements(proteinElement, "domain"); 392 for (Element element : keyWordElementList) { 393 getProteinAliasesFromNameGroup(aliasList, element); 394 } 395 396 keyWordElementList = XMLHelper.selectElements(proteinElement, "submittedName"); 397 for (Element element : keyWordElementList) { 398 getProteinAliasesFromNameGroup(aliasList, element); 399 } 400 401 keyWordElementList = XMLHelper.selectElements(proteinElement, "cdAntigenName"); 402 for (Element element : keyWordElementList) { 403 String cdAntigenName = element.getTextContent(); 404 if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) { 405 aliasList.add(cdAntigenName); 406 } 407 } 408 409 keyWordElementList = XMLHelper.selectElements(proteinElement, "innName"); 410 for (Element element : keyWordElementList) { 411 String cdAntigenName = element.getTextContent(); 412 if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) { 413 aliasList.add(cdAntigenName); 414 } 415 } 416 417 keyWordElementList = XMLHelper.selectElements(proteinElement, "biotechName"); 418 for (Element element : keyWordElementList) { 419 String cdAntigenName = element.getTextContent(); 420 if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) { 421 aliasList.add(cdAntigenName); 422 } 423 } 424 425 keyWordElementList = XMLHelper.selectElements(proteinElement, "allergenName"); 426 for (Element element : keyWordElementList) { 427 String cdAntigenName = element.getTextContent(); 428 if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) { 429 aliasList.add(cdAntigenName); 430 } 431 } 432 433 return aliasList; 434 } 435 436 /** 437 * @param aliasList 438 * @param proteinElement 439 * @throws XPathExpressionException 440 */ 441 private void getProteinAliasesFromNameGroup(List<String> aliasList, Element proteinElement) 442 throws XPathExpressionException { 443 List<Element> keyWordElementList = XMLHelper.selectElements(proteinElement, "alternativeName"); 444 for (Element element : keyWordElementList) { 445 getProteinAliasesFromElement(aliasList, element); 446 } 447 448 keyWordElementList = XMLHelper.selectElements(proteinElement, "recommendedName"); 449 for (Element element : keyWordElementList) { 450 getProteinAliasesFromElement(aliasList, element); 451 } 452 } 453 454 /** 455 * @param aliasList 456 * @param element 457 * @throws XPathExpressionException 458 */ 459 private void getProteinAliasesFromElement(List<String> aliasList, Element element) 460 throws XPathExpressionException { 461 Element fullNameElement = XMLHelper.selectSingleElement(element, "fullName"); 462 aliasList.add(fullNameElement.getTextContent()); 463 Element shortNameElement = XMLHelper.selectSingleElement(element, "shortName"); 464 if(null != shortNameElement) { 465 String shortName = shortNameElement.getTextContent(); 466 if(null != shortName && !shortName.trim().isEmpty()) { 467 aliasList.add(shortName); 468 } 469 } 470 } 471 472 /** 473 * Pull uniprot gene aliases associated with this sequence 474 * @return 475 * @throws XPathExpressionException 476 */ 477 public List<String> getGeneAliases() throws XPathExpressionException { 478 List<String> aliasList = new ArrayList<>(); 479 if (uniprotDoc == null) { 480 return aliasList; 481 } 482 Element uniprotElement = uniprotDoc.getDocumentElement(); 483 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 484 List<Element> proteinElements = XMLHelper.selectElements(entryElement, "gene"); 485 for(Element proteinElement : proteinElements) { 486 List<Element> keyWordElementList = XMLHelper.selectElements(proteinElement, "name"); 487 for (Element element : keyWordElementList) { 488 aliasList.add(element.getTextContent()); 489 } 490 } 491 return aliasList; 492 } 493 494 /** 495 * 496 * @param compounds 497 * @return 498 */ 499 @Override 500 public int countCompounds(C... compounds) { 501 throw new UnsupportedOperationException("Not supported yet."); 502 } 503 504 /** 505 * 506 * @param accession 507 * @return 508 * @throws IOException 509 */ 510 private Document getUniprotXML(String accession) throws IOException, CompoundNotFoundException { 511 StringBuilder sb = new StringBuilder(); 512 // try in cache 513 if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0) { 514 sb = fetchFromCache(accession); 515 } 516 517 // http://www.uniprot.org/uniprot/?query=SORBIDRAFT_03g027040&format=xml 518 if (sb.length() == 0) { 519 String uniprotURL = getUniprotbaseURL() + "/uniprot/" + accession.toUpperCase() + ".xml"; 520 logger.info("Loading: {}", uniprotURL); 521 sb = fetchUniprotXML(uniprotURL); 522 523 int index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath 524 if (index != -1) { 525 int lastIndex = sb.indexOf(">", index); 526 sb.replace(index, lastIndex, ""); 527 } 528 if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0) 529 writeCache(sb,accession); 530 } 531 532 logger.info("Load complete"); 533 try { 534 // logger.debug(sb.toString()); 535 Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(sb.toString().getBytes())); 536 return document; 537 } catch (SAXException | ParserConfigurationException e) { 538 logger.error("Exception on xml parse of: {}", sb.toString()); 539 } 540 return null; 541 } 542 543 private void writeCache(StringBuilder sb, String accession) throws IOException { 544 File f = new File(uniprotDirectoryCache + File.separatorChar + accession + ".xml"); 545 try (FileWriter fw = new FileWriter(f)) { 546 fw.write(sb.toString()); 547 } 548 } 549 550 /** 551 * Open a URL connection. 552 * 553 * Follows redirects. 554 * @param url 555 * @throws IOException 556 */ 557 private static HttpURLConnection openURLConnection(URL url) throws IOException { 558 // This method should be moved to a utility class in BioJava 5.0 559 560 final int timeout = 5000; 561 final String useragent = "BioJava"; 562 563 HttpURLConnection conn = (HttpURLConnection) url.openConnection(); 564 conn.setRequestProperty("User-Agent", useragent); 565 conn.setInstanceFollowRedirects(true); 566 conn.setConnectTimeout(timeout); 567 conn.setReadTimeout(timeout); 568 569 int status = conn.getResponseCode(); 570 while (status == HttpURLConnection.HTTP_MOVED_TEMP 571 || status == HttpURLConnection.HTTP_MOVED_PERM 572 || status == HttpURLConnection.HTTP_SEE_OTHER) { 573 // Redirect! 574 String newUrl = conn.getHeaderField("Location"); 575 576 if(newUrl.equals(url.toString())) { 577 throw new IOException("Cyclic redirect detected at "+newUrl); 578 } 579 580 // Preserve cookies 581 String cookies = conn.getHeaderField("Set-Cookie"); 582 583 // open the new connection again 584 url = new URL(newUrl); 585 conn.disconnect(); 586 conn = (HttpURLConnection) url.openConnection(); 587 if(cookies != null) { 588 conn.setRequestProperty("Cookie", cookies); 589 } 590 conn.addRequestProperty("User-Agent", useragent); 591 conn.setInstanceFollowRedirects(true); 592 conn.setConnectTimeout(timeout); 593 conn.setReadTimeout(timeout); 594 conn.connect(); 595 596 status = conn.getResponseCode(); 597 598 logger.info("Redirecting from {} to {}", url, newUrl); 599 } 600 conn.connect(); 601 602 return conn; 603 } 604 605 private StringBuilder fetchUniprotXML(String uniprotURL) 606 throws IOException, CompoundNotFoundException { 607 608 StringBuilder sb = new StringBuilder(); 609 URL uniprot = new URL(uniprotURL); 610 int attempt = 5; 611 List<String> errorCodes = new ArrayList<>(); 612 while(attempt > 0) { 613 HttpURLConnection uniprotConnection = openURLConnection(uniprot); 614 int statusCode = uniprotConnection.getResponseCode(); 615 if (statusCode == HttpURLConnection.HTTP_OK) { 616 BufferedReader in = new BufferedReader( 617 new InputStreamReader( 618 uniprotConnection.getInputStream())); 619 String inputLine; 620 621 while ((inputLine = in.readLine()) != null) { 622 sb.append(inputLine); 623 } 624 in.close(); 625 return sb; 626 } 627 attempt--; 628 errorCodes.add(String.valueOf(statusCode)); 629 } 630 throw new RemoteException("Couldn't fetch accession from the url " + uniprotURL + " error codes on 5 attempts are " + errorCodes.toString()); 631 } 632 633 /** 634 * @param key 635 * @return A string containing the contents of entry specified by key and if not found returns an empty string 636 * @throws FileNotFoundException 637 * @throws IOException 638 */ 639 private StringBuilder fetchFromCache(String key) 640 throws IOException { 641 int index; 642 File f = new File(uniprotDirectoryCache + File.separatorChar + key + ".xml"); 643 StringBuilder sb = new StringBuilder(); 644 if (f.exists()) { 645 char[] data; 646 try (FileReader fr = new FileReader(f)) { 647 int size = (int) f.length(); 648 data = new char[size]; 649 fr.read(data); 650 } 651 sb.append(data); 652 index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath 653 if (index != -1) { 654 int lastIndex = sb.indexOf(">", index); 655 sb.replace(index, lastIndex, ""); 656 } 657 } 658 return sb; 659 } 660 661 /** 662 * 663 * @param uniprotDoc 664 * @return 665 */ 666 private String getSequence(Document uniprotDoc) { 667 668 try { 669 Element uniprotElement = uniprotDoc.getDocumentElement(); 670 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 671 Element sequenceElement = XMLHelper.selectSingleElement(entryElement, "sequence"); 672 673 String seqdata = sequenceElement.getTextContent(); 674 675 return seqdata; 676 } catch (XPathExpressionException e) { 677 logger.error("Problems while parsing sequence in UniProt XML: {}. Sequence will be blank.", e.getMessage()); 678 return ""; 679 } 680 } 681 682 /** 683 * The current UniProt URL to deal with caching issues. www.uniprot.org is load balanced 684 * but you can access pir.uniprot.org directly. 685 * @return the uniprotbaseURL 686 */ 687 public static String getUniprotbaseURL() { 688 return uniprotbaseURL; 689 } 690 691 /** 692 * @param aUniprotbaseURL the uniprotbaseURL to set 693 */ 694 public static void setUniprotbaseURL(String aUniprotbaseURL) { 695 uniprotbaseURL = aUniprotbaseURL; 696 } 697 698 /** 699 * Local directory cache of XML that can be downloaded 700 * @return the uniprotDirectoryCache 701 */ 702 public static String getUniprotDirectoryCache() { 703 return uniprotDirectoryCache; 704 } 705 706 /** 707 * @param aUniprotDirectoryCache the uniprotDirectoryCache to set 708 */ 709 public static void setUniprotDirectoryCache(String aUniprotDirectoryCache) { 710 File f = new File(aUniprotDirectoryCache); 711 if (!f.exists()) { 712 f.mkdirs(); 713 } 714 uniprotDirectoryCache = aUniprotDirectoryCache; 715 } 716 717 718 /** 719 * Get the gene name associated with this sequence. 720 * @return 721 */ 722 public String getGeneName() { 723 if (uniprotDoc == null) { 724 return ""; 725 } 726 try { 727 Element uniprotElement = uniprotDoc.getDocumentElement(); 728 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 729 Element geneElement = XMLHelper.selectSingleElement(entryElement, "gene"); 730 if (geneElement == null) { 731 return ""; 732 } 733 Element nameElement = XMLHelper.selectSingleElement(geneElement, "name"); 734 if (nameElement == null) { 735 return ""; 736 } 737 return nameElement.getTextContent(); 738 } catch (XPathExpressionException e) { 739 logger.error("Problems while parsing gene name in UniProt XML: {}. Gene name will be blank.",e.getMessage()); 740 return ""; 741 } 742 } 743 744 /** 745 * Get the organism name assigned to this sequence 746 * @return 747 */ 748 public String getOrganismName() { 749 if (uniprotDoc == null) { 750 return ""; 751 } 752 try { 753 Element uniprotElement = uniprotDoc.getDocumentElement(); 754 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 755 Element organismElement = XMLHelper.selectSingleElement(entryElement, "organism"); 756 if (organismElement == null) { 757 return ""; 758 } 759 Element nameElement = XMLHelper.selectSingleElement(organismElement, "name"); 760 if (nameElement == null) { 761 return ""; 762 } 763 return nameElement.getTextContent(); 764 } catch (XPathExpressionException e) { 765 logger.error("Problems while parsing organism name in UniProt XML: {}. Organism name will be blank.",e.getMessage()); 766 return ""; 767 } 768 769 } 770 771 /** 772 * Pull UniProt key words which is a mixed bag of words associated with this sequence 773 * @return 774 */ 775 @Override 776 public List<String> getKeyWords() { 777 List<String> keyWordsList = new ArrayList<>(); 778 if (uniprotDoc == null) { 779 return keyWordsList; 780 } 781 try { 782 Element uniprotElement = uniprotDoc.getDocumentElement(); 783 784 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 785 List<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "keyword"); 786 for (Element element : keyWordElementList) { 787 keyWordsList.add(element.getTextContent()); 788 } 789 } catch (XPathExpressionException e) { 790 logger.error("Problems while parsing keywords in UniProt XML: {}. No keywords will be available.",e.getMessage()); 791 return new ArrayList<>(); 792 } 793 794 return keyWordsList; 795 } 796 797 /** 798 * The Uniprot mappings to other database identifiers for this sequence 799 * @return 800 */ 801 @Override 802 public Map<String, List<DBReferenceInfo>> getDatabaseReferences() { 803 Map<String, List<DBReferenceInfo>> databaseReferencesHashMap = new LinkedHashMap<>(); 804 if (uniprotDoc == null) { 805 return databaseReferencesHashMap; 806 } 807 808 try { 809 Element uniprotElement = uniprotDoc.getDocumentElement(); 810 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 811 List<Element> dbreferenceElementList = XMLHelper.selectElements(entryElement, "dbReference"); 812 for (Element element : dbreferenceElementList) { 813 String type = element.getAttribute("type"); 814 String id = element.getAttribute("id"); 815 List<DBReferenceInfo> idlist = databaseReferencesHashMap.get(type); 816 if (idlist == null) { 817 idlist = new ArrayList<>(); 818 databaseReferencesHashMap.put(type, idlist); 819 } 820 DBReferenceInfo dbreferenceInfo = new DBReferenceInfo(type, id); 821 List<Element> propertyElementList = XMLHelper.selectElements(element, "property"); 822 for (Element propertyElement : propertyElementList) { 823 String propertyType = propertyElement.getAttribute("type"); 824 String propertyValue = propertyElement.getAttribute("value"); 825 dbreferenceInfo.addProperty(propertyType, propertyValue); 826 } 827 828 idlist.add(dbreferenceInfo); 829 } 830 } catch (XPathExpressionException e) { 831 logger.error("Problems while parsing db references in UniProt XML: {}. No db references will be available.",e.getMessage()); 832 return new LinkedHashMap<>(); 833 } 834 835 return databaseReferencesHashMap; 836 } 837}