001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 01-21-2010 021 * 022 * @auther Scooter Willis 023 * 024 */ 025package org.biojava.nbio.core.sequence.loader; 026 027import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 028import org.biojava.nbio.core.sequence.AccessionID; 029import org.biojava.nbio.core.sequence.DataSource; 030import org.biojava.nbio.core.sequence.ProteinSequence; 031import org.biojava.nbio.core.sequence.Strand; 032import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 033import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 034import org.biojava.nbio.core.sequence.features.DBReferenceInfo; 035import org.biojava.nbio.core.sequence.features.DatabaseReferenceInterface; 036import org.biojava.nbio.core.sequence.features.FeaturesKeyWordInterface; 037import org.biojava.nbio.core.sequence.storage.SequenceAsStringHelper; 038import org.biojava.nbio.core.sequence.template.*; 039import org.biojava.nbio.core.util.Equals; 040import org.biojava.nbio.core.util.XMLHelper; 041import org.slf4j.Logger; 042import org.slf4j.LoggerFactory; 043import org.w3c.dom.Document; 044import org.w3c.dom.Element; 045import org.xml.sax.SAXException; 046 047import javax.xml.parsers.ParserConfigurationException; 048import javax.xml.xpath.XPathExpressionException; 049import java.io.*; 050import java.net.HttpURLConnection; 051import java.net.URL; 052import java.rmi.RemoteException; 053import java.util.ArrayList; 054import java.util.Iterator; 055import java.util.LinkedHashMap; 056import java.util.List; 057import java.util.regex.Pattern; 058 059/** 060 * 061 * Pass in a Uniprot ID and this ProxySequenceReader when passed to a ProteinSequence will get the sequence data and other data elements 062 * associated with the ProteinSequence by Uniprot. This is an example of how to map external databases of proteins and features to the BioJava3 063 * ProteinSequence. 064 * Important to call @see setUniprotDirectoryCache to allow caching of XML files so they don't need to be reloaded each time. Does 065 * not manage cache. 066 * @param <C> 067 */ 068public class UniprotProxySequenceReader<C extends Compound> implements ProxySequenceReader<C>, FeaturesKeyWordInterface, DatabaseReferenceInterface { 069 070 private final static Logger logger = LoggerFactory.getLogger(UniprotProxySequenceReader.class); 071 072 /* 073 * Taken from http://www.uniprot.org/help/accession_numbers 074 */ 075 private static final String SPID_PATTERN = "[OPQ][0-9][A-Z0-9]{3}[0-9]"; 076 private static final String TREMBLID_PATTERN = "[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"; 077 public static final Pattern UP_AC_PATTERN = Pattern.compile("(" + SPID_PATTERN + "|" + TREMBLID_PATTERN + ")"); 078 079 public static final String DEFAULT_UNIPROT_BASE_URL = "https://www.uniprot.org"; 080 081 private static String uniprotbaseURL = DEFAULT_UNIPROT_BASE_URL; 082 private static String uniprotDirectoryCache = null; 083 private String sequence; 084 private CompoundSet<C> compoundSet; 085 private List<C> parsedCompounds = new ArrayList<C>(); 086 Document uniprotDoc; 087 088 /** 089 * The UniProt id is used to retrieve the UniProt XML which is then parsed as a DOM object 090 * so we know everything about the protein. If an error occurs throw an exception. We could 091 * have a bad uniprot id or network error 092 * @param accession 093 * @param compoundSet 094 * @throws CompoundNotFoundException 095 * @throws IOException if problems while reading the UniProt XML 096 */ 097 public UniprotProxySequenceReader(String accession, CompoundSet<C> compoundSet) throws CompoundNotFoundException, IOException { 098 if (!UP_AC_PATTERN.matcher(accession.toUpperCase()).matches()) { 099 throw new IllegalArgumentException("Accession provided " + accession + " doesn't comply with the uniprot acession pattern."); 100 } 101 setCompoundSet(compoundSet); 102 uniprotDoc = this.getUniprotXML(accession); 103 String seq = this.getSequence(uniprotDoc); 104 setContents(seq); 105 } 106 107 /** 108 * The xml is passed in as a DOM object so we know everything about the protein. 109 * If an error occurs throw an exception. We could have a bad uniprot id 110 * @param document 111 * @param compoundSet 112 * @throws CompoundNotFoundException 113 */ 114 public UniprotProxySequenceReader(Document document, CompoundSet<C> compoundSet) throws CompoundNotFoundException { 115 setCompoundSet(compoundSet); 116 uniprotDoc = document; 117 String seq = this.getSequence(uniprotDoc); 118 setContents(seq); 119 } 120 /** 121 * The passed in xml is parsed as a DOM object so we know everything about the protein. 122 * If an error occurs throw an exception. We could have a bad uniprot id 123 * @param xml 124 * @param compoundSet 125 * @return UniprotProxySequenceReader 126 * @throws Exception 127 */ 128 public static <C extends Compound> UniprotProxySequenceReader<C> parseUniprotXMLString(String xml, CompoundSet<C> compoundSet) { 129 try { 130 Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(xml.getBytes())); 131 return new UniprotProxySequenceReader<C>(document, compoundSet); 132 } catch (Exception e) { 133 logger.error("Exception on xml parse of: {}", xml); 134 } 135 return null; 136 } 137 138 @Override 139 public void setCompoundSet(CompoundSet<C> compoundSet) { 140 this.compoundSet = compoundSet; 141 } 142 143 /** 144 * Once the sequence is retrieved set the contents and make sure everything this is valid 145 * Some uniprot records contain white space in the sequence. We must strip it out so setContents doesn't fail. 146 * @param sequence 147 * @throws CompoundNotFoundException 148 */ 149 @Override 150 public void setContents(String sequence) throws CompoundNotFoundException { 151 // Horrendously inefficient - pretty much the way the old BJ did things. 152 // TODO Should be optimised. 153 // NOTE This chokes on whitespace in the sequence, so whitespace is stripped 154 this.sequence = sequence.replaceAll("\\s", "").trim(); 155 this.parsedCompounds.clear(); 156 for (int i = 0; i < this.sequence.length();) { 157 String compoundStr = null; 158 C compound = null; 159 for (int compoundStrLength = 1; compound == null && compoundStrLength <= compoundSet.getMaxSingleCompoundStringLength(); compoundStrLength++) { 160 compoundStr = this.sequence.substring(i, i + compoundStrLength); 161 compound = compoundSet.getCompoundForString(compoundStr); 162 } 163 if (compound == null) { 164 throw new CompoundNotFoundException("Compound "+compoundStr+" not found"); 165 } else { 166 i += compoundStr.length(); 167 } 168 this.parsedCompounds.add(compound); 169 } 170 } 171 172 /** 173 * The sequence length 174 * @return 175 */ 176 @Override 177 public int getLength() { 178 return this.parsedCompounds.size(); 179 } 180 181 /** 182 * 183 * @param position 184 * @return 185 */ 186 @Override 187 public C getCompoundAt(int position) { 188 return this.parsedCompounds.get(position - 1); 189 } 190 191 /** 192 * 193 * @param compound 194 * @return 195 */ 196 @Override 197 public int getIndexOf(C compound) { 198 return this.parsedCompounds.indexOf(compound) + 1; 199 } 200 201 /** 202 * 203 * @param compound 204 * @return 205 */ 206 @Override 207 public int getLastIndexOf(C compound) { 208 return this.parsedCompounds.lastIndexOf(compound) + 1; 209 } 210 211 /** 212 * 213 * @return 214 */ 215 @Override 216 public String toString() { 217 return getSequenceAsString(); 218 } 219 220 /** 221 * 222 * @return 223 */ 224 @Override 225 public String getSequenceAsString() { 226 return sequence; 227 } 228 229 /** 230 * 231 * @return 232 */ 233 @Override 234 public List<C> getAsList() { 235 return this.parsedCompounds; 236 } 237 238 @Override 239 public boolean equals(Object o){ 240 241 if(! Equals.classEqual(this, o)) { 242 return false; 243 } 244 245 Sequence<C> other = (Sequence<C>)o; 246 if ( other.getCompoundSet() != getCompoundSet()) 247 return false; 248 249 List<C> rawCompounds = getAsList(); 250 List<C> otherCompounds = other.getAsList(); 251 252 if ( rawCompounds.size() != otherCompounds.size()) 253 return false; 254 255 for (int i = 0 ; i < rawCompounds.size() ; i++){ 256 Compound myCompound = rawCompounds.get(i); 257 Compound otherCompound = otherCompounds.get(i); 258 if ( ! myCompound.equalsIgnoreCase(otherCompound)) 259 return false; 260 } 261 return true; 262 } 263 264 @Override 265 public int hashCode(){ 266 String s = getSequenceAsString(); 267 return s.hashCode(); 268 } 269 270 /** 271 * 272 * @return 273 */ 274 @Override 275 public SequenceView<C> getInverse() { 276 return SequenceMixin.inverse(this); 277 } 278 279 /** 280 * 281 * @param bioBegin 282 * @param bioEnd 283 * @param strand 284 * @return 285 */ 286 public String getSequenceAsString(Integer bioBegin, Integer bioEnd, Strand strand) { 287 SequenceAsStringHelper<C> sequenceAsStringHelper = new SequenceAsStringHelper<C>(); 288 return sequenceAsStringHelper.getSequenceAsString(this.parsedCompounds, compoundSet, bioBegin, bioEnd, strand); 289 } 290 291 /** 292 * 293 * @param bioBegin 294 * @param bioEnd 295 * @return 296 */ 297 @Override 298 public SequenceView<C> getSubSequence(final Integer bioBegin, final Integer bioEnd) { 299 return new SequenceProxyView<C>(UniprotProxySequenceReader.this, bioBegin, bioEnd); 300 } 301 302 /** 303 * 304 * @return 305 */ 306 @Override 307 public Iterator<C> iterator() { 308 return this.parsedCompounds.iterator(); 309 } 310 311 /** 312 * 313 * @return 314 */ 315 @Override 316 public CompoundSet<C> getCompoundSet() { 317 return compoundSet; 318 } 319 320 /** 321 * 322 * @return 323 */ 324 @Override 325 public AccessionID getAccession() { 326 AccessionID accessionID = new AccessionID(); 327 if (uniprotDoc == null) { 328 return accessionID; 329 } 330 try { 331 Element uniprotElement = uniprotDoc.getDocumentElement(); 332 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 333 Element nameElement = XMLHelper.selectSingleElement(entryElement, "name"); 334 accessionID = new AccessionID(nameElement.getTextContent(), DataSource.UNIPROT); 335 } catch (XPathExpressionException e) { 336 logger.error("Exception: ", e); 337 } 338 return accessionID; 339 } 340 341 /** 342 * Pull uniprot accessions associated with this sequence 343 * @return 344 * @throws XPathExpressionException 345 */ 346 public ArrayList<AccessionID> getAccessions() throws XPathExpressionException { 347 ArrayList<AccessionID> accessionList = new ArrayList<AccessionID>(); 348 if (uniprotDoc == null) { 349 return accessionList; 350 } 351 Element uniprotElement = uniprotDoc.getDocumentElement(); 352 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 353 ArrayList<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "accession"); 354 for (Element element : keyWordElementList) { 355 AccessionID accessionID = new AccessionID(element.getTextContent(), DataSource.UNIPROT); 356 accessionList.add(accessionID); 357 } 358 359 return accessionList; 360 } 361 362 /** 363 * Pull uniprot protein aliases associated with this sequence 364 * Provided for backwards compatibility now that we support both 365 * gene and protein aliases via separate methods. 366 * @return 367 * @throws XPathExpressionException 368 */ 369 public ArrayList<String> getAliases() throws XPathExpressionException { 370 371 return getProteinAliases(); 372 } 373 /** 374 * Pull uniprot protein aliases associated with this sequence 375 * @return 376 * @throws XPathExpressionException 377 */ 378 public ArrayList<String> getProteinAliases() throws XPathExpressionException { 379 ArrayList<String> aliasList = new ArrayList<String>(); 380 if (uniprotDoc == null) { 381 return aliasList; 382 } 383 Element uniprotElement = uniprotDoc.getDocumentElement(); 384 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 385 Element proteinElement = XMLHelper.selectSingleElement(entryElement, "protein"); 386 387 ArrayList<Element> keyWordElementList; 388 getProteinAliasesFromNameGroup(aliasList, proteinElement); 389 390 keyWordElementList = XMLHelper.selectElements(proteinElement, "component"); 391 for (Element element : keyWordElementList) { 392 getProteinAliasesFromNameGroup(aliasList, element); 393 } 394 395 keyWordElementList = XMLHelper.selectElements(proteinElement, "domain"); 396 for (Element element : keyWordElementList) { 397 getProteinAliasesFromNameGroup(aliasList, element); 398 } 399 400 keyWordElementList = XMLHelper.selectElements(proteinElement, "submittedName"); 401 for (Element element : keyWordElementList) { 402 getProteinAliasesFromNameGroup(aliasList, element); 403 } 404 405 keyWordElementList = XMLHelper.selectElements(proteinElement, "cdAntigenName"); 406 for (Element element : keyWordElementList) { 407 String cdAntigenName = element.getTextContent(); 408 if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) { 409 aliasList.add(cdAntigenName); 410 } 411 } 412 413 keyWordElementList = XMLHelper.selectElements(proteinElement, "innName"); 414 for (Element element : keyWordElementList) { 415 String cdAntigenName = element.getTextContent(); 416 if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) { 417 aliasList.add(cdAntigenName); 418 } 419 } 420 421 keyWordElementList = XMLHelper.selectElements(proteinElement, "biotechName"); 422 for (Element element : keyWordElementList) { 423 String cdAntigenName = element.getTextContent(); 424 if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) { 425 aliasList.add(cdAntigenName); 426 } 427 } 428 429 keyWordElementList = XMLHelper.selectElements(proteinElement, "allergenName"); 430 for (Element element : keyWordElementList) { 431 String cdAntigenName = element.getTextContent(); 432 if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) { 433 aliasList.add(cdAntigenName); 434 } 435 } 436 437 return aliasList; 438 } 439 440 /** 441 * @param aliasList 442 * @param proteinElement 443 * @throws XPathExpressionException 444 */ 445 private void getProteinAliasesFromNameGroup(ArrayList<String> aliasList, Element proteinElement) 446 throws XPathExpressionException { 447 ArrayList<Element> keyWordElementList = XMLHelper.selectElements(proteinElement, "alternativeName"); 448 for (Element element : keyWordElementList) { 449 getProteinAliasesFromElement(aliasList, element); 450 } 451 452 keyWordElementList = XMLHelper.selectElements(proteinElement, "recommendedName"); 453 for (Element element : keyWordElementList) { 454 getProteinAliasesFromElement(aliasList, element); 455 } 456 } 457 458 /** 459 * @param aliasList 460 * @param element 461 * @throws XPathExpressionException 462 */ 463 private void getProteinAliasesFromElement(ArrayList<String> aliasList, Element element) 464 throws XPathExpressionException { 465 Element fullNameElement = XMLHelper.selectSingleElement(element, "fullName"); 466 aliasList.add(fullNameElement.getTextContent()); 467 Element shortNameElement = XMLHelper.selectSingleElement(element, "shortName"); 468 if(null != shortNameElement) { 469 String shortName = shortNameElement.getTextContent(); 470 if(null != shortName && !shortName.trim().isEmpty()) { 471 aliasList.add(shortName); 472 } 473 } 474 } 475 476 /** 477 * Pull uniprot gene aliases associated with this sequence 478 * @return 479 * @throws XPathExpressionException 480 */ 481 public ArrayList<String> getGeneAliases() throws XPathExpressionException { 482 ArrayList<String> aliasList = new ArrayList<String>(); 483 if (uniprotDoc == null) { 484 return aliasList; 485 } 486 Element uniprotElement = uniprotDoc.getDocumentElement(); 487 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 488 ArrayList<Element> proteinElements = XMLHelper.selectElements(entryElement, "gene"); 489 for(Element proteinElement : proteinElements) { 490 ArrayList<Element> keyWordElementList = XMLHelper.selectElements(proteinElement, "name"); 491 for (Element element : keyWordElementList) { 492 aliasList.add(element.getTextContent()); 493 } 494 } 495 return aliasList; 496 } 497 498 /** 499 * 500 * @param compounds 501 * @return 502 */ 503 @Override 504 public int countCompounds(C... compounds) { 505 throw new UnsupportedOperationException("Not supported yet."); 506 } 507 508 /** 509 * 510 * @param accession 511 * @return 512 * @throws IOException 513 */ 514 private Document getUniprotXML(String accession) throws IOException, CompoundNotFoundException { 515 StringBuilder sb = new StringBuilder(); 516 // try in cache 517 if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0) { 518 sb = fetchFromCache(accession); 519 } 520 521 // http://www.uniprot.org/uniprot/?query=SORBIDRAFT_03g027040&format=xml 522 if (sb.length() == 0) { 523 String uniprotURL = getUniprotbaseURL() + "/uniprot/" + accession.toUpperCase() + ".xml"; 524 logger.info("Loading: {}", uniprotURL); 525 sb = fetchUniprotXML(uniprotURL); 526 527 int index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath 528 if (index != -1) { 529 int lastIndex = sb.indexOf(">", index); 530 sb.replace(index, lastIndex, ""); 531 } 532 if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0) 533 writeCache(sb,accession); 534 } 535 536 logger.info("Load complete"); 537 try { 538 // logger.debug(sb.toString()); 539 Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(sb.toString().getBytes())); 540 return document; 541 } catch (SAXException e) { 542 logger.error("Exception on xml parse of: {}", sb.toString()); 543 } catch (ParserConfigurationException e) { 544 logger.error("Exception on xml parse of: {}", sb.toString()); 545 } 546 return null; 547 } 548 549 private void writeCache(StringBuilder sb, String accession) throws IOException { 550 File f = new File(uniprotDirectoryCache + File.separatorChar + accession + ".xml"); 551 FileWriter fw = new FileWriter(f); 552 fw.write(sb.toString()); 553 fw.close(); 554 } 555 556 /** 557 * Open a URL connection. 558 * 559 * Follows redirects. 560 * @param url 561 * @throws IOException 562 */ 563 private static HttpURLConnection openURLConnection(URL url) throws IOException { 564 // This method should be moved to a utility class in BioJava 5.0 565 566 final int timeout = 5000; 567 final String useragent = "BioJava"; 568 569 HttpURLConnection conn = (HttpURLConnection) url.openConnection(); 570 conn.setRequestProperty("User-Agent", useragent); 571 conn.setInstanceFollowRedirects(true); 572 conn.setConnectTimeout(timeout); 573 conn.setReadTimeout(timeout); 574 575 int status = conn.getResponseCode(); 576 while (status == HttpURLConnection.HTTP_MOVED_TEMP 577 || status == HttpURLConnection.HTTP_MOVED_PERM 578 || status == HttpURLConnection.HTTP_SEE_OTHER) { 579 // Redirect! 580 String newUrl = conn.getHeaderField("Location"); 581 582 if(newUrl.equals(url.toString())) { 583 throw new IOException("Cyclic redirect detected at "+newUrl); 584 } 585 586 // Preserve cookies 587 String cookies = conn.getHeaderField("Set-Cookie"); 588 589 // open the new connection again 590 url = new URL(newUrl); 591 conn.disconnect(); 592 conn = (HttpURLConnection) url.openConnection(); 593 if(cookies != null) { 594 conn.setRequestProperty("Cookie", cookies); 595 } 596 conn.addRequestProperty("User-Agent", useragent); 597 conn.setInstanceFollowRedirects(true); 598 conn.setConnectTimeout(timeout); 599 conn.setReadTimeout(timeout); 600 conn.connect(); 601 602 status = conn.getResponseCode(); 603 604 logger.info("Redirecting from {} to {}", url, newUrl); 605 } 606 conn.connect(); 607 608 return conn; 609 } 610 611 private StringBuilder fetchUniprotXML(String uniprotURL) 612 throws IOException, CompoundNotFoundException { 613 614 StringBuilder sb = new StringBuilder(); 615 URL uniprot = new URL(uniprotURL); 616 int attempt = 5; 617 List<String> errorCodes = new ArrayList<String>(); 618 while(attempt > 0) { 619 HttpURLConnection uniprotConnection = openURLConnection(uniprot); 620 int statusCode = uniprotConnection.getResponseCode(); 621 if (statusCode == HttpURLConnection.HTTP_OK) { 622 BufferedReader in = new BufferedReader( 623 new InputStreamReader( 624 uniprotConnection.getInputStream())); 625 String inputLine; 626 627 while ((inputLine = in.readLine()) != null) { 628 sb.append(inputLine); 629 } 630 in.close(); 631 return sb; 632 } 633 attempt--; 634 errorCodes.add(String.valueOf(statusCode)); 635 } 636 throw new RemoteException("Couldn't fetch accession from the url " + uniprotURL + " error codes on 5 attempts are " + errorCodes.toString()); 637 } 638 639 /** 640 * @param key 641 * @return A string containing the contents of entry specified by key and if not found returns an empty string 642 * @throws FileNotFoundException 643 * @throws IOException 644 */ 645 private StringBuilder fetchFromCache(String key) 646 throws FileNotFoundException, IOException { 647 int index; 648 File f = new File(uniprotDirectoryCache + File.separatorChar + key + ".xml"); 649 StringBuilder sb = new StringBuilder(); 650 if (f.exists()) { 651 FileReader fr = new FileReader(f); 652 int size = (int) f.length(); 653 char[] data = new char[size]; 654 fr.read(data); 655 fr.close(); 656 sb.append(data); 657 index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath 658 if (index != -1) { 659 int lastIndex = sb.indexOf(">", index); 660 sb.replace(index, lastIndex, ""); 661 } 662 } 663 return sb; 664 } 665 666 /** 667 * 668 * @param uniprotDoc 669 * @return 670 */ 671 private String getSequence(Document uniprotDoc) { 672 673 try { 674 Element uniprotElement = uniprotDoc.getDocumentElement(); 675 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 676 Element sequenceElement = XMLHelper.selectSingleElement(entryElement, "sequence"); 677 678 String seqdata = sequenceElement.getTextContent(); 679 680 return seqdata; 681 } catch (XPathExpressionException e) { 682 logger.error("Problems while parsing sequence in UniProt XML: {}. Sequence will be blank.", e.getMessage()); 683 return ""; 684 } 685 } 686 687 /** 688 * The current UniProt URL to deal with caching issues. www.uniprot.org is load balanced 689 * but you can access pir.uniprot.org directly. 690 * @return the uniprotbaseURL 691 */ 692 public static String getUniprotbaseURL() { 693 return uniprotbaseURL; 694 } 695 696 /** 697 * @param aUniprotbaseURL the uniprotbaseURL to set 698 */ 699 public static void setUniprotbaseURL(String aUniprotbaseURL) { 700 uniprotbaseURL = aUniprotbaseURL; 701 } 702 703 /** 704 * Local directory cache of XML that can be downloaded 705 * @return the uniprotDirectoryCache 706 */ 707 public static String getUniprotDirectoryCache() { 708 return uniprotDirectoryCache; 709 } 710 711 /** 712 * @param aUniprotDirectoryCache the uniprotDirectoryCache to set 713 */ 714 public static void setUniprotDirectoryCache(String aUniprotDirectoryCache) { 715 File f = new File(aUniprotDirectoryCache); 716 if (!f.exists()) { 717 f.mkdirs(); 718 } 719 uniprotDirectoryCache = aUniprotDirectoryCache; 720 } 721 722 public static void main(String[] args) { 723 724 try { 725 UniprotProxySequenceReader<AminoAcidCompound> uniprotSequence = new UniprotProxySequenceReader<AminoAcidCompound>("YA745_GIBZE", AminoAcidCompoundSet.getAminoAcidCompoundSet()); 726 ProteinSequence proteinSequence = new ProteinSequence(uniprotSequence); 727 logger.info("Accession: {}", proteinSequence.getAccession().getID()); 728 logger.info("Sequence: {}", proteinSequence.getSequenceAsString()); 729 } catch (Exception e) { 730 logger.error("Exception: ", e); 731 } 732 733 } 734 735 /** 736 * Get the gene name associated with this sequence. 737 * @return 738 */ 739 public String getGeneName() { 740 if (uniprotDoc == null) { 741 return ""; 742 } 743 try { 744 Element uniprotElement = uniprotDoc.getDocumentElement(); 745 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 746 Element geneElement = XMLHelper.selectSingleElement(entryElement, "gene"); 747 if (geneElement == null) { 748 return ""; 749 } 750 Element nameElement = XMLHelper.selectSingleElement(geneElement, "name"); 751 if (nameElement == null) { 752 return ""; 753 } 754 return nameElement.getTextContent(); 755 } catch (XPathExpressionException e) { 756 logger.error("Problems while parsing gene name in UniProt XML: {}. Gene name will be blank.",e.getMessage()); 757 return ""; 758 } 759 } 760 761 /** 762 * Get the organism name assigned to this sequence 763 * @return 764 */ 765 public String getOrganismName() { 766 if (uniprotDoc == null) { 767 return ""; 768 } 769 try { 770 Element uniprotElement = uniprotDoc.getDocumentElement(); 771 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 772 Element organismElement = XMLHelper.selectSingleElement(entryElement, "organism"); 773 if (organismElement == null) { 774 return ""; 775 } 776 Element nameElement = XMLHelper.selectSingleElement(organismElement, "name"); 777 if (nameElement == null) { 778 return ""; 779 } 780 return nameElement.getTextContent(); 781 } catch (XPathExpressionException e) { 782 logger.error("Problems while parsing organism name in UniProt XML: {}. Organism name will be blank.",e.getMessage()); 783 return ""; 784 } 785 786 } 787 788 /** 789 * Pull UniProt key words which is a mixed bag of words associated with this sequence 790 * @return 791 */ 792 @Override 793 public ArrayList<String> getKeyWords() { 794 ArrayList<String> keyWordsList = new ArrayList<String>(); 795 if (uniprotDoc == null) { 796 return keyWordsList; 797 } 798 try { 799 Element uniprotElement = uniprotDoc.getDocumentElement(); 800 801 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 802 ArrayList<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "keyword"); 803 for (Element element : keyWordElementList) { 804 keyWordsList.add(element.getTextContent()); 805 } 806 } catch (XPathExpressionException e) { 807 logger.error("Problems while parsing keywords in UniProt XML: {}. No keywords will be available.",e.getMessage()); 808 return new ArrayList<String>(); 809 } 810 811 return keyWordsList; 812 } 813 814 /** 815 * The Uniprot mappings to other database identifiers for this sequence 816 * @return 817 */ 818 @Override 819 public LinkedHashMap<String, ArrayList<DBReferenceInfo>> getDatabaseReferences() { 820 LinkedHashMap<String, ArrayList<DBReferenceInfo>> databaseReferencesHashMap = new LinkedHashMap<String, ArrayList<DBReferenceInfo>>(); 821 if (uniprotDoc == null) { 822 return databaseReferencesHashMap; 823 } 824 825 try { 826 Element uniprotElement = uniprotDoc.getDocumentElement(); 827 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 828 ArrayList<Element> dbreferenceElementList = XMLHelper.selectElements(entryElement, "dbReference"); 829 for (Element element : dbreferenceElementList) { 830 String type = element.getAttribute("type"); 831 String id = element.getAttribute("id"); 832 ArrayList<DBReferenceInfo> idlist = databaseReferencesHashMap.get(type); 833 if (idlist == null) { 834 idlist = new ArrayList<DBReferenceInfo>(); 835 databaseReferencesHashMap.put(type, idlist); 836 } 837 DBReferenceInfo dbreferenceInfo = new DBReferenceInfo(type, id); 838 ArrayList<Element> propertyElementList = XMLHelper.selectElements(element, "property"); 839 for (Element propertyElement : propertyElementList) { 840 String propertyType = propertyElement.getAttribute("type"); 841 String propertyValue = propertyElement.getAttribute("value"); 842 dbreferenceInfo.addProperty(propertyType, propertyValue); 843 } 844 845 idlist.add(dbreferenceInfo); 846 } 847 } catch (XPathExpressionException e) { 848 logger.error("Problems while parsing db references in UniProt XML: {}. No db references will be available.",e.getMessage()); 849 return new LinkedHashMap<String, ArrayList<DBReferenceInfo>>(); 850 } 851 852 return databaseReferencesHashMap; 853 } 854}