001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 01-21-2010 021 * 022 * @auther Scooter Willis 023 * 024 */ 025package org.biojava.nbio.core.sequence.loader; 026 027import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 028import org.biojava.nbio.core.sequence.AccessionID; 029import org.biojava.nbio.core.sequence.DataSource; 030import org.biojava.nbio.core.sequence.ProteinSequence; 031import org.biojava.nbio.core.sequence.Strand; 032import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 033import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 034import org.biojava.nbio.core.sequence.features.DBReferenceInfo; 035import org.biojava.nbio.core.sequence.features.DatabaseReferenceInterface; 036import org.biojava.nbio.core.sequence.features.FeaturesKeyWordInterface; 037import org.biojava.nbio.core.sequence.storage.SequenceAsStringHelper; 038import org.biojava.nbio.core.sequence.template.*; 039import org.biojava.nbio.core.util.Equals; 040import org.biojava.nbio.core.util.XMLHelper; 041import org.slf4j.Logger; 042import org.slf4j.LoggerFactory; 043import org.w3c.dom.Document; 044import org.w3c.dom.Element; 045import org.xml.sax.SAXException; 046 047import javax.xml.parsers.ParserConfigurationException; 048import javax.xml.xpath.XPathExpressionException; 049import java.io.*; 050import java.net.HttpURLConnection; 051import java.net.URL; 052import java.rmi.RemoteException; 053import java.util.*; 054import java.util.regex.Pattern; 055 056/** 057 * 058 * Pass in a Uniprot ID and this ProxySequenceReader when passed to a ProteinSequence will get the sequence data and other data elements 059 * associated with the ProteinSequence by Uniprot. This is an example of how to map external databases of proteins and features to the BioJava3 060 * ProteinSequence. 061 * Important to call @see setUniprotDirectoryCache to allow caching of XML files so they don't need to be reloaded each time. Does 062 * not manage cache. 063 * @param <C> 064 */ 065public class UniprotProxySequenceReader<C extends Compound> implements ProxySequenceReader<C>, FeaturesKeyWordInterface, DatabaseReferenceInterface { 066 067 private final static Logger logger = LoggerFactory.getLogger(UniprotProxySequenceReader.class); 068 069 /* 070 * Taken from http://www.uniprot.org/help/accession_numbers 071 */ 072 private static final String SPID_PATTERN = "[OPQ][0-9][A-Z0-9]{3}[0-9]"; 073 private static final String TREMBLID_PATTERN = "[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"; 074 public static final Pattern UP_AC_PATTERN = Pattern.compile("(" + SPID_PATTERN + "|" + TREMBLID_PATTERN + ")"); 075 076 public static final String DEFAULT_UNIPROT_BASE_URL = "https://www.uniprot.org"; 077 078 private static String uniprotbaseURL = DEFAULT_UNIPROT_BASE_URL; 079 private static String uniprotDirectoryCache = null; 080 private String sequence; 081 private CompoundSet<C> compoundSet; 082 private List<C> parsedCompounds = new ArrayList<C>(); 083 Document uniprotDoc; 084 085 /** 086 * The UniProt id is used to retrieve the UniProt XML which is then parsed as a DOM object 087 * so we know everything about the protein. If an error occurs throw an exception. We could 088 * have a bad uniprot id or network error 089 * @param accession 090 * @param compoundSet 091 * @throws CompoundNotFoundException 092 * @throws IOException if problems while reading the UniProt XML 093 */ 094 public UniprotProxySequenceReader(String accession, CompoundSet<C> compoundSet) throws CompoundNotFoundException, IOException { 095 if (!UP_AC_PATTERN.matcher(accession.toUpperCase()).matches()) { 096 throw new IllegalArgumentException("Accession provided " + accession + " doesn't comply with the uniprot acession pattern."); 097 } 098 setCompoundSet(compoundSet); 099 uniprotDoc = this.getUniprotXML(accession); 100 String seq = this.getSequence(uniprotDoc); 101 setContents(seq); 102 } 103 104 /** 105 * The xml is passed in as a DOM object so we know everything about the protein. 106 * If an error occurs throw an exception. We could have a bad uniprot id 107 * @param document 108 * @param compoundSet 109 * @throws CompoundNotFoundException 110 */ 111 public UniprotProxySequenceReader(Document document, CompoundSet<C> compoundSet) throws CompoundNotFoundException { 112 setCompoundSet(compoundSet); 113 uniprotDoc = document; 114 String seq = this.getSequence(uniprotDoc); 115 setContents(seq); 116 } 117 /** 118 * The passed in xml is parsed as a DOM object so we know everything about the protein. 119 * If an error occurs throw an exception. We could have a bad uniprot id 120 * @param xml 121 * @param compoundSet 122 * @return UniprotProxySequenceReader 123 * @throws Exception 124 */ 125 public static <C extends Compound> UniprotProxySequenceReader<C> parseUniprotXMLString(String xml, CompoundSet<C> compoundSet) { 126 try { 127 Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(xml.getBytes())); 128 return new UniprotProxySequenceReader<C>(document, compoundSet); 129 } catch (Exception e) { 130 logger.error("Exception on xml parse of: {}", xml); 131 } 132 return null; 133 } 134 135 @Override 136 public void setCompoundSet(CompoundSet<C> compoundSet) { 137 this.compoundSet = compoundSet; 138 } 139 140 /** 141 * Once the sequence is retrieved set the contents and make sure everything this is valid 142 * Some uniprot records contain white space in the sequence. We must strip it out so setContents doesn't fail. 143 * @param sequence 144 * @throws CompoundNotFoundException 145 */ 146 @Override 147 public void setContents(String sequence) throws CompoundNotFoundException { 148 // Horrendously inefficient - pretty much the way the old BJ did things. 149 // TODO Should be optimised. 150 // NOTE This chokes on whitespace in the sequence, so whitespace is stripped 151 this.sequence = sequence.replaceAll("\\s", "").trim(); 152 this.parsedCompounds.clear(); 153 for (int i = 0; i < this.sequence.length();) { 154 String compoundStr = null; 155 C compound = null; 156 for (int compoundStrLength = 1; compound == null && compoundStrLength <= compoundSet.getMaxSingleCompoundStringLength(); compoundStrLength++) { 157 compoundStr = this.sequence.substring(i, i + compoundStrLength); 158 compound = compoundSet.getCompoundForString(compoundStr); 159 } 160 if (compound == null) { 161 throw new CompoundNotFoundException("Compound "+compoundStr+" not found"); 162 } else { 163 i += compoundStr.length(); 164 } 165 this.parsedCompounds.add(compound); 166 } 167 } 168 169 /** 170 * The sequence length 171 * @return 172 */ 173 @Override 174 public int getLength() { 175 return this.parsedCompounds.size(); 176 } 177 178 /** 179 * 180 * @param position 181 * @return 182 */ 183 @Override 184 public C getCompoundAt(int position) { 185 return this.parsedCompounds.get(position - 1); 186 } 187 188 /** 189 * 190 * @param compound 191 * @return 192 */ 193 @Override 194 public int getIndexOf(C compound) { 195 return this.parsedCompounds.indexOf(compound) + 1; 196 } 197 198 /** 199 * 200 * @param compound 201 * @return 202 */ 203 @Override 204 public int getLastIndexOf(C compound) { 205 return this.parsedCompounds.lastIndexOf(compound) + 1; 206 } 207 208 /** 209 * 210 * @return 211 */ 212 @Override 213 public String toString() { 214 return getSequenceAsString(); 215 } 216 217 /** 218 * 219 * @return 220 */ 221 @Override 222 public String getSequenceAsString() { 223 return sequence; 224 } 225 226 /** 227 * 228 * @return 229 */ 230 @Override 231 public List<C> getAsList() { 232 return this.parsedCompounds; 233 } 234 235 @Override 236 public boolean equals(Object o){ 237 238 if(! Equals.classEqual(this, o)) { 239 return false; 240 } 241 @SuppressWarnings("unchecked") 242 Sequence<C> other = (Sequence<C>)o; 243 if ( other.getCompoundSet() != getCompoundSet()) 244 return false; 245 246 List<C> rawCompounds = getAsList(); 247 List<C> otherCompounds = other.getAsList(); 248 249 if ( rawCompounds.size() != otherCompounds.size()) 250 return false; 251 252 for (int i = 0 ; i < rawCompounds.size() ; i++){ 253 Compound myCompound = rawCompounds.get(i); 254 Compound otherCompound = otherCompounds.get(i); 255 if ( ! myCompound.equalsIgnoreCase(otherCompound)) 256 return false; 257 } 258 return true; 259 } 260 261 @Override 262 public int hashCode(){ 263 String s = getSequenceAsString(); 264 return s.hashCode(); 265 } 266 267 /** 268 * 269 * @return 270 */ 271 @Override 272 public SequenceView<C> getInverse() { 273 return SequenceMixin.inverse(this); 274 } 275 276 /** 277 * 278 * @param bioBegin 279 * @param bioEnd 280 * @param strand 281 * @return 282 */ 283 public String getSequenceAsString(Integer bioBegin, Integer bioEnd, Strand strand) { 284 SequenceAsStringHelper<C> sequenceAsStringHelper = new SequenceAsStringHelper<C>(); 285 return sequenceAsStringHelper.getSequenceAsString(this.parsedCompounds, compoundSet, bioBegin, bioEnd, strand); 286 } 287 288 /** 289 * 290 * @param bioBegin 291 * @param bioEnd 292 * @return 293 */ 294 @Override 295 public SequenceView<C> getSubSequence(final Integer bioBegin, final Integer bioEnd) { 296 return new SequenceProxyView<C>(UniprotProxySequenceReader.this, bioBegin, bioEnd); 297 } 298 299 /** 300 * 301 * @return 302 */ 303 @Override 304 public Iterator<C> iterator() { 305 return this.parsedCompounds.iterator(); 306 } 307 308 /** 309 * 310 * @return 311 */ 312 @Override 313 public CompoundSet<C> getCompoundSet() { 314 return compoundSet; 315 } 316 317 /** 318 * 319 * @return 320 */ 321 @Override 322 public AccessionID getAccession() { 323 AccessionID accessionID = new AccessionID(); 324 if (uniprotDoc == null) { 325 return accessionID; 326 } 327 try { 328 Element uniprotElement = uniprotDoc.getDocumentElement(); 329 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 330 Element nameElement = XMLHelper.selectSingleElement(entryElement, "name"); 331 accessionID = new AccessionID(nameElement.getTextContent(), DataSource.UNIPROT); 332 } catch (XPathExpressionException e) { 333 logger.error("Exception: ", e); 334 } 335 return accessionID; 336 } 337 338 /** 339 * Pull uniprot accessions associated with this sequence 340 * @return 341 * @throws XPathExpressionException 342 */ 343 public ArrayList<AccessionID> getAccessions() throws XPathExpressionException { 344 ArrayList<AccessionID> accessionList = new ArrayList<AccessionID>(); 345 if (uniprotDoc == null) { 346 return accessionList; 347 } 348 Element uniprotElement = uniprotDoc.getDocumentElement(); 349 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 350 ArrayList<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "accession"); 351 for (Element element : keyWordElementList) { 352 AccessionID accessionID = new AccessionID(element.getTextContent(), DataSource.UNIPROT); 353 accessionList.add(accessionID); 354 } 355 356 return accessionList; 357 } 358 359 /** 360 * Pull uniprot protein aliases associated with this sequence 361 * Provided for backwards compatibility now that we support both 362 * gene and protein aliases via separate methods. 363 * @return 364 * @throws XPathExpressionException 365 */ 366 public ArrayList<String> getAliases() throws XPathExpressionException { 367 368 return getProteinAliases(); 369 } 370 /** 371 * Pull uniprot protein aliases associated with this sequence 372 * @return 373 * @throws XPathExpressionException 374 */ 375 public ArrayList<String> getProteinAliases() throws XPathExpressionException { 376 ArrayList<String> aliasList = new ArrayList<String>(); 377 if (uniprotDoc == null) { 378 return aliasList; 379 } 380 Element uniprotElement = uniprotDoc.getDocumentElement(); 381 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 382 Element proteinElement = XMLHelper.selectSingleElement(entryElement, "protein"); 383 384 ArrayList<Element> keyWordElementList; 385 getProteinAliasesFromNameGroup(aliasList, proteinElement); 386 387 keyWordElementList = XMLHelper.selectElements(proteinElement, "component"); 388 for (Element element : keyWordElementList) { 389 getProteinAliasesFromNameGroup(aliasList, element); 390 } 391 392 keyWordElementList = XMLHelper.selectElements(proteinElement, "domain"); 393 for (Element element : keyWordElementList) { 394 getProteinAliasesFromNameGroup(aliasList, element); 395 } 396 397 keyWordElementList = XMLHelper.selectElements(proteinElement, "submittedName"); 398 for (Element element : keyWordElementList) { 399 getProteinAliasesFromNameGroup(aliasList, element); 400 } 401 402 keyWordElementList = XMLHelper.selectElements(proteinElement, "cdAntigenName"); 403 for (Element element : keyWordElementList) { 404 String cdAntigenName = element.getTextContent(); 405 if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) { 406 aliasList.add(cdAntigenName); 407 } 408 } 409 410 keyWordElementList = XMLHelper.selectElements(proteinElement, "innName"); 411 for (Element element : keyWordElementList) { 412 String cdAntigenName = element.getTextContent(); 413 if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) { 414 aliasList.add(cdAntigenName); 415 } 416 } 417 418 keyWordElementList = XMLHelper.selectElements(proteinElement, "biotechName"); 419 for (Element element : keyWordElementList) { 420 String cdAntigenName = element.getTextContent(); 421 if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) { 422 aliasList.add(cdAntigenName); 423 } 424 } 425 426 keyWordElementList = XMLHelper.selectElements(proteinElement, "allergenName"); 427 for (Element element : keyWordElementList) { 428 String cdAntigenName = element.getTextContent(); 429 if(null != cdAntigenName && !cdAntigenName.trim().isEmpty()) { 430 aliasList.add(cdAntigenName); 431 } 432 } 433 434 return aliasList; 435 } 436 437 /** 438 * @param aliasList 439 * @param proteinElement 440 * @throws XPathExpressionException 441 */ 442 private void getProteinAliasesFromNameGroup(ArrayList<String> aliasList, Element proteinElement) 443 throws XPathExpressionException { 444 ArrayList<Element> keyWordElementList = XMLHelper.selectElements(proteinElement, "alternativeName"); 445 for (Element element : keyWordElementList) { 446 getProteinAliasesFromElement(aliasList, element); 447 } 448 449 keyWordElementList = XMLHelper.selectElements(proteinElement, "recommendedName"); 450 for (Element element : keyWordElementList) { 451 getProteinAliasesFromElement(aliasList, element); 452 } 453 } 454 455 /** 456 * @param aliasList 457 * @param element 458 * @throws XPathExpressionException 459 */ 460 private void getProteinAliasesFromElement(ArrayList<String> aliasList, Element element) 461 throws XPathExpressionException { 462 Element fullNameElement = XMLHelper.selectSingleElement(element, "fullName"); 463 aliasList.add(fullNameElement.getTextContent()); 464 Element shortNameElement = XMLHelper.selectSingleElement(element, "shortName"); 465 if(null != shortNameElement) { 466 String shortName = shortNameElement.getTextContent(); 467 if(null != shortName && !shortName.trim().isEmpty()) { 468 aliasList.add(shortName); 469 } 470 } 471 } 472 473 /** 474 * Pull uniprot gene aliases associated with this sequence 475 * @return 476 * @throws XPathExpressionException 477 */ 478 public ArrayList<String> getGeneAliases() throws XPathExpressionException { 479 ArrayList<String> aliasList = new ArrayList<String>(); 480 if (uniprotDoc == null) { 481 return aliasList; 482 } 483 Element uniprotElement = uniprotDoc.getDocumentElement(); 484 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 485 ArrayList<Element> proteinElements = XMLHelper.selectElements(entryElement, "gene"); 486 for(Element proteinElement : proteinElements) { 487 ArrayList<Element> keyWordElementList = XMLHelper.selectElements(proteinElement, "name"); 488 for (Element element : keyWordElementList) { 489 aliasList.add(element.getTextContent()); 490 } 491 } 492 return aliasList; 493 } 494 495 /** 496 * 497 * @param compounds 498 * @return 499 */ 500 @Override 501 public int countCompounds(C... compounds) { 502 throw new UnsupportedOperationException("Not supported yet."); 503 } 504 505 /** 506 * 507 * @param accession 508 * @return 509 * @throws IOException 510 */ 511 private Document getUniprotXML(String accession) throws IOException, CompoundNotFoundException { 512 StringBuilder sb = new StringBuilder(); 513 // try in cache 514 if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0) { 515 sb = fetchFromCache(accession); 516 } 517 518 // http://www.uniprot.org/uniprot/?query=SORBIDRAFT_03g027040&format=xml 519 if (sb.length() == 0) { 520 String uniprotURL = getUniprotbaseURL() + "/uniprot/" + accession.toUpperCase() + ".xml"; 521 logger.info("Loading: {}", uniprotURL); 522 sb = fetchUniprotXML(uniprotURL); 523 524 int index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath 525 if (index != -1) { 526 int lastIndex = sb.indexOf(">", index); 527 sb.replace(index, lastIndex, ""); 528 } 529 if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0) 530 writeCache(sb,accession); 531 } 532 533 logger.info("Load complete"); 534 try { 535 // logger.debug(sb.toString()); 536 Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(sb.toString().getBytes())); 537 return document; 538 } catch (SAXException e) { 539 logger.error("Exception on xml parse of: {}", sb.toString()); 540 } catch (ParserConfigurationException e) { 541 logger.error("Exception on xml parse of: {}", sb.toString()); 542 } 543 return null; 544 } 545 546 private void writeCache(StringBuilder sb, String accession) throws IOException { 547 File f = new File(uniprotDirectoryCache + File.separatorChar + accession + ".xml"); 548 FileWriter fw = new FileWriter(f); 549 fw.write(sb.toString()); 550 fw.close(); 551 } 552 553 /** 554 * Open a URL connection. 555 * 556 * Follows redirects. 557 * @param url 558 * @throws IOException 559 */ 560 private static HttpURLConnection openURLConnection(URL url) throws IOException { 561 // This method should be moved to a utility class in BioJava 5.0 562 563 final int timeout = 5000; 564 final String useragent = "BioJava"; 565 566 HttpURLConnection conn = (HttpURLConnection) url.openConnection(); 567 conn.setRequestProperty("User-Agent", useragent); 568 conn.setInstanceFollowRedirects(true); 569 conn.setConnectTimeout(timeout); 570 conn.setReadTimeout(timeout); 571 572 int status = conn.getResponseCode(); 573 while (status == HttpURLConnection.HTTP_MOVED_TEMP 574 || status == HttpURLConnection.HTTP_MOVED_PERM 575 || status == HttpURLConnection.HTTP_SEE_OTHER) { 576 // Redirect! 577 String newUrl = conn.getHeaderField("Location"); 578 579 if(newUrl.equals(url.toString())) { 580 throw new IOException("Cyclic redirect detected at "+newUrl); 581 } 582 583 // Preserve cookies 584 String cookies = conn.getHeaderField("Set-Cookie"); 585 586 // open the new connection again 587 url = new URL(newUrl); 588 conn.disconnect(); 589 conn = (HttpURLConnection) url.openConnection(); 590 if(cookies != null) { 591 conn.setRequestProperty("Cookie", cookies); 592 } 593 conn.addRequestProperty("User-Agent", useragent); 594 conn.setInstanceFollowRedirects(true); 595 conn.setConnectTimeout(timeout); 596 conn.setReadTimeout(timeout); 597 conn.connect(); 598 599 status = conn.getResponseCode(); 600 601 logger.info("Redirecting from {} to {}", url, newUrl); 602 } 603 conn.connect(); 604 605 return conn; 606 } 607 608 private StringBuilder fetchUniprotXML(String uniprotURL) 609 throws IOException, CompoundNotFoundException { 610 611 StringBuilder sb = new StringBuilder(); 612 URL uniprot = new URL(uniprotURL); 613 int attempt = 5; 614 List<String> errorCodes = new ArrayList<String>(); 615 while(attempt > 0) { 616 HttpURLConnection uniprotConnection = openURLConnection(uniprot); 617 int statusCode = uniprotConnection.getResponseCode(); 618 if (statusCode == HttpURLConnection.HTTP_OK) { 619 BufferedReader in = new BufferedReader( 620 new InputStreamReader( 621 uniprotConnection.getInputStream())); 622 String inputLine; 623 624 while ((inputLine = in.readLine()) != null) { 625 sb.append(inputLine); 626 } 627 in.close(); 628 return sb; 629 } 630 attempt--; 631 errorCodes.add(String.valueOf(statusCode)); 632 } 633 throw new RemoteException("Couldn't fetch accession from the url " + uniprotURL + " error codes on 5 attempts are " + errorCodes.toString()); 634 } 635 636 /** 637 * @param key 638 * @return A string containing the contents of entry specified by key and if not found returns an empty string 639 * @throws FileNotFoundException 640 * @throws IOException 641 */ 642 private StringBuilder fetchFromCache(String key) 643 throws FileNotFoundException, IOException { 644 int index; 645 File f = new File(uniprotDirectoryCache + File.separatorChar + key + ".xml"); 646 StringBuilder sb = new StringBuilder(); 647 if (f.exists()) { 648 FileReader fr = new FileReader(f); 649 int size = (int) f.length(); 650 char[] data = new char[size]; 651 fr.read(data); 652 fr.close(); 653 sb.append(data); 654 index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath 655 if (index != -1) { 656 int lastIndex = sb.indexOf(">", index); 657 sb.replace(index, lastIndex, ""); 658 } 659 } 660 return sb; 661 } 662 663 /** 664 * 665 * @param uniprotDoc 666 * @return 667 */ 668 private String getSequence(Document uniprotDoc) { 669 670 try { 671 Element uniprotElement = uniprotDoc.getDocumentElement(); 672 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 673 Element sequenceElement = XMLHelper.selectSingleElement(entryElement, "sequence"); 674 675 String seqdata = sequenceElement.getTextContent(); 676 677 return seqdata; 678 } catch (XPathExpressionException e) { 679 logger.error("Problems while parsing sequence in UniProt XML: {}. Sequence will be blank.", e.getMessage()); 680 return ""; 681 } 682 } 683 684 /** 685 * The current UniProt URL to deal with caching issues. www.uniprot.org is load balanced 686 * but you can access pir.uniprot.org directly. 687 * @return the uniprotbaseURL 688 */ 689 public static String getUniprotbaseURL() { 690 return uniprotbaseURL; 691 } 692 693 /** 694 * @param aUniprotbaseURL the uniprotbaseURL to set 695 */ 696 public static void setUniprotbaseURL(String aUniprotbaseURL) { 697 uniprotbaseURL = aUniprotbaseURL; 698 } 699 700 /** 701 * Local directory cache of XML that can be downloaded 702 * @return the uniprotDirectoryCache 703 */ 704 public static String getUniprotDirectoryCache() { 705 return uniprotDirectoryCache; 706 } 707 708 /** 709 * @param aUniprotDirectoryCache the uniprotDirectoryCache to set 710 */ 711 public static void setUniprotDirectoryCache(String aUniprotDirectoryCache) { 712 File f = new File(aUniprotDirectoryCache); 713 if (!f.exists()) { 714 f.mkdirs(); 715 } 716 uniprotDirectoryCache = aUniprotDirectoryCache; 717 } 718 719 public static void main(String[] args) { 720 721 try { 722 UniprotProxySequenceReader<AminoAcidCompound> uniprotSequence = new UniprotProxySequenceReader<AminoAcidCompound>("YA745_GIBZE", AminoAcidCompoundSet.getAminoAcidCompoundSet()); 723 ProteinSequence proteinSequence = new ProteinSequence(uniprotSequence); 724 logger.info("Accession: {}", proteinSequence.getAccession().getID()); 725 logger.info("Sequence: {}", proteinSequence.getSequenceAsString()); 726 } catch (Exception e) { 727 logger.error("Exception: ", e); 728 } 729 730 } 731 732 /** 733 * Get the gene name associated with this sequence. 734 * @return 735 */ 736 public String getGeneName() { 737 if (uniprotDoc == null) { 738 return ""; 739 } 740 try { 741 Element uniprotElement = uniprotDoc.getDocumentElement(); 742 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 743 Element geneElement = XMLHelper.selectSingleElement(entryElement, "gene"); 744 if (geneElement == null) { 745 return ""; 746 } 747 Element nameElement = XMLHelper.selectSingleElement(geneElement, "name"); 748 if (nameElement == null) { 749 return ""; 750 } 751 return nameElement.getTextContent(); 752 } catch (XPathExpressionException e) { 753 logger.error("Problems while parsing gene name in UniProt XML: {}. Gene name will be blank.",e.getMessage()); 754 return ""; 755 } 756 } 757 758 /** 759 * Get the organism name assigned to this sequence 760 * @return 761 */ 762 public String getOrganismName() { 763 if (uniprotDoc == null) { 764 return ""; 765 } 766 try { 767 Element uniprotElement = uniprotDoc.getDocumentElement(); 768 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 769 Element organismElement = XMLHelper.selectSingleElement(entryElement, "organism"); 770 if (organismElement == null) { 771 return ""; 772 } 773 Element nameElement = XMLHelper.selectSingleElement(organismElement, "name"); 774 if (nameElement == null) { 775 return ""; 776 } 777 return nameElement.getTextContent(); 778 } catch (XPathExpressionException e) { 779 logger.error("Problems while parsing organism name in UniProt XML: {}. Organism name will be blank.",e.getMessage()); 780 return ""; 781 } 782 783 } 784 785 /** 786 * Pull UniProt key words which is a mixed bag of words associated with this sequence 787 * @return 788 */ 789 @Override 790 public ArrayList<String> getKeyWords() { 791 ArrayList<String> keyWordsList = new ArrayList<String>(); 792 if (uniprotDoc == null) { 793 return keyWordsList; 794 } 795 try { 796 Element uniprotElement = uniprotDoc.getDocumentElement(); 797 798 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 799 ArrayList<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "keyword"); 800 for (Element element : keyWordElementList) { 801 keyWordsList.add(element.getTextContent()); 802 } 803 } catch (XPathExpressionException e) { 804 logger.error("Problems while parsing keywords in UniProt XML: {}. No keywords will be available.",e.getMessage()); 805 return new ArrayList<String>(); 806 } 807 808 return keyWordsList; 809 } 810 811 /** 812 * The Uniprot mappings to other database identifiers for this sequence 813 * @return 814 */ 815 @Override 816 public Map<String, List<DBReferenceInfo>> getDatabaseReferences() { 817 Map<String, List<DBReferenceInfo>> databaseReferencesHashMap = new LinkedHashMap<>(); 818 if (uniprotDoc == null) { 819 return databaseReferencesHashMap; 820 } 821 822 try { 823 Element uniprotElement = uniprotDoc.getDocumentElement(); 824 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry"); 825 ArrayList<Element> dbreferenceElementList = XMLHelper.selectElements(entryElement, "dbReference"); 826 for (Element element : dbreferenceElementList) { 827 String type = element.getAttribute("type"); 828 String id = element.getAttribute("id"); 829 List<DBReferenceInfo> idlist = databaseReferencesHashMap.get(type); 830 if (idlist == null) { 831 idlist = new ArrayList<DBReferenceInfo>(); 832 databaseReferencesHashMap.put(type, idlist); 833 } 834 DBReferenceInfo dbreferenceInfo = new DBReferenceInfo(type, id); 835 ArrayList<Element> propertyElementList = XMLHelper.selectElements(element, "property"); 836 for (Element propertyElement : propertyElementList) { 837 String propertyType = propertyElement.getAttribute("type"); 838 String propertyValue = propertyElement.getAttribute("value"); 839 dbreferenceInfo.addProperty(propertyType, propertyValue); 840 } 841 842 idlist.add(dbreferenceInfo); 843 } 844 } catch (XPathExpressionException e) { 845 logger.error("Problems while parsing db references in UniProt XML: {}. No db references will be available.",e.getMessage()); 846 return new LinkedHashMap<>(); 847 } 848 849 return databaseReferencesHashMap; 850 } 851}