/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on 01-21-2010
 *
 * @author Scooter Willis
 *
 */
package org.biojava.nbio.core.sequence.loader;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.rmi.RemoteException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathExpressionException;

import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.AccessionID;
import org.biojava.nbio.core.sequence.DataSource;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.Strand;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
import org.biojava.nbio.core.sequence.features.DatabaseReferenceInterface;
import org.biojava.nbio.core.sequence.features.FeaturesKeyWordInterface;
import org.biojava.nbio.core.sequence.storage.SequenceAsStringHelper;
import org.biojava.nbio.core.sequence.template.Compound;
import org.biojava.nbio.core.sequence.template.CompoundSet;
import org.biojava.nbio.core.sequence.template.ProxySequenceReader;
import org.biojava.nbio.core.sequence.template.SequenceMixin;
import org.biojava.nbio.core.sequence.template.SequenceProxyView;
import org.biojava.nbio.core.sequence.template.SequenceView;
import org.biojava.nbio.core.util.XMLHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xml.sax.SAXException;

/**
 * A {@link ProxySequenceReader} backed by a UniProt XML entry.
 * <p>
 * Pass in a UniProt accession (or an already parsed UniProt XML document) and
 * hand this reader to a {@code ProteinSequence}: the sequence data, keywords,
 * database cross references and other elements are then read from the XML.
 * This is an example of how to map external databases of proteins and
 * features to the BioJava3 ProteinSequence.
 * <p>
 * Call {@link #setUniprotDirectoryCache(String)} to enable caching of the
 * downloaded XML files so they do not need to be fetched each time. The cache
 * is never expired or cleaned up by this class.
 *
 * @param <C> the compound type of the sequence
 */
public class UniprotProxySequenceReader<C extends Compound> implements ProxySequenceReader<C>, FeaturesKeyWordInterface, DatabaseReferenceInterface {

    private final static Logger logger = LoggerFactory.getLogger(UniprotProxySequenceReader.class);

    /*
     * Taken from http://www.uniprot.org/help/accession_numbers
     */
    private static final String SPID_PATTERN = "[OPQ][0-9][A-Z0-9]{3}[0-9]";
    private static final String TREMBLID_PATTERN = "[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}";
    public static final Pattern UP_AC_PATTERN = Pattern.compile("(" + SPID_PATTERN + "|" + TREMBLID_PATTERN + ")");

    /** Charset used for every byte/character conversion of the XML, so results do not depend on the platform default. */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    /** Number of HTTP requests made before a download is given up. */
    private static final int MAX_FETCH_ATTEMPTS = 5;

    private static String uniprotbaseURL = "http://www.uniprot.org"; //"http://pir.uniprot.org";
    private static String uniprotDirectoryCache = null;
    private String sequence;
    private CompoundSet<C> compoundSet;
    private List<C> parsedCompounds = new ArrayList<C>();
    Document uniprotDoc;

    /**
     * The UniProt id is used to retrieve the UniProt XML which is then parsed as a DOM object
     * so we know everything about the protein. If an error occurs an exception is thrown: we
     * could have a bad UniProt id or a network error.
     *
     * @param accession the UniProt accession; matched case-insensitively against {@link #UP_AC_PATTERN}
     * @param compoundSet the compound set used to parse the sequence string
     * @throws IllegalArgumentException if the accession does not match {@link #UP_AC_PATTERN}
     * @throws CompoundNotFoundException if the sequence contains a compound unknown to the compound set
     * @throws IOException if problems while reading or parsing the UniProt XML
     */
    public UniprotProxySequenceReader(String accession, CompoundSet<C> compoundSet) throws CompoundNotFoundException, IOException {
        // Locale.ROOT: the default-locale toUpperCase() maps 'i' to a dotted capital I under
        // e.g. a Turkish locale, which would make a valid lower-case accession fail the pattern.
        if (!UP_AC_PATTERN.matcher(accession.toUpperCase(Locale.ROOT)).matches()) {
            throw new IllegalArgumentException("Accession provided " + accession + " doesn't comply with the uniprot acession pattern.");
        }
        setCompoundSet(compoundSet);
        uniprotDoc = this.getUniprotXML(accession);
        String seq = this.getSequence(uniprotDoc);
        setContents(seq);
    }

    /**
     * The XML is passed in as a DOM object so we know everything about the protein.
     *
     * @param document the parsed UniProt XML
     * @param compoundSet the compound set used to parse the sequence string
     * @throws CompoundNotFoundException if the sequence contains a compound unknown to the compound set
     */
    public UniprotProxySequenceReader(Document document, CompoundSet<C> compoundSet) throws CompoundNotFoundException {
        setCompoundSet(compoundSet);
        uniprotDoc = document;
        String seq = this.getSequence(uniprotDoc);
        setContents(seq);
    }

    /**
     * The passed in XML is parsed as a DOM object so we know everything about the protein.
     * Failures are not propagated: any exception raised while parsing the XML or building
     * the reader is logged and {@code null} is returned.
     *
     * @param xml the UniProt XML as a string
     * @param compoundSet the compound set used to parse the sequence string
     * @param <C> the compound type of the sequence
     * @return the reader, or {@code null} if the XML could not be turned into a reader
     */
    public static <C extends Compound> UniprotProxySequenceReader<C> parseUniprotXMLString(String xml, CompoundSet<C> compoundSet) {
        try {
            // Encode explicitly instead of with the platform default charset so that parsing
            // does not depend on the JVM's locale settings.
            Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(xml.getBytes(UTF_8)));
            return new UniprotProxySequenceReader<C>(document, compoundSet);
        } catch (Exception e) {
            // Trailing throwable is logged with its stack trace so the cause is not lost.
            logger.error("Exception on xml parse of: {}", xml, e);
        }
        return null;
    }

    @Override
    public void setCompoundSet(CompoundSet<C> compoundSet) {
        this.compoundSet = compoundSet;
    }

    /**
     * Once the sequence is retrieved set the contents and make sure everything is valid.
     * The string is consumed left to right; at each position the shortest prefix that the
     * compound set recognises is taken as the next compound.
     *
     * @param sequence the sequence string to parse
     * @throws CompoundNotFoundException if no compound matches at some position
     */
    @Override
    public void setContents(String sequence) throws CompoundNotFoundException {
        // Horrendously inefficient - pretty much the way the old BJ did things.
        // TODO Should be optimised.
        this.sequence = sequence;
        this.parsedCompounds.clear();
        final int sequenceLength = sequence.length();
        final int maxCompoundLength = compoundSet.getMaxSingleCompoundStringLength();
        for (int i = 0; i < sequenceLength;) {
            String compoundStr = null;
            C compound = null;
            // Never probe past the end of the sequence: an unknown trailing compound must be
            // reported as CompoundNotFoundException, not as StringIndexOutOfBoundsException.
            final int longestCandidate = Math.min(maxCompoundLength, sequenceLength - i);
            for (int candidateLength = 1; compound == null && candidateLength <= longestCandidate; candidateLength++) {
                compoundStr = sequence.substring(i, i + candidateLength);
                compound = compoundSet.getCompoundForString(compoundStr);
            }
            if (compound == null) {
                throw new CompoundNotFoundException("Compound "+compoundStr+" not found");
            }
            i += compoundStr.length();
            this.parsedCompounds.add(compound);
        }
    }

    /**
     * The sequence length.
     *
     * @return the number of parsed compounds
     */
    @Override
    public int getLength() {
        return this.parsedCompounds.size();
    }

    /**
     * @param position 1-based position in the sequence
     * @return the compound at that position
     */
    @Override
    public C getCompoundAt(int position) {
        return this.parsedCompounds.get(position - 1);
    }

    /**
     * @param compound the compound to look for
     * @return the 1-based position of the first occurrence, or 0 if absent
     */
    @Override
    public int getIndexOf(C compound) {
        return this.parsedCompounds.indexOf(compound) + 1;
    }

    /**
     * @param compound the compound to look for
     * @return the 1-based position of the last occurrence, or 0 if absent
     */
    @Override
    public int getLastIndexOf(C compound) {
        return this.parsedCompounds.lastIndexOf(compound) + 1;
    }

    /**
     * @return the sequence string, same as {@link #getSequenceAsString()}
     */
    @Override
    public String toString() {
        return getSequenceAsString();
    }

    /**
     * @return the sequence string last passed to {@link #setContents(String)}
     */
    @Override
    public String getSequenceAsString() {
        return sequence;
    }

    /**
     * @return the internal list of parsed compounds (not a copy)
     */
    @Override
    public List<C> getAsList() {
        return this.parsedCompounds;
    }

    /**
     * @return the view produced by {@link SequenceMixin#inverse} for this sequence
     */
    @Override
    public SequenceView<C> getInverse() {
        return SequenceMixin.inverse(this);
    }

    /**
     * Renders part of the sequence as a string; the work is delegated to
     * {@link SequenceAsStringHelper}.
     *
     * @param bioBegin begin position passed to the helper
     * @param bioEnd end position passed to the helper
     * @param strand strand passed to the helper
     * @return the string produced by the helper
     */
    public String getSequenceAsString(Integer bioBegin, Integer bioEnd, Strand strand) {
        SequenceAsStringHelper<C> sequenceAsStringHelper = new SequenceAsStringHelper<C>();
        return sequenceAsStringHelper.getSequenceAsString(this.parsedCompounds, compoundSet, bioBegin, bioEnd, strand);
    }

    /**
     * @param bioBegin begin position of the view
     * @param bioEnd end position of the view
     * @return a proxy view over this reader restricted to the given range
     */
    @Override
    public SequenceView<C> getSubSequence(final Integer bioBegin, final Integer bioEnd) {
        return new SequenceProxyView<C>(UniprotProxySequenceReader.this, bioBegin, bioEnd);
    }

    /**
     * @return an iterator over the parsed compounds
     */
    @Override
    public Iterator<C> iterator() {
        return this.parsedCompounds.iterator();
    }

    /**
     * @return the compound set used to parse the sequence
     */
    @Override
    public CompoundSet<C> getCompoundSet() {
        return compoundSet;
    }

    /**
     * Builds the accession id from the text of the {@code name} child of the {@code entry}
     * element. If there is no document, the element is missing or the lookup fails, a
     * default-constructed {@link AccessionID} is returned.
     *
     * @return the accession id, never {@code null}
     */
    @Override
    public AccessionID getAccession() {
        AccessionID accessionID = new AccessionID();
        if (uniprotDoc == null) {
            return accessionID;
        }
        try {
            Element entryElement = selectEntryElement(uniprotDoc);
            Element nameElement = entryElement == null ? null : XMLHelper.selectSingleElement(entryElement, "name");
            if (nameElement != null) {
                accessionID = new AccessionID(nameElement.getTextContent(), DataSource.UNIPROT);
            }
        } catch (XPathExpressionException e) {
            logger.error("Exception: ", e);
        }
        return accessionID;
    }

    /**
     * Pull UniProt accessions associated with this sequence.
     *
     * @return one id per {@code accession} child of the {@code entry} element; empty if none
     * @throws XPathExpressionException if the XML lookup fails
     */
    public ArrayList<AccessionID> getAccessions() throws XPathExpressionException {
        ArrayList<AccessionID> accessionList = new ArrayList<AccessionID>();
        if (uniprotDoc == null) {
            return accessionList;
        }
        Element entryElement = selectEntryElement(uniprotDoc);
        if (entryElement == null) {
            return accessionList;
        }
        List<Element> accessionElements = XMLHelper.selectElements(entryElement, "accession");
        for (Element accessionElement : accessionElements) {
            accessionList.add(new AccessionID(accessionElement.getTextContent(), DataSource.UNIPROT));
        }
        return accessionList;
    }

    /**
     * Pull UniProt protein aliases associated with this sequence.
     *
     * @return the {@code fullName} text of every {@code alternativeName} under {@code protein};
     *         alternative names without a {@code fullName} are skipped
     * @throws XPathExpressionException if the XML lookup fails
     */
    public ArrayList<String> getAliases() throws XPathExpressionException {
        ArrayList<String> aliasList = new ArrayList<String>();
        if (uniprotDoc == null) {
            return aliasList;
        }
        Element entryElement = selectEntryElement(uniprotDoc);
        Element proteinElement = entryElement == null ? null : XMLHelper.selectSingleElement(entryElement, "protein");
        if (proteinElement == null) {
            return aliasList;
        }
        List<Element> alternativeNames = XMLHelper.selectElements(proteinElement, "alternativeName");
        for (Element alternativeName : alternativeNames) {
            Element fullNameElement = XMLHelper.selectSingleElement(alternativeName, "fullName");
            if (fullNameElement != null) {
                aliasList.add(fullNameElement.getTextContent());
            }
        }
        return aliasList;
    }

    /**
     * Counts how often the given compounds occur in the sequence. A compound listed more
     * than once is counted once per listing.
     *
     * @param compounds the compounds to count
     * @return the summed number of occurrences; 0 if {@code compounds} is {@code null} or empty
     */
    @Override
    public int countCompounds(C... compounds) {
        if (compounds == null) {
            return 0;
        }
        int count = 0;
        for (C compound : compounds) {
            count += Collections.frequency(this.parsedCompounds, compound);
        }
        return count;
    }

    /**
     * Loads the UniProt XML for an accession, from the directory cache when one is configured
     * and holds the entry, otherwise from the UniProt web site (writing the result to the
     * cache when one is configured).
     *
     * @param accession the UniProt accession
     * @return the parsed document, never {@code null}
     * @throws IOException if the XML cannot be fetched, cached or parsed
     */
    private Document getUniprotXML(String accession) throws IOException, CompoundNotFoundException {
        StringBuilder sb = new StringBuilder();
        final boolean cacheEnabled = uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0;
        // try in cache
        if (cacheEnabled) {
            sb = fetchFromCache(accession);
        }

        // http://www.uniprot.org/uniprot/?query=SORBIDRAFT_03g027040&format=xml
        if (sb.length() == 0) {
            String uniprotURL = getUniprotbaseURL() + "/uniprot/" + accession.toUpperCase(Locale.ROOT) + ".xml";
            logger.info("Loading: {}", uniprotURL);
            sb = fetchUniprotXML(uniprotURL);
            stripNamespace(sb);
            if (cacheEnabled) {
                writeCache(sb, accession);
            }
        }

        logger.info("Load complete");
        try {
            return XMLHelper.inputStreamToDocument(new ByteArrayInputStream(sb.toString().getBytes(UTF_8)));
        } catch (SAXException e) {
            logger.error("Exception on xml parse of: {}", sb.toString());
            // Fail with the declared IOException instead of returning null, which used to
            // surface later as a NullPointerException in the constructor.
            throw new IOException("Could not parse UniProt XML for accession " + accession, e);
        } catch (ParserConfigurationException e) {
            logger.error("Exception on xml parse of: {}", sb.toString());
            throw new IOException("Could not parse UniProt XML for accession " + accession, e);
        }
    }

    /**
     * Removes the text from the first {@code xmlns=} up to (not including) the next
     * {@code >}, i.e. strips out the name space declaration to make it easier on xpath.
     * The buffer is left untouched when either marker is missing.
     *
     * @param xml the XML text, modified in place
     */
    private static void stripNamespace(StringBuilder xml) {
        int index = xml.indexOf("xmlns=");
        if (index == -1) {
            return;
        }
        int lastIndex = xml.indexOf(">", index);
        if (lastIndex != -1) {
            xml.replace(index, lastIndex, "");
        }
    }

    /**
     * Writes the XML to {@code <cache dir>/<accession>.xml}, overwriting any existing file.
     *
     * @param sb the XML text
     * @param accession the accession used as file name
     * @throws IOException if the file cannot be written
     */
    private void writeCache(StringBuilder sb, String accession) throws IOException {
        File f = new File(uniprotDirectoryCache + File.separatorChar + accession + ".xml");
        Writer writer = new OutputStreamWriter(new FileOutputStream(f), UTF_8);
        try {
            writer.write(sb.toString());
        } finally {
            writer.close();
        }
    }

    /**
     * Downloads the given URL, retrying until a request answers with status 200 or
     * {@value #MAX_FETCH_ATTEMPTS} requests have answered with another status. Line
     * terminators of the response are dropped.
     *
     * @param uniprotURL the URL of the UniProt XML entry
     * @return the response body without line terminators
     * @throws IOException if the connection fails or no attempt returned status 200
     *         (the latter as a {@link RemoteException})
     */
    private StringBuilder fetchUniprotXML(String uniprotURL)
            throws IOException, CompoundNotFoundException {

        StringBuilder sb = new StringBuilder();
        URL uniprot = new URL(uniprotURL);
        int attempt = MAX_FETCH_ATTEMPTS;
        List<String> errorCodes = new ArrayList<String>();
        while (attempt > 0) {
            HttpURLConnection uniprotConnection = (HttpURLConnection) uniprot.openConnection();
            uniprotConnection.setRequestProperty("User-Agent", "BioJava");
            uniprotConnection.connect();
            int statusCode = uniprotConnection.getResponseCode();
            if (statusCode == 200) {
                BufferedReader in = new BufferedReader(
                        new InputStreamReader(
                        uniprotConnection.getInputStream(), UTF_8));
                try {
                    String inputLine;
                    while ((inputLine = in.readLine()) != null) {
                        sb.append(inputLine);
                    }
                } finally {
                    // Close even when reading fails so the stream is not leaked.
                    in.close();
                }
                return sb;
            }
            attempt--;
            errorCodes.add(String.valueOf(statusCode));
        }
        throw new RemoteException("Couldn't fetch accession from the url " + uniprotURL + " error codes on " + MAX_FETCH_ATTEMPTS + " attempts are " + errorCodes.toString());
    }

    /**
     * Reads {@code <cache dir>/<key>.xml} and strips its name space declaration.
     *
     * @param key the accession used as file name
     * @return the contents of the cache entry, or an empty buffer if the file does not exist
     * @throws FileNotFoundException if the file disappears or cannot be opened
     * @throws IOException if the file cannot be read
     */
    private StringBuilder fetchFromCache(String key)
            throws FileNotFoundException, IOException {
        File f = new File(uniprotDirectoryCache + File.separatorChar + key + ".xml");
        StringBuilder sb = new StringBuilder();
        if (f.exists()) {
            Reader reader = new InputStreamReader(new FileInputStream(f), UTF_8);
            try {
                // Append only what was actually read: sizing a char[] by the file's byte length
                // leaves trailing NUL characters whenever the file holds multi-byte characters,
                // and a single read() is not guaranteed to fill the array.
                char[] buffer = new char[8192];
                int read;
                while ((read = reader.read(buffer)) != -1) {
                    sb.append(buffer, 0, read);
                }
            } finally {
                reader.close();
            }
            stripNamespace(sb);
        }
        return sb;
    }

    /**
     * Returns the {@code entry} child of the document's root element.
     *
     * @param doc the UniProt document, not {@code null}
     * @return the entry element, or {@code null} if the document has no root element or no entry
     * @throws XPathExpressionException if the XML lookup fails
     */
    private static Element selectEntryElement(Document doc) throws XPathExpressionException {
        Element uniprotElement = doc.getDocumentElement();
        if (uniprotElement == null) {
            return null;
        }
        return XMLHelper.selectSingleElement(uniprotElement, "entry");
    }

    /**
     * Extracts the text of the {@code sequence} child of the {@code entry} element.
     *
     * @param uniprotDoc the UniProt document
     * @return the sequence text, or an empty string if the document is {@code null}, the
     *         element is missing or the lookup fails
     */
    private String getSequence(Document uniprotDoc) {
        if (uniprotDoc == null) {
            logger.error("Problems while parsing sequence in UniProt XML: {}. Sequence will be blank.", "no document");
            return "";
        }
        try {
            Element entryElement = selectEntryElement(uniprotDoc);
            Element sequenceElement = entryElement == null ? null : XMLHelper.selectSingleElement(entryElement, "sequence");
            if (sequenceElement == null) {
                logger.error("Problems while parsing sequence in UniProt XML: {}. Sequence will be blank.", "no sequence element");
                return "";
            }
            return sequenceElement.getTextContent();
        } catch (XPathExpressionException e) {
            logger.error("Problems while parsing sequence in UniProt XML: {}. Sequence will be blank.", e.getMessage());
            return "";
        }
    }

    /**
     * The current UniProt URL to deal with caching issues. www.uniprot.org is load balanced
     * but you can access pir.uniprot.org directly.
     *
     * @return the uniprotbaseURL
     */
    public static String getUniprotbaseURL() {
        return uniprotbaseURL;
    }

    /**
     * @param aUniprotbaseURL the uniprotbaseURL to set
     */
    public static void setUniprotbaseURL(String aUniprotbaseURL) {
        uniprotbaseURL = aUniprotbaseURL;
    }

    /**
     * Local directory cache of XML that can be downloaded.
     *
     * @return the uniprotDirectoryCache
     */
    public static String getUniprotDirectoryCache() {
        return uniprotDirectoryCache;
    }

    /**
     * Sets the cache directory, creating it if it does not exist. Passing {@code null} or an
     * empty string disables caching.
     *
     * @param aUniprotDirectoryCache the uniprotDirectoryCache to set
     */
    public static void setUniprotDirectoryCache(String aUniprotDirectoryCache) {
        if (aUniprotDirectoryCache != null && aUniprotDirectoryCache.length() > 0) {
            File f = new File(aUniprotDirectoryCache);
            if (!f.exists() && !f.mkdirs()) {
                logger.warn("Could not create UniProt cache directory {}", aUniprotDirectoryCache);
            }
        }
        uniprotDirectoryCache = aUniprotDirectoryCache;
    }

    /**
     * Small demo: loads one entry and logs its accession and sequence.
     *
     * @param args optional; {@code args[0]} is the accession to load
     */
    public static void main(String[] args) {
        // NOTE: the default below is a UniProt entry name, not an accession, so it does not
        // match UP_AC_PATTERN and the constructor rejects it; pass an accession as the first
        // argument to actually load an entry.
        String accession = (args != null && args.length > 0) ? args[0] : "YA745_GIBZE";
        try {
            UniprotProxySequenceReader<AminoAcidCompound> uniprotSequence = new UniprotProxySequenceReader<AminoAcidCompound>(accession, AminoAcidCompoundSet.getAminoAcidCompoundSet());
            ProteinSequence proteinSequence = new ProteinSequence(uniprotSequence);
            logger.info("Accession: {}", proteinSequence.getAccession().getID());
            logger.info("Sequence: {}", proteinSequence.getSequenceAsString());
        } catch (Exception e) {
            logger.error("Exception: ", e);
        }
    }

    /**
     * Get the gene name associated with this sequence.
     *
     * @return the text of {@code entry/gene/name}, or an empty string if unavailable
     */
    public String getGeneName() {
        if (uniprotDoc == null) {
            return "";
        }
        try {
            Element entryElement = selectEntryElement(uniprotDoc);
            if (entryElement == null) {
                return "";
            }
            Element geneElement = XMLHelper.selectSingleElement(entryElement, "gene");
            if (geneElement == null) {
                return "";
            }
            Element nameElement = XMLHelper.selectSingleElement(geneElement, "name");
            if (nameElement == null) {
                return "";
            }
            return nameElement.getTextContent();
        } catch (XPathExpressionException e) {
            logger.error("Problems while parsing gene name in UniProt XML: {}. Gene name will be blank.",e.getMessage());
            return "";
        }
    }

    /**
     * Get the organism name assigned to this sequence.
     *
     * @return the text of {@code entry/organism/name}, or an empty string if unavailable
     */
    public String getOrganismName() {
        if (uniprotDoc == null) {
            return "";
        }
        try {
            Element entryElement = selectEntryElement(uniprotDoc);
            if (entryElement == null) {
                return "";
            }
            Element organismElement = XMLHelper.selectSingleElement(entryElement, "organism");
            if (organismElement == null) {
                return "";
            }
            Element nameElement = XMLHelper.selectSingleElement(organismElement, "name");
            if (nameElement == null) {
                return "";
            }
            return nameElement.getTextContent();
        } catch (XPathExpressionException e) {
            logger.error("Problems while parsing organism name in UniProt XML: {}. Organism name will be blank.",e.getMessage());
            return "";
        }
    }

    /**
     * Pull UniProt key words which is a mixed bag of words associated with this sequence.
     *
     * @return the text of every {@code keyword} child of the entry; empty if unavailable
     */
    @Override
    public ArrayList<String> getKeyWords() {
        ArrayList<String> keyWordsList = new ArrayList<String>();
        if (uniprotDoc == null) {
            return keyWordsList;
        }
        try {
            Element entryElement = selectEntryElement(uniprotDoc);
            if (entryElement == null) {
                return keyWordsList;
            }
            List<Element> keyWordElements = XMLHelper.selectElements(entryElement, "keyword");
            for (Element keyWordElement : keyWordElements) {
                keyWordsList.add(keyWordElement.getTextContent());
            }
        } catch (XPathExpressionException e) {
            logger.error("Problems while parsing keywords in UniProt XML: {}. No keywords will be available.",e.getMessage());
            return new ArrayList<String>();
        }

        return keyWordsList;
    }

    /**
     * The UniProt mappings to other database identifiers for this sequence.
     *
     * @return the {@code dbReference} elements of the entry grouped by their {@code type}
     *         attribute, in document order; empty if unavailable
     */
    @Override
    public LinkedHashMap<String, ArrayList<DBReferenceInfo>> getDatabaseReferences() {
        LinkedHashMap<String, ArrayList<DBReferenceInfo>> databaseReferencesHashMap = new LinkedHashMap<String, ArrayList<DBReferenceInfo>>();
        if (uniprotDoc == null) {
            return databaseReferencesHashMap;
        }

        try {
            Element entryElement = selectEntryElement(uniprotDoc);
            if (entryElement == null) {
                return databaseReferencesHashMap;
            }
            List<Element> dbReferenceElements = XMLHelper.selectElements(entryElement, "dbReference");
            for (Element dbReferenceElement : dbReferenceElements) {
                String type = dbReferenceElement.getAttribute("type");
                String id = dbReferenceElement.getAttribute("id");
                ArrayList<DBReferenceInfo> idlist = databaseReferencesHashMap.get(type);
                if (idlist == null) {
                    idlist = new ArrayList<DBReferenceInfo>();
                    databaseReferencesHashMap.put(type, idlist);
                }
                DBReferenceInfo dbreferenceInfo = new DBReferenceInfo(type, id);
                List<Element> propertyElements = XMLHelper.selectElements(dbReferenceElement, "property");
                for (Element propertyElement : propertyElements) {
                    String propertyType = propertyElement.getAttribute("type");
                    String propertyValue = propertyElement.getAttribute("value");
                    dbreferenceInfo.addProperty(propertyType, propertyValue);
                }

                idlist.add(dbreferenceInfo);
            }
        } catch (XPathExpressionException e) {
            logger.error("Problems while parsing db references in UniProt XML: {}. No db references will be available.",e.getMessage());
            return new LinkedHashMap<String, ArrayList<DBReferenceInfo>>();
        }

        return databaseReferencesHashMap;
    }
}