/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.structure.align.client;


import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.biojava.nbio.structure.BioAssemblyIdentifier;
import org.biojava.nbio.structure.ResidueRange;
import org.biojava.nbio.structure.Structure;
import org.biojava.nbio.structure.StructureException;
import org.biojava.nbio.structure.StructureIdentifier;
import org.biojava.nbio.structure.SubstructureIdentifier;
import org.biojava.nbio.structure.URLIdentifier;
import org.biojava.nbio.structure.align.util.AtomCache;
import org.biojava.nbio.structure.cath.CathDomain;
import org.biojava.nbio.structure.cath.CathFactory;
import org.biojava.nbio.structure.domain.PDPDomain;
import org.biojava.nbio.structure.domain.PDPProvider;
import org.biojava.nbio.structure.domain.RemotePDPProvider;
import org.biojava.nbio.structure.ecod.EcodFactory;
import org.biojava.nbio.structure.io.util.FileDownloadUtils;
import org.biojava.nbio.structure.scop.ScopDatabase;
import org.biojava.nbio.structure.scop.ScopDomain;
import org.biojava.nbio.structure.scop.ScopFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * A utility class that makes working with names of structures, domains and ranges easier.
 *
 * Accepts a wide range of identifier formats, including {@link ScopDomain},
 * {@link CathDomain}, PDP domains, and {@link SubstructureIdentifier} residue
 * ranges.
 *
 * Where possible, data is extracted from the input string. Otherwise, range
 * information may be loaded from one of the factory classes:
 * {@link CathFactory},{@link ScopFactory}, etc.
 *
 * @see #getName the name. e.g. 4hhb, 4hhb.A, d4hhba_, PDP:4HHBAa etc.
 */

public class StructureName implements Comparable<StructureName>, Serializable, StructureIdentifier {
    private static final long serialVersionUID = 4021229518711762957L;
    private static final Logger logger = LoggerFactory.getLogger(StructureName.class);

    protected String name;
    protected String pdbId;
    protected String chainId;

    private static final Pattern cathPattern = Pattern.compile("^(?:CATH:)?([0-9][a-z0-9]{3})(\\w)([0-9]{2})$",Pattern.CASE_INSENSITIVE);
    // ds046__ is a special case with no PDB entry
    private static final Pattern scopPattern = Pattern.compile("^(?:SCOP:)?d([0-9][a-z0-9]{3}|s046)(\\w|\\.)(\\w)$",Pattern.CASE_INSENSITIVE);
    // ECOD chains and domains can't be automatically distinguished. Ex: e3j9zS13 is chain 'S1', e1wz2B14 is chain 'B'
    private static final Pattern ecodPattern = Pattern.compile("^(?:ECOD:)?e([0-9][a-z0-9]{3})(?:\\w|\\.)\\w+$",Pattern.CASE_INSENSITIVE);

    // Names are automatically used as prefixes
    public enum Source {
        PDB,
        SCOP,
        PDP,
        CATH,
        URL,
        FILE,
        ECOD,
        BIO,
    }

    private Source mySource = null;

    // cache for getBaseIdentifier() method
    private StructureIdentifier base = null;

    /**
     * Create a new StructureName from the given identifier, which may be a
     * domain name, a substructure identifier, etc.
     * <p>
     * The source and PDB-Id are extracted at construction time, but fully
     * interpreting the ID, which may require additional parsing or remote
     * calls, is done lazily.
     * <p>
     * The following sources are supported. Any may be prefixed by the source
     * name followed by a colon (e.g. PDB:4HHB). In this case, that source will be used
     * unequivocally. If no source is specified, StructureName will make a
     * (usually reliable) guess as to which source was intended.
     * <ul>
     * <li><b>PDB</b> PDB identifier, optionally followed by chain and/or residue
     *     ranges. Internally represented by a {@link SubstructureIdentifier};
     *     see that class for the full format specification.
     *     Examples: 4hhb, 4hhb.A, 4hhb.A:1-50.
     * <li><b>SCOP</b> SCOP domain (or SCOPe, depending on the
     *     {@link ScopFactory#getSCOP()} version). Example: d1h6w.2
     * <li><b>PDP</b> Protein Domain Parser domain. PDP domains are not guessed,
     *     making the PDP: prefix obligatory. Example: PDP:4HHBAa
     * <li><b>CATH</b> Cath domains. Example: 1qvrC03
     * <li><b>URL</b> Arbitrary URLs. Most common protocols are handled,
     *     including http://, ftp://, and file://. Some parsing information can
     *     be passed as custom query parameters. Example:
     *     http://www.rcsb.org/pdb/files/1B8G.pdb.gz
     * <li><b>FILE</b> A file path. Supports relative paths and expands ~ to
     *     the user's home directory. Only existing files will be automatically
     *     detected; to refer to a potentially not-yet existing file, prepend
     *     the prefix. Internally represented as a {@link URLIdentifier}
     *     after path expansion. Example: ~/custom_protein.pdb
     * <li><b>ECOD</b> ECOD domain. Example: e1lyw.1
     * <li><b>BIO</b> Biological assembly. These are not guessed, making
     *     the BIO: prefix obligatory. Example: BIO:2ehz:1
     * </ul>
     * @param name An identifier string
     * @throws IllegalArgumentException if the name has a recognizable source but is semantically invalid
     */
    public StructureName(String name){
        this.name = name;

        init();//sets pdbId and mySource
    }


    /**
     * Tries to determine the source and pdbId without fully realizing the identifier,
     * which could require I/O depending on the source
     * @throws IllegalArgumentException if the source is recognizable but invalid
     */
    private void init(){

        // First try identifying a prefix
        String[] prefix = name.split(":", 2);
        mySource = null;
        if(prefix.length > 1) {
            // Match Source prefixes
            String suffix = prefix[1];
            try {
                // Locale.ROOT keeps the prefix lookup independent of the
                // default locale (e.g. Turkish dotted/dotless 'i').
                mySource = Source.valueOf(prefix[0].toUpperCase(Locale.ROOT));
            } catch( IllegalArgumentException e ) {
                // unrecognized prefix; fall back on guessing
                mySource = null;
            }
            if(mySource != null) {
                switch( mySource) {
                case SCOP:
                    if( ! initFromScop(suffix) )
                        throw new IllegalArgumentException("Malformed SCOP domain name:"+suffix);
                    return;
                case PDP:
                    if( ! initFromPDP(name) )
                        throw new IllegalArgumentException("Malformed PDP domain name:"+suffix);
                    return;
                case CATH:
                    if( ! initFromCATH(suffix) )
                        throw new IllegalArgumentException("Malformed CATH domain name:"+suffix);
                    return;
                case BIO:
                    if( ! initFromBIO(name) )
                        throw new IllegalArgumentException("Malformed BIO name:"+suffix);
                    return;
                case ECOD:
                    if( ! initFromECOD(suffix) )
                        throw new IllegalArgumentException("Malformed ECOD domain name:"+suffix);
                    return;
                case PDB:
                    if( ! initFromPDB(suffix) )
                        throw new IllegalArgumentException("Malformed PDB specification:"+suffix);
                    return;
                case FILE:
                    // Treat file:/ prefixes as URLs
                    if( ! suffix.startsWith("/")) {
                        // Otherwise, treat as file
                        initFromFile();
                        return;
                    }
                    // fall through to URL case
                case URL:
                    if( ! initFromURL(name))
                        throw new IllegalArgumentException("Malformed URL specification:"+suffix);
                    return;
                default:
                    throw new IllegalStateException("Unimplemented Source "+mySource);
                }
            }
        }

        // No known prefix, so revert to guessing

        // First guess regex-based identifiers
        // SCOP domain
        if( initFromScop(name) )
            return;
        // CATH
        if( initFromCATH(name) )
            return;
        // ECOD
        if( initFromECOD(name) )
            return;
        // Never guess BIO or PDP

        // URL
        if( initFromURL(name) )
            return;

        // Guess FILE based on file existence
        File file = new File(FileDownloadUtils.expandUserHome(name));
        if( file.canRead() && !file.isDirectory() ) {
            // an attempt to mitigate issue #398. It doesn't fix it but it catches the most common case of passing a pdb id and finding a file in working dir matching it
            if (name.matches("\\d\\w\\w\\w")) {
                // the plain pdb id case, this is unlikely to be what the user wants: let's let it through but warn about it
                logger.warn("Provided 4-letter structure name '{}' matches "
                        + "file name in directory {}. Will read structure "
                        + "data from file {} and not consider the name as a "
                        + "structure identifier. If this is not what you "
                        + "want, use 'FILE:{}'",
                        name, file.getAbsoluteFile().getParent(),
                        file.getAbsolutePath(), name);
            } else {
                logger.info("Provided structure name '{}' matches "
                        + "file name in directory {}. Will read structure "
                        + "data from file {}.",
                        name, file.getAbsoluteFile().getParent(),
                        file.getAbsolutePath());
            }

            initFromFile();
            return;
        }

        // Default to PDB
        initFromPDB( name );
    }

    /** Matches a SCOP domain name; on success sets source, pdbId and chainId. */
    private boolean initFromScop(String name) {
        Matcher matcher = scopPattern.matcher(name);
        if ( matcher.matches() ) {
            mySource = Source.SCOP;
            pdbId = matcher.group(1).toUpperCase(Locale.ROOT);
            chainId = matcher.group(2);
            return true;
        }
        return false;
    }

    /** Matches a PDP domain name; on success sets pdbId and chainId (source is set by the caller). */
    private boolean initFromPDP(String name) {
        Matcher matcher = PDPDomain.PDP_NAME_PATTERN.matcher(name);
        if( matcher.matches() ) {
            pdbId = matcher.group(1).toUpperCase(Locale.ROOT);
            chainId = matcher.group(2);
            return true;
        }
        return false;
    }

    /** Matches a CATH domain name; on success sets source, pdbId and chainId. */
    private boolean initFromCATH(String name) {
        Matcher matcher = cathPattern.matcher(name);
        if ( matcher.matches() ){
            mySource = Source.CATH;
            pdbId = matcher.group(1).toUpperCase(Locale.ROOT);
            chainId = matcher.group(2);
            return true;
        }
        return false;
    }

    /** Matches an ECOD domain name; on success sets source and pdbId (chainId is left null). */
    private boolean initFromECOD(String name) {
        Matcher matcher = ecodPattern.matcher(name);
        if ( matcher.matches() ){
            mySource = Source.ECOD;
            pdbId = matcher.group(1).toUpperCase(Locale.ROOT);
            chainId = null;
            return true;
        }
        return false;
    }

    /** Matches a biological assembly name; on success sets pdbId (source is set by the caller). */
    private boolean initFromBIO(String name) {
        Matcher matcher = BioAssemblyIdentifier.BIO_NAME_PATTERN.matcher(name);
        if( matcher.matches() ) {
            pdbId = matcher.group(1).toUpperCase(Locale.ROOT);
            return true;
        }
        return false;
    }

    /** Parses the string as a {@link SubstructureIdentifier} and caches it as the base identifier. */
    private boolean initFromPDB(String suffix) {
        mySource = Source.PDB;
        SubstructureIdentifier si = new SubstructureIdentifier(suffix);
        base = si; // Safe to realize immediately

        pdbId = si.getPdbId();
        // Set chainId if unique
        Set<String> chains = getChainIds(si);
        if(chains.size() == 1) {
            this.chainId = chains.iterator().next();
        } else if(chains.size() > 1) {
            this.chainId = ".";
        } else {
            this.chainId = null;
        }
        return true;
    }

    /** Tries to parse the string as a URL; on success sets source and guesses the pdbId from the file name. */
    private boolean initFromURL(String suffix) {
        try {
            URL url = new URL(suffix);
            String path = url.getPath();
            mySource = Source.URL;
            pdbId = URLIdentifier.guessPDBID( path.substring(path.lastIndexOf('/')+1) );
            chainId = null; // Don't bother checking query params here
            return true;
        } catch(MalformedURLException e) {
            return false;
        }
    }

    /** Marks this name as a FILE source; pdbId and chainId are left unset. */
    private boolean initFromFile() {
        mySource = Source.FILE;
        pdbId = null;
        chainId = null;
        return true;
    }

    /** Collects the distinct non-null chain IDs of the identifier's residue ranges, sorted. */
    private static Set<String> getChainIds(SubstructureIdentifier si) {
        Set<String> chains = new TreeSet<String>();
        List<ResidueRange> ranges = si.getResidueRanges();
        for(ResidueRange range : ranges) {
            String chain = range.getChainId();
            if(chain != null) {
                chains.add(chain);
            }
        }
        return chains;
    }

    /**
     * Get the PDB ID for this name, if any.
     *
     * Equivalent to {@link SubstructureIdentifier#getPdbId()
     * toCanonical().getPdbId()}
     * @return The upper-case PDB Name, or null if not applicable
     * @throws StructureException Wraps errors which occur when converting to canonical form
     */
    public String getPdbId() throws StructureException {
        if( pdbId == null) {
            pdbId = toCanonical().getPdbId();
        }
        return pdbId;
    }

    /**
     * Gets the chain ID, for structures where it is unique and well-defined.
     * May return '.' for multi-chain ranges, '_' for wildcard chains, or
     * null if the information is unavailable.
     *
     * <p>This method should only be used casually. For precise chainIds, it
     * is better to use {@link #toCanonical()} and iterate through the
     * residue ranges.
     * @return the chain ID, or null if the information is unavailable
     */
    public String getChainId() {
        return chainId;
    }

    /**
     *
     * @return the identifier string
     * @deprecated use {@link #getIdentifier()}
     */
    @Deprecated
    public String getName(){

        return getIdentifier();
    }

    /**
     * Get the original form of the identifier
     */
    @Override
    public String getIdentifier() {
        return name;
    }

    @Override
    public String toString(){

        return name;
    }


    public boolean isScopName() {
        return mySource == Source.SCOP;
    }

    public boolean isPDPDomain(){
        return mySource == Source.PDP;
    }

    public boolean isCathID(){
        return mySource == Source.CATH;
    }

    public boolean isPdbId(){
        return mySource == Source.PDB;
    }

    public boolean isURL() {
        return mySource == Source.URL;
    }

    /**
     * Indicates that the identifier was determined to correspond to a file.
     * Note that some file identifiers may also be valid URLs; in that case,
     * the URL source is preferred.
     * @return true if the source of this name is {@link Source#FILE}
     */
    public boolean isFile() {
        return mySource == Source.FILE;
    }

    public boolean isEcodDomain() {
        return mySource == Source.ECOD;
    }

    public boolean isBioAssembly() {
        return mySource == Source.BIO;
    }

    public Source getSource() {
        return mySource;
    }

    /**
     * StructureName wraps another StructureIdentifier. The type of the base
     * identifier depends on the {@link #getSource() source}. Most StructureName
     * methods delegate to the base identifier.
     *
     * <p>It is possible that future versions of StructureName might change the
     * return type. Except for some specialized uses, it is probably better
     * to create the correct type of identifier directly, rather than creating
     * a StructureName and casting the result of this method.
     * @return A StructureIdentifier whose concrete type depends on the source
     * @throws StructureException Wraps exceptions that may be thrown by
     *  individual implementations. For example, a SCOP identifier may require
     *  that the domain definitions be available for download.
     */
    public StructureIdentifier getBaseIdentifier() throws StructureException {
        if( base == null ) {

            switch(mySource) {
            case CATH:
                base = CathFactory.getCathDatabase().getDescriptionByCathId(getIdentifier());
                break;
            case ECOD:
                try {
                    base = EcodFactory.getEcodDatabase().getDomainsById(name);
                } catch (IOException e) {
                    throw new StructureException("Unable to get ECOD domain "+name,e);
                }
                break;
            case SCOP:
                // Fuzzy matching of the domain name to the current default factory
                base = guessScopDomain(getIdentifier(),ScopFactory.getSCOP());
                if(base == null) {
                    // Guessing didn't work, so just use the PDBID and Chain from name
                    // Guess that '_' means 'whole structure'
                    if (chainId.equals("_")) {
                        base = new SubstructureIdentifier(pdbId);
                    } else {
                        base = new SubstructureIdentifier(pdbId,ResidueRange.parseMultiple(chainId));
                    }
                    logger.error("Unable to find {}, so using {}",name,base);
                }
                break;
            case FILE:
                try {
                    // Strip an optional "FILE:" style prefix before expanding the path
                    String[] prefix = name.split(":", 2);
                    String filename;
                    if(prefix.length > 1) {
                        filename = prefix[1];
                    } else {
                        filename = name;
                    }
                    filename = FileDownloadUtils.expandUserHome(filename);
                    base = new URLIdentifier(new File(filename).toURI().toURL());
                } catch (MalformedURLException e) {
                    // Should never happen
                    throw new StructureException("Unable to get URL for file: "+name,e);
                }
                break;
            case URL:
                try {
                    base = new URLIdentifier(name);
                } catch (MalformedURLException e) {
                    throw new StructureException("Invalid URL: "+name,e);
                }
                break;
            case PDP:
                try {
                    PDPProvider provider = new RemotePDPProvider(false);
                    base = provider.getPDPDomain(name);
                } catch (IOException e) {
                    throw new StructureException("Unable to fetch PDP domain "+name, e);
                }
                break;
            case BIO:
                base = new BioAssemblyIdentifier(name);
                break;
            case PDB:
                base = new SubstructureIdentifier(getIdentifier());
                break;
            default:
                throw new IllegalStateException("Unimplemented source: "+mySource);
            }
        }
        return base;
    }

    @Override
    public SubstructureIdentifier toCanonical() throws StructureException {
        return getBaseIdentifier().toCanonical();
    }

    @Override
    public Structure reduce(Structure input) throws StructureException {
        return getBaseIdentifier().reduce(input);
    }

    @Override
    public Structure loadStructure(AtomCache cache) throws StructureException,
            IOException {
        return getBaseIdentifier().loadStructure(cache);
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + ((name == null) ? 0 : name.hashCode());
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        StructureName other = (StructureName) obj;
        if (name == null) {
            if (other.name != null)
                return false;
        } else if (!name.equals(other.name))
            return false;
        return true;
    }

    /**
     * Orders identifiers lexicographically by PDB ID and then full Identifier.
     * Names whose PDB ID is null or cannot be determined sort after those
     * with a PDB ID.
     *
     * @param o the name to compare against
     * @return a negative integer, zero, or a positive integer as this name
     *  sorts before, equal to, or after {@code o}
     */
    @Override
    public int compareTo(StructureName o) {
        if ( this.equals(o))
            return 0;

        String pdb1 = null;
        String pdb2 = null;
        try {
            pdb1 = this.getPdbId();
        } catch (StructureException ignored) {
            // Best effort: an unresolvable PDB ID is treated as absent (null)
        }
        try {
            // Must query the other name; comparing this with itself would
            // make the PDB ID ordering a no-op.
            pdb2 = o.getPdbId();
        } catch (StructureException ignored) {
            // Best effort: an unresolvable PDB ID is treated as absent (null)
        }

        int comp = 0;

        // Sort those with PDBIDs before those without
        if( pdb1 == null ) {
            if( pdb2 != null) {
                return 1; // this > o
            }
            // both null
        } else if( pdb2 == null){
            return -1; // this < o
        } else {
            // neither null
            comp = pdb1.compareTo(pdb2);
        }
        if( comp != 0 ) {
            return comp;
        }

        // break tie with full identifiers
        pdb1 = this.getIdentifier();
        pdb2 = o.getIdentifier();

        // Throws NPE for nulls
        return pdb1.compareTo(pdb2);
    }

    /**
     * <p>
     * Guess a scop domain. If an exact match is found, return that.
     *
     * <p>
     * Otherwise, return the first scop domain found for the specified protein such that
     * <ul>
     * <li>The chains match, or one of the chains is '_' or '.'.
     * <li>The domains match, or one of the domains is '_'.
     * </ul>
     *
     * In some cases there may be several valid matches. In this case a warning
     * will be logged.
     *
     * @param name SCOP domain name, or a guess thereof
     * @param scopDB SCOP domain provider
     * @return The best match for name among the domains of scopDB, or null if none match.
     */
    public static ScopDomain guessScopDomain(String name, ScopDatabase scopDB) {
        List<ScopDomain> matches = new LinkedList<ScopDomain>();

        // Try exact match first
        ScopDomain domain = scopDB.getDomainByScopID(name);
        if (domain != null) {
            return domain;
        }

        // Didn't work. Guess it!
        logger.warn("Warning, could not find SCOP domain: {}", name);

        Matcher scopMatch = scopPattern.matcher(name);
        if (scopMatch.matches()) {
            String pdbID = scopMatch.group(1);
            String chainID = scopMatch.group(2);
            String domainID = scopMatch.group(3);

            for (ScopDomain potentialSCOP : scopDB.getDomainsForPDB(pdbID)) {
                Matcher potMatch = scopPattern.matcher(potentialSCOP.getScopId());
                if (potMatch.matches()) {
                    if (chainID.equals(potMatch.group(2)) || chainID.equals("_") || chainID.equals(".")
                            || potMatch.group(2).equals("_") || potMatch.group(2).equals(".")) {
                        if (domainID.equals(potMatch.group(3)) || domainID.equals("_") || potMatch.group(3).equals("_")) {
                            // Match, or near match
                            matches.add(potentialSCOP);
                        }
                    }
                }
            }
        }

        Iterator<ScopDomain> match = matches.iterator();
        if (match.hasNext()) {
            ScopDomain bestMatch = match.next();
            if(logger.isWarnEnabled()) {
                StringBuilder warnMsg = new StringBuilder();
                warnMsg.append("Trying domain ").append(bestMatch.getScopId()).append(".");
                if (match.hasNext()) {
                    warnMsg.append(" Other possibilities: ");
                    while (match.hasNext()) {
                        warnMsg.append(match.next().getScopId()).append(" ");
                    }
                }
                warnMsg.append(System.getProperty("line.separator"));
                logger.warn(warnMsg.toString());
            }
            return bestMatch;
        } else {
            return null;
        }
    }



}