001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.structure.align.client; 022 023 024import java.io.File; 025import java.io.IOException; 026import java.io.Serializable; 027import java.net.MalformedURLException; 028import java.net.URL; 029import java.util.Iterator; 030import java.util.LinkedList; 031import java.util.List; 032import java.util.Set; 033import java.util.TreeSet; 034import java.util.regex.Matcher; 035import java.util.regex.Pattern; 036 037import org.biojava.nbio.structure.BioAssemblyIdentifier; 038import org.biojava.nbio.structure.ResidueRange; 039import org.biojava.nbio.structure.Structure; 040import org.biojava.nbio.structure.StructureException; 041import org.biojava.nbio.structure.StructureIdentifier; 042import org.biojava.nbio.structure.SubstructureIdentifier; 043import org.biojava.nbio.structure.URLIdentifier; 044import org.biojava.nbio.structure.align.util.AtomCache; 045import org.biojava.nbio.structure.cath.CathDomain; 046import org.biojava.nbio.structure.cath.CathFactory; 047import org.biojava.nbio.structure.domain.PDPDomain; 048import org.biojava.nbio.structure.domain.PDPProvider; 049import org.biojava.nbio.structure.domain.RemotePDPProvider; 050import org.biojava.nbio.structure.ecod.EcodFactory; 051import org.biojava.nbio.core.util.FileDownloadUtils; 052import org.biojava.nbio.structure.scop.ScopDatabase; 053import org.biojava.nbio.structure.scop.ScopDomain; 054import org.biojava.nbio.structure.scop.ScopFactory; 055import org.slf4j.Logger; 056import org.slf4j.LoggerFactory; 057 058 059/** 060 * A utility class that makes working with names of structures, domains and ranges easier. 061 * 062 * Accepts a wide range of identifier formats, including {@link ScopDomain}, 063 * {@link CathDomain}, PDP domains, and {@link SubstructureIdentifier} residue 064 * ranges. 065 * 066 * Where possible, data is extracted from the input string. Otherwise, range 067 * information may be loaded from one of the factory classes: 068 * {@link CathFactory},{@link ScopFactory}, etc. 069 * 070 * @see #getName the name. e.g. 4hhb, 4hhb.A, d4hhba_, PDP:4HHBAa etc. 071 */ 072 073public class StructureName implements Comparable<StructureName>, Serializable, StructureIdentifier { 074 private static final long serialVersionUID = 4021229518711762957L; 075 private static final Logger logger = LoggerFactory.getLogger(StructureName.class); 076 077 protected String name; 078 protected String pdbId; 079 protected String chainName; 080 081 private static final Pattern cathPattern = Pattern.compile("^(?:CATH:)?([0-9][a-z0-9]{3})(\\w)([0-9]{2})$",Pattern.CASE_INSENSITIVE); 082 // ds046__ is a special case with no PDB entry 083 private static final Pattern scopPattern = Pattern.compile("^(?:SCOP:)?d([0-9][a-z0-9]{3}|s046)(\\w|\\.)(\\w)$",Pattern.CASE_INSENSITIVE); 084 // ECOD chains and domains can't be automatically distinguished. Ex: e3j9zS13 is chain 'S1', e1wz2B14 is chain 'B' 085 private static final Pattern ecodPattern = Pattern.compile("^(?:ECOD:)?e([0-9][a-z0-9]{3})(?:\\w|\\.)\\w+$",Pattern.CASE_INSENSITIVE); 086 087 // Names are automatically used as prefixes 088 public enum Source { 089 PDB, 090 SCOP, 091 PDP, 092 CATH, 093 URL, 094 FILE, 095 ECOD, 096 BIO, 097 }; 098 099 private Source mySource = null; 100 101 // cache for getBaseIdentifier() method 102 private StructureIdentifier base = null; 103 104 /** 105 * Create a new StructureName from the given identifier, which may be a 106 * domain name, a substructure identifier, etc. 107 * <p> 108 * The source and PDB-Id are extracted at compile time, but fully 109 * interpreting the ID, which may require additional parsing or remote 110 * calls, is done lazily. 111 * <p> 112 * The following sources are supported. Any may be prefixed by the source 113 * name followed by a colon (e.g. PDB:4HHB). In this case, that source will be used 114 * unequivocally. If no source is specified, StructureName will make a 115 * (usually reliable) guess as to which source was intended. 116 * <ul> 117 * <li><b>PDB</b>PDB identifier, optionally followed by chain and/or residue 118 * ranges. Internally represented by a {@link SubstructureIdentifier}; 119 * see that class for the full format specification. 120 * Examples: 4hhb, 4hhb.A, 4hhb.A:1-50. 121 * <li><b>SCOP</b> SCOP domain (or SCOPe, depending on the 122 * {@link ScopFactory#getSCOP()} version). Example: d1h6w.2 123 * <li><b>PDP</b> Protein Domain Parser domain. PDP domains are not guessed, 124 * making the PDP: prefix obligatory. Example: PDP:4HHBAa 125 * <li><b>CATH</b> Cath domains. Example: 1qvrC03 126 * <li><b>URL</b> Arbitrary URLs. Most common protocols are handled, 127 * including http://, ftp://, and file://. Some parsing information can 128 * be passed as custom query parameters. Example: 129 * http://www.rcsb.org/pdb/files/1B8G.pdb.gz 130 * <li><b>FILE</b> A file path. Supports relative paths and expands ~ to 131 * the user's home directory. Only existing files will be automatically 132 * detected; to refer to a potentially not-yet existing file, prepend 133 * the prefix. Internally represented as a {@link URLIdentifier} 134 * after path expansion. Example: ~/custom_protein.pdb 135 * <li><b>ECOD</b> ECOD domain. Example: e1lyw.1 136 * <li><b>BIO</b> Biological assembly. These are not guessed, making 137 * the BIO: prefix obligatory. Example: BIO:2ehz:1 138 * </ul> 139 * @param name An identifier string 140 * @throws IllegalArgumentException if the name has a recognizable source but is semantically invalid 141 */ 142 public StructureName(String name){ 143 this.name = name; 144 145 init();//sets pdbId and mySource 146 } 147 148 149 /** 150 * Tries to determine the source and pdbId without fully realizing the identifier, 151 * which could require I/O depending on the source 152 * @throws IllegalArgumentException if the source is recognizable but invalid 153 */ 154 private void init(){ 155 156 // First try identifying a prefix 157 String[] prefix = name.split(":", 2); 158 mySource = null; 159 if(prefix.length > 1) { 160 // Match Source prefixes 161 String suffix = prefix[1]; 162 try { 163 mySource = Source.valueOf(prefix[0].toUpperCase()); 164 } catch( IllegalArgumentException e ) { 165 // unrecognized prefix; fall back on guessing 166 mySource = null; 167 } 168 if(mySource != null) { 169 switch( mySource) { 170 case SCOP: 171 if( ! initFromScop(suffix) ) 172 throw new IllegalArgumentException("Malformed SCOP domain name:"+suffix); 173 return; 174 case PDP: 175 if( ! initFromPDP(name) ) 176 throw new IllegalArgumentException("Malformed PDP domain name:"+suffix); 177 return; 178 case CATH: 179 if( ! initFromCATH(suffix) ) 180 throw new IllegalArgumentException("Malformed CATH domain name:"+suffix); 181 return; 182 case BIO: 183 if( ! initFromBIO(name) ) 184 throw new IllegalArgumentException("Malformed BIO name:"+suffix); 185 return; 186 case ECOD: 187 if( ! initFromECOD(suffix) ) 188 throw new IllegalArgumentException("Malformed ECOD domain name:"+suffix); 189 return; 190 case PDB: 191 if( ! initFromPDB(suffix) ) 192 throw new IllegalArgumentException("Malformed PDB specification:"+suffix); 193 return; 194 case FILE: 195 // Treat file:/ prefixes as URLs 196 if( ! suffix.startsWith("/")) { 197 // Otherwise, treat as file 198 initFromFile(); 199 return; 200 } 201 // fall through to URL case 202 case URL: 203 if( ! initFromURL(name)) 204 throw new IllegalArgumentException("Malformed URL specification:"+suffix); 205 return; 206 default: 207 throw new IllegalStateException("Unimplemented Source "+mySource); 208 } 209 } 210 } 211 212 // No known prefix, so revert to guessing 213 214 // First guess regex-based identifiers 215 // SCOP domain 216 if( initFromScop(name) ) 217 return; 218 // CATH 219 if( initFromCATH(name) ) 220 return; 221 // ECOD 222 if( initFromECOD(name) ) 223 return; 224 // Never guess BIO or PDP 225 226 // URL 227 if( initFromURL(name) ) 228 return; 229 230 // Guess FILE based on file existence 231 File file = new File(FileDownloadUtils.expandUserHome(name)); 232 if( file.canRead() && !file.isDirectory() ) { 233 // an attempt to mitigate issue #398. It doesn't fix it but it catches the most common case of passing a pdb id and finding a file in working dir matching it 234 if (name.matches("\\d\\w\\w\\w")) { 235 // the plain pdb id case, this is unlikely to be what the user wants: let's let it through but warn about it 236 logger.warn("Provided 4-letter structure name '{}' matches " 237 + "file name in directory {}. Will read structure " 238 + "data from file {} and not consider the name as a " 239 + "structure identifier. If this is not what you " 240 + "want, use 'FILE:{}'", 241 name, file.getAbsoluteFile().getParent(), 242 file.getAbsolutePath(), name); 243 } else { 244 logger.info("Provided structure name '{}' matches " 245 + "file name in directory {}. Will read structure " 246 + "data from file {}.", 247 name, file.getAbsoluteFile().getParent(), 248 file.getAbsolutePath()); 249 } 250 251 initFromFile(); 252 return; 253 } 254 255 // Default to PDB 256 initFromPDB( name ); 257 } 258 259 private boolean initFromScop(String name) { 260 Matcher matcher = scopPattern.matcher(name); 261 if ( matcher.matches() ) { 262 mySource = Source.SCOP; 263 pdbId = matcher.group(1).toUpperCase(); 264 chainName = matcher.group(2); 265 return true; 266 } 267 return false; 268 } 269 private boolean initFromPDP(String name) { 270 Matcher matcher = PDPDomain.PDP_NAME_PATTERN.matcher(name); 271 if( matcher.matches() ) { 272 pdbId = matcher.group(1).toUpperCase(); 273 chainName = matcher.group(2); 274 return true; 275 } 276 return false; 277 } 278 private boolean initFromCATH(String name) { 279 Matcher matcher = cathPattern.matcher(name); 280 if ( matcher.matches() ){ 281 mySource = Source.CATH; 282 pdbId = matcher.group(1).toUpperCase(); 283 chainName = matcher.group(2); 284 return true; 285 } 286 return false; 287 } 288 private boolean initFromECOD(String name) { 289 Matcher matcher = ecodPattern.matcher(name); 290 if ( matcher.matches() ){ 291 mySource = Source.ECOD; 292 pdbId = matcher.group(1).toUpperCase(); 293 chainName = null; 294 return true; 295 } 296 return false; 297 } 298 private boolean initFromBIO(String name) { 299 Matcher matcher = BioAssemblyIdentifier.BIO_NAME_PATTERN.matcher(name); 300 if( matcher.matches() ) { 301 pdbId = matcher.group(1).toUpperCase(); 302 return true; 303 } 304 return false; 305 } 306 private boolean initFromPDB(String suffix) { 307 mySource = Source.PDB; 308 SubstructureIdentifier si = new SubstructureIdentifier(suffix); 309 base = si; // Safe to realize immediately 310 311 pdbId = si.getPdbId(); 312 // Set chainName if unique 313 Set<String> chains = getChainNames(si); 314 if(chains.size() == 1) { 315 this.chainName = chains.iterator().next(); 316 } else if(chains.size() > 1) { 317 this.chainName = "."; 318 } else { 319 this.chainName = null; 320 } 321 return true; 322 } 323 private boolean initFromURL(String suffix) { 324 try { 325 URL url = new URL(suffix); 326 String path = url.getPath(); 327 mySource = Source.URL; 328 pdbId = URLIdentifier.guessPDBID( path.substring(path.lastIndexOf('/')+1) ); 329 chainName = null; // Don't bother checking query params here 330 return true; 331 } catch(MalformedURLException e) { 332 return false; 333 } 334 } 335 private boolean initFromFile() { 336 mySource = Source.FILE; 337 pdbId = null; 338 chainName = null; 339 return true; 340 } 341 342 private static Set<String> getChainNames(SubstructureIdentifier si) { 343 Set<String> chains = new TreeSet<String>(); 344 List<ResidueRange> ranges = si.getResidueRanges(); 345 for(ResidueRange range : ranges) { 346 String chainName = range.getChainName(); 347 if(chainName != null) { 348 chains.add(chainName); 349 } 350 } 351 return chains; 352 } 353 354 /** 355 * Get the PDB ID for this name, if any. 356 * 357 * Equivalent to {@link SubstructureIdentifier#getPdbId() 358 * toCanonical().getPdbId()} 359 * @return The upper-case PDB Name, or null if not applicable 360 * @throws StructureException Wraps errors which occur when converting to canonical form 361 */ 362 public String getPdbId() throws StructureException { 363 if( pdbId == null) { 364 pdbId = toCanonical().getPdbId(); 365 } 366 return pdbId; 367 } 368 369 /** 370 * Gets the chain ID, for structures where it is unique and well-defined. 371 * May return '.' for multi-chain ranges, '_' for wildcard chains, or 372 * null if the information is unavailable. 373 * 374 * <p>This method should only be used casually. For precise chainIds, it 375 * is better to use {@link #toCanonical()} and iterate through the 376 * residue ranges. 377 * @return 378 */ 379 public String getChainId() { 380 return chainName; 381 } 382 383 /** 384 * Get the original form of the identifier 385 */ 386 @Override 387 public String getIdentifier() { 388 return name; 389 } 390 391 @Override 392 public String toString(){ 393 394 return name; 395 } 396 397 398 public boolean isScopName() { 399 return mySource == Source.SCOP; 400 } 401 402 public boolean isPDPDomain(){ 403 return mySource == Source.PDP; 404 } 405 406 public boolean isCathID(){ 407 return mySource == Source.CATH; 408 } 409 410 public boolean isPdbId(){ 411 return mySource == Source.PDB; 412 } 413 414 public boolean isURL() { 415 return mySource == Source.URL; 416 } 417 418 /** 419 * Indicates that the identifier was determined to correspond to a file. 420 * Note that some file identifiers may also be valid URLs; in that case, 421 * the URL source is preferred. 422 * @return 423 */ 424 public boolean isFile() { 425 return mySource == Source.FILE; 426 } 427 428 public boolean isEcodDomain() { 429 return mySource == Source.ECOD; 430 } 431 432 public boolean isBioAssembly() { 433 return mySource == Source.BIO; 434 } 435 436 public Source getSource() { 437 return mySource; 438 } 439 440 /** 441 * StructureName wraps another StructureIdentifier. The type of the base 442 * identifier depends on the {@link #getSource() source}. Most StructureName 443 * methods deligate to the base identifier. 444 * 445 * <p>It is possible that future versions of StructureName might change the 446 * return type. Except for some specialized uses, it is probably better 447 * to create the correct type of identifier directly, rather than creating 448 * a StructureName and casting the result of this method. 449 * @return A Str 450 * @throws StructureException Wraps exceptions that may be thrown by 451 * individual implementations. For example, a SCOP identifier may require 452 * that the domain definitions be available for download. 453 */ 454 public StructureIdentifier getBaseIdentifier() throws StructureException { 455 if( base == null ) { 456 457 switch(mySource) { 458 case CATH: 459 base = CathFactory.getCathDatabase().getDescriptionByCathId(getIdentifier()); 460 break; 461 case ECOD: 462 try { 463 base = EcodFactory.getEcodDatabase().getDomainsById(name); 464 } catch (IOException e) { 465 throw new StructureException("Unable to get ECOD domain "+name,e); 466 } 467 break; 468 case SCOP: 469 // Fuzzy matching of the domain name to the current default factory 470 base = guessScopDomain(getIdentifier(),ScopFactory.getSCOP()); 471 if(base == null) { 472 // Guessing didn't work, so just use the PDBID and Chain from name 473 // Guess that '_' means 'whole structure' 474 if (chainName.equals("_")) { 475 base = new SubstructureIdentifier(pdbId); 476 } else { 477 base = new SubstructureIdentifier(pdbId,ResidueRange.parseMultiple(chainName)); 478 } 479 logger.error("Unable to find {}, so using {}",name,base); 480 } 481 break; 482 case FILE: 483 try { 484 String[] prefix = name.split(":", 2); 485 String filename; 486 if(prefix.length > 1) { 487 filename = prefix[1]; 488 } else { 489 filename = name; 490 } 491 filename = FileDownloadUtils.expandUserHome(filename); 492 base = new URLIdentifier(new File(filename).toURI().toURL()); 493 } catch (MalformedURLException e) { 494 // Should never happen 495 throw new StructureException("Unable to get URL for file: "+name,e); 496 } 497 break; 498 case URL: 499 try { 500 base = new URLIdentifier(name); 501 } catch (MalformedURLException e) { 502 throw new StructureException("Invalid URL: "+name,e); 503 } 504 break; 505 case PDP: 506 try { 507 PDPProvider provider = new RemotePDPProvider(false); 508 base = provider.getPDPDomain(name); 509 } catch (IOException e) { 510 throw new StructureException("Unable to fetch PDP domain "+name, e); 511 } 512 break; 513 case BIO: 514 base = new BioAssemblyIdentifier(name); 515 break; 516 case PDB: 517 base = new SubstructureIdentifier(getIdentifier()); 518 break; 519 default: 520 throw new IllegalStateException("Unimplemented source: "+mySource); 521 } 522 } 523 return base; 524 } 525 526 @Override 527 public SubstructureIdentifier toCanonical() throws StructureException { 528 return getBaseIdentifier().toCanonical(); 529 } 530 531 @Override 532 public Structure reduce(Structure input) throws StructureException { 533 return getBaseIdentifier().reduce(input); 534 } 535 536 @Override 537 public Structure loadStructure(AtomCache cache) throws StructureException, 538 IOException { 539 return getBaseIdentifier().loadStructure(cache); 540 } 541 542 @Override 543 public int hashCode() { 544 final int prime = 31; 545 int result = 1; 546 result = prime * result + ((name == null) ? 0 : name.hashCode()); 547 return result; 548 } 549 550 @Override 551 public boolean equals(Object obj) { 552 if (this == obj) 553 return true; 554 if (obj == null) 555 return false; 556 if (getClass() != obj.getClass()) 557 return false; 558 StructureName other = (StructureName) obj; 559 if (name == null) { 560 if (other.name != null) 561 return false; 562 } else if (!name.equals(other.name)) 563 return false; 564 return true; 565 } 566 567 /** 568 * Orders identifiers lexicographically by PDB ID and then full Identifier 569 */ 570 @Override 571 public int compareTo(StructureName o) { 572 if ( this.equals(o)) 573 return 0; 574 575 String pdb1 = null; 576 String pdb2 = null; 577 try { 578 pdb1 = this.getPdbId(); 579 } catch (StructureException e) {} 580 try { 581 pdb2 = this.getPdbId(); 582 } catch (StructureException e) {} 583 584 int comp = 0; 585 586 // Sort those with PDBIDs before those without 587 if( pdb1 == null ) { 588 if( pdb2 != null) { 589 return 1; // this > o 590 } 591 // both null 592 } else if( pdb2 == null){ 593 return -1; // this < o 594 } else { 595 // neither null 596 comp = pdb1.compareTo(pdb2); 597 } 598 if( comp != 0 ) { 599 return comp; 600 } 601 602 // break tie with full identifiers 603 pdb1 = this.getIdentifier(); 604 pdb2 = o.getIdentifier(); 605 606 // Throws NPE for nulls 607 return pdb1.compareTo(pdb2); 608 } 609 610 /** 611 * <p> 612 * Guess a scop domain. If an exact match is found, return that. 613 * 614 * <p> 615 * Otherwise, return the first scop domain found for the specified protein such that 616 * <ul> 617 * <li>The chains match, or one of the chains is '_' or '.'. 618 * <li>The domains match, or one of the domains is '_'. 619 * </ul> 620 * 621 * In some cases there may be several valid matches. In this case a warning 622 * will be logged. 623 * 624 * @param name SCOP domain name, or a guess thereof 625 * @param scopDB SCOP domain provider 626 * @return The best match for name among the domains of scopDB, or null if none match. 627 */ 628 public static ScopDomain guessScopDomain(String name, ScopDatabase scopDB) { 629 List<ScopDomain> matches = new LinkedList<ScopDomain>(); 630 631 // Try exact match first 632 ScopDomain domain = scopDB.getDomainByScopID(name); 633 if (domain != null) { 634 return domain; 635 } 636 637 // Didn't work. Guess it! 638 logger.warn("Warning, could not find SCOP domain: " + name); 639 640 Matcher scopMatch = scopPattern.matcher(name); 641 if (scopMatch.matches()) { 642 String pdbID = scopMatch.group(1); 643 String chainName = scopMatch.group(2); 644 String domainID = scopMatch.group(3); 645 646 for (ScopDomain potentialSCOP : scopDB.getDomainsForPDB(pdbID)) { 647 Matcher potMatch = scopPattern.matcher(potentialSCOP.getScopId()); 648 if (potMatch.matches()) { 649 if (chainName.equals(potMatch.group(2)) || chainName.equals("_") || chainName.equals(".") 650 || potMatch.group(2).equals("_") || potMatch.group(2).equals(".")) { 651 if (domainID.equals(potMatch.group(3)) || domainID.equals("_") || potMatch.group(3).equals("_")) { 652 // Match, or near match 653 matches.add(potentialSCOP); 654 } 655 } 656 } 657 } 658 } 659 660 Iterator<ScopDomain> match = matches.iterator(); 661 if (match.hasNext()) { 662 ScopDomain bestMatch = match.next(); 663 if(logger.isWarnEnabled()) { 664 StringBuilder warnMsg = new StringBuilder(); 665 warnMsg.append("Trying domain " + bestMatch.getScopId() + "."); 666 if (match.hasNext()) { 667 warnMsg.append(" Other possibilities: "); 668 while (match.hasNext()) { 669 warnMsg.append(match.next().getScopId()).append(" "); 670 } 671 } 672 warnMsg.append(System.getProperty("line.separator")); 673 logger.warn(warnMsg.toString()); 674 } 675 return bestMatch; 676 } else { 677 return null; 678 } 679 } 680 681 682 683}