001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.structure.align.client; 022 023 024import java.io.File; 025import java.io.IOException; 026import java.io.Serializable; 027import java.net.MalformedURLException; 028import java.net.URL; 029import java.util.Iterator; 030import java.util.LinkedList; 031import java.util.List; 032import java.util.Set; 033import java.util.TreeSet; 034import java.util.regex.Matcher; 035import java.util.regex.Pattern; 036 037import org.biojava.nbio.structure.BioAssemblyIdentifier; 038import org.biojava.nbio.structure.PdbId; 039import org.biojava.nbio.structure.ResidueRange; 040import org.biojava.nbio.structure.Structure; 041import org.biojava.nbio.structure.StructureException; 042import org.biojava.nbio.structure.StructureIdentifier; 043import org.biojava.nbio.structure.SubstructureIdentifier; 044import org.biojava.nbio.structure.URLIdentifier; 045import org.biojava.nbio.structure.align.util.AtomCache; 046import org.biojava.nbio.structure.cath.CathDomain; 047import org.biojava.nbio.structure.cath.CathFactory; 048import org.biojava.nbio.structure.ecod.EcodFactory; 049import org.biojava.nbio.core.util.FileDownloadUtils; 050import org.biojava.nbio.structure.scop.ScopDatabase; 051import org.biojava.nbio.structure.scop.ScopDomain; 052import org.biojava.nbio.structure.scop.ScopFactory; 053import org.slf4j.Logger; 054import org.slf4j.LoggerFactory; 055 056 057/** 058 * A utility class that makes working with names of structures, domains and ranges easier. 059 * 060 * Accepts a wide range of identifier formats, including {@link ScopDomain}, 061 * {@link CathDomain}, PDP domains, and {@link SubstructureIdentifier} residue 062 * ranges. 063 * 064 * Where possible, data is extracted from the input string. Otherwise, range 065 * information may be loaded from one of the factory classes: 066 * {@link CathFactory},{@link ScopFactory}, etc. 067 * 068 * @see #getIdentifier() the name. e.g. 4hhb, 4hhb.A, d4hhba_ etc. 069 */ 070 071public class StructureName implements Comparable<StructureName>, Serializable, StructureIdentifier { 072 private static final long serialVersionUID = 4021229518711762957L; 073 private static final Logger logger = LoggerFactory.getLogger(StructureName.class); 074 075 protected String name; 076 protected PdbId pdbId; 077 protected String chainName; 078 079 //TODO Double check all of the modified patterns 080 private static final Pattern cathPattern = Pattern.compile("^(?:CATH:)?([0-9][a-z0-9]{3})(\\w)([0-9]{2})$",Pattern.CASE_INSENSITIVE); 081 // ds046__ is a special case with no PDB entry 082 private static final Pattern scopPattern = Pattern.compile("^(?:SCOP:)?d([0-9][a-z0-9]{3}|s046)(\\w|\\.)(\\w)$",Pattern.CASE_INSENSITIVE); 083 // ECOD chains and domains can't be automatically distinguished. Ex: e3j9zS13 is chain 'S1', e1wz2B14 is chain 'B' 084 private static final Pattern ecodPattern = Pattern.compile("^(?:ECOD:)?e([0-9][a-z0-9]{3})(?:\\w|\\.)\\w+$",Pattern.CASE_INSENSITIVE); 085 086 // Names are automatically used as prefixes 087 public enum Source { 088 PDB, 089 SCOP, 090 CATH, 091 URL, 092 FILE, 093 ECOD, 094 BIO, 095 }; 096 097 private Source mySource = null; 098 099 // cache for getBaseIdentifier() method 100 private StructureIdentifier base = null; 101 102 /** 103 * Create a new StructureName from the given identifier, which may be a 104 * domain name, a substructure identifier, etc. 105 * <p> 106 * The source and PDB-Id are extracted at compile time, but fully 107 * interpreting the ID, which may require additional parsing or remote 108 * calls, is done lazily. 109 * <p> 110 * The following sources are supported. Any may be prefixed by the source 111 * name followed by a colon (e.g. PDB:4HHB). In this case, that source will be used 112 * unequivocally. If no source is specified, StructureName will make a 113 * (usually reliable) guess as to which source was intended. 114 * <ul> 115 * <li><b>PDB</b>PDB identifier, optionally followed by chain and/or residue 116 * ranges. Internally represented by a {@link SubstructureIdentifier}; 117 * see that class for the full format specification. 118 * Examples: 4hhb, 4hhb.A, 4hhb.A:1-50. 119 * <li><b>SCOP</b> SCOP domain (or SCOPe, depending on the 120 * {@link ScopFactory#getSCOP()} version). Example: d1h6w.2 121 * <li><b>CATH</b> Cath domains. Example: 1qvrC03 122 * <li><b>URL</b> Arbitrary URLs. Most common protocols are handled, 123 * including http://, ftp://, and file://. Some parsing information can 124 * be passed as custom query parameters. Example: 125 * http://www.rcsb.org/pdb/files/1B8G.pdb.gz 126 * <li><b>FILE</b> A file path. Supports relative paths and expands ~ to 127 * the user's home directory. Only existing files will be automatically 128 * detected; to refer to a potentially not-yet existing file, prepend 129 * the prefix. Internally represented as a {@link URLIdentifier} 130 * after path expansion. Example: ~/custom_protein.pdb 131 * <li><b>ECOD</b> ECOD domain. Example: e1lyw.1 132 * <li><b>BIO</b> Biological assembly. These are not guessed, making 133 * the BIO: prefix obligatory. Example: BIO:2ehz:1 134 * </ul> 135 * @param name An identifier string 136 * @throws IllegalArgumentException if the name has a recognizable source but is semantically invalid 137 */ 138 public StructureName(String name){ 139 this.name = name; 140 141 init();//sets pdbId and mySource 142 } 143 144 145 /** 146 * Tries to determine the source and pdbId without fully realizing the identifier, 147 * which could require I/O depending on the source 148 * @throws IllegalArgumentException if the source is recognizable but invalid 149 */ 150 private void init(){ 151 152 // First try identifying a prefix 153 String[] prefix = name.split(":", 2); 154 mySource = null; 155 if(prefix.length > 1) { 156 // Match Source prefixes 157 String suffix = prefix[1]; 158 try { 159 mySource = Source.valueOf(prefix[0].toUpperCase()); 160 } catch( IllegalArgumentException e ) { 161 // unrecognized prefix; fall back on guessing 162 mySource = null; 163 } 164 if(mySource != null) { 165 switch( mySource) { 166 case SCOP: 167 if( ! initFromScop(suffix) ) 168 throw new IllegalArgumentException("Malformed SCOP domain name:"+suffix); 169 return; 170 case CATH: 171 if( ! initFromCATH(suffix) ) 172 throw new IllegalArgumentException("Malformed CATH domain name:"+suffix); 173 return; 174 case BIO: 175 if( ! initFromBIO(name) ) 176 throw new IllegalArgumentException("Malformed BIO name:"+suffix); 177 return; 178 case ECOD: 179 if( ! initFromECOD(suffix) ) 180 throw new IllegalArgumentException("Malformed ECOD domain name:"+suffix); 181 return; 182 case PDB: 183 if( ! initFromPDB(suffix) ) 184 throw new IllegalArgumentException("Malformed PDB specification:"+suffix); 185 return; 186 case FILE: 187 // Treat file:/ prefixes as URLs 188 if( ! suffix.startsWith("/")) { 189 // Otherwise, treat as file 190 initFromFile(); 191 return; 192 } 193 // fall through to URL case 194 case URL: 195 if( ! initFromURL(name)) 196 throw new IllegalArgumentException("Malformed URL specification:"+suffix); 197 return; 198 default: 199 throw new IllegalStateException("Unimplemented Source "+mySource); 200 } 201 } 202 } 203 204 // No known prefix, so revert to guessing 205 206 // First guess regex-based identifiers 207 // SCOP domain 208 if( initFromScop(name) ) 209 return; 210 // CATH 211 if( initFromCATH(name) ) 212 return; 213 // ECOD 214 if( initFromECOD(name) ) 215 return; 216 // Never guess BIO or PDP 217 218 // URL 219 if( initFromURL(name) ) 220 return; 221 222 // Guess FILE based on file existence 223 File file = new File(FileDownloadUtils.expandUserHome(name)); 224 if( file.canRead() && !file.isDirectory() ) { 225 // an attempt to mitigate issue #398. It doesn't fix it but it catches the most common case of passing a pdb id and finding a file in working dir matching it 226 if (name.matches("\\d\\w\\w\\w")) { 227 // the plain pdb id case, this is unlikely to be what the user wants: let's let it through but warn about it 228 logger.warn("Provided 4-letter structure name '{}' matches " 229 + "file name in directory {}. Will read structure " 230 + "data from file {} and not consider the name as a " 231 + "structure identifier. If this is not what you " 232 + "want, use 'FILE:{}'", 233 name, file.getAbsoluteFile().getParent(), 234 file.getAbsolutePath(), name); 235 } else { 236 logger.info("Provided structure name '{}' matches " 237 + "file name in directory {}. Will read structure " 238 + "data from file {}.", 239 name, file.getAbsoluteFile().getParent(), 240 file.getAbsolutePath()); 241 } 242 243 initFromFile(); 244 return; 245 } 246 247 // Default to PDB 248 initFromPDB(name); 249 } 250 251 private boolean initFromScop(String name) { 252 Matcher matcher = scopPattern.matcher(name); 253 if ( matcher.matches() ) { 254 mySource = Source.SCOP; 255 pdbId = new PdbId(matcher.group(1)); 256 chainName = matcher.group(2); 257 return true; 258 } 259 return false; 260 } 261 262 private boolean initFromCATH(String name) { 263 Matcher matcher = cathPattern.matcher(name); 264 if ( matcher.matches() ){ 265 mySource = Source.CATH; 266 pdbId = new PdbId(matcher.group(1)); 267 chainName = matcher.group(2); 268 return true; 269 } 270 return false; 271 } 272 private boolean initFromECOD(String name) { 273 Matcher matcher = ecodPattern.matcher(name); 274 if ( matcher.matches() ){ 275 mySource = Source.ECOD; 276 pdbId = new PdbId(matcher.group(1)); 277 chainName = null; 278 return true; 279 } 280 return false; 281 } 282 private boolean initFromBIO(String name) { 283 Matcher matcher = BioAssemblyIdentifier.BIO_NAME_PATTERN.matcher(name); 284 if( matcher.matches() ) { 285 pdbId = new PdbId(matcher.group(1)); 286 return true; 287 } 288 return false; 289 } 290 private boolean initFromPDB(String suffix) { 291 mySource = Source.PDB; 292 SubstructureIdentifier si = new SubstructureIdentifier(suffix); 293 294 base = si; // Safe to realize immediately 295 296 pdbId = si.getPdbId(); 297 // Set chainName if unique 298 Set<String> chains = getChainNames(si); 299 if(chains.size() == 1) { 300 this.chainName = chains.iterator().next(); 301 } else if(chains.size() > 1) { 302 this.chainName = "."; 303 } else { 304 this.chainName = null; 305 } 306 return true; 307 } 308 private boolean initFromURL(String suffix) { 309 try { 310 URL url = new URL(suffix); 311 String path = url.getPath(); 312 mySource = Source.URL; 313 try { 314 pdbId = new PdbId(URLIdentifier.guessPDBID( path.substring(path.lastIndexOf('/')+1) )); 315 } catch (IllegalArgumentException e) { 316 pdbId = null; 317 } 318 chainName = null; // Don't bother checking query params here 319 return true; 320 } catch(MalformedURLException e) { 321 return false; 322 } 323 } 324 private boolean initFromFile() { 325 mySource = Source.FILE; 326 pdbId = null; 327 chainName = null; 328 return true; 329 } 330 331 private static Set<String> getChainNames(SubstructureIdentifier si) { 332 Set<String> chains = new TreeSet<>(); 333 List<ResidueRange> ranges = si.getResidueRanges(); 334 for(ResidueRange range : ranges) { 335 String chainName = range.getChainName(); 336 if(chainName != null) { 337 chains.add(chainName); 338 } 339 } 340 return chains; 341 } 342 343 /** 344 * Get the PDB ID for this name, if any. 345 * 346 * Equivalent to {@link SubstructureIdentifier#getPdbId() 347 * toCanonical().getPdbId()} 348 * @return The upper-case PDB Name, or null if not applicable 349 * @throws StructureException Wraps errors which occur when converting to canonical form 350 * @since 6.0.0 351 */ 352 public PdbId getPdbId() throws StructureException { 353 if( pdbId == null) { 354 pdbId = toCanonical().getPdbId(); 355 } 356 return pdbId; 357 } 358 359 /** 360 * Gets the chain ID, for structures where it is unique and well-defined. 361 * May return '.' for multi-chain ranges, '_' for wildcard chains, or 362 * null if the information is unavailable. 363 * 364 * <p>This method should only be used casually. For precise chainIds, it 365 * is better to use {@link #toCanonical()} and iterate through the 366 * residue ranges. 367 * @return 368 */ 369 public String getChainId() { 370 return chainName; 371 } 372 373 /** 374 * Get the original form of the identifier 375 */ 376 @Override 377 public String getIdentifier() { 378 return name; 379 } 380 381 @Override 382 public String toString(){ 383 384 return name; 385 } 386 387 388 public boolean isScopName() { 389 return mySource == Source.SCOP; 390 } 391 392 public boolean isCathID(){ 393 return mySource == Source.CATH; 394 } 395 396 public boolean isPdbId(){ 397 return mySource == Source.PDB; 398 } 399 400 public boolean isURL() { 401 return mySource == Source.URL; 402 } 403 404 /** 405 * Indicates that the identifier was determined to correspond to a file. 406 * Note that some file identifiers may also be valid URLs; in that case, 407 * the URL source is preferred. 408 * @return 409 */ 410 public boolean isFile() { 411 return mySource == Source.FILE; 412 } 413 414 public boolean isEcodDomain() { 415 return mySource == Source.ECOD; 416 } 417 418 public boolean isBioAssembly() { 419 return mySource == Source.BIO; 420 } 421 422 public Source getSource() { 423 return mySource; 424 } 425 426 /** 427 * StructureName wraps another StructureIdentifier. The type of the base 428 * identifier depends on the {@link #getSource() source}. Most StructureName 429 * methods deligate to the base identifier. 430 * 431 * <p>It is possible that future versions of StructureName might change the 432 * return type. Except for some specialized uses, it is probably better 433 * to create the correct type of identifier directly, rather than creating 434 * a StructureName and casting the result of this method. 435 * @return A Str 436 * @throws StructureException Wraps exceptions that may be thrown by 437 * individual implementations. For example, a SCOP identifier may require 438 * that the domain definitions be available for download. 439 */ 440 public StructureIdentifier getBaseIdentifier() throws StructureException { 441 if( base == null ) { 442 443 switch(mySource) { 444 case CATH: 445 base = CathFactory.getCathDatabase().getDescriptionByCathId(getIdentifier()); 446 break; 447 case ECOD: 448 try { 449 base = EcodFactory.getEcodDatabase().getDomainsById(name); 450 } catch (IOException e) { 451 throw new StructureException("Unable to get ECOD domain "+name,e); 452 } 453 break; 454 case SCOP: 455 // Fuzzy matching of the domain name to the current default factory 456 base = guessScopDomain(getIdentifier(),ScopFactory.getSCOP()); 457 if(base == null) { 458 // Guessing didn't work, so just use the PDBID and Chain from name 459 // Guess that '_' means 'whole structure' 460 if ("_".equals(chainName)) { 461 base = new SubstructureIdentifier(pdbId.getId()); 462 } else { 463 base = new SubstructureIdentifier(pdbId, ResidueRange.parseMultiple(chainName)); 464 } 465 logger.error("Unable to find {}, so using {}",name,base); 466 } 467 break; 468 case FILE: 469 try { 470 String[] prefix = name.split(":", 2); 471 String filename; 472 if(prefix.length > 1) { 473 filename = prefix[1]; 474 } else { 475 filename = name; 476 } 477 filename = FileDownloadUtils.expandUserHome(filename); 478 base = new URLIdentifier(new File(filename).toURI().toURL()); 479 } catch (MalformedURLException e) { 480 // Should never happen 481 throw new StructureException("Unable to get URL for file: "+name,e); 482 } 483 break; 484 case URL: 485 try { 486 base = new URLIdentifier(name); 487 } catch (MalformedURLException e) { 488 throw new StructureException("Invalid URL: "+name,e); 489 } 490 break; 491 case BIO: 492 base = new BioAssemblyIdentifier(name); 493 break; 494 case PDB: 495 base = new SubstructureIdentifier(getIdentifier()); 496 break; 497 default: 498 throw new IllegalStateException("Unimplemented source: "+mySource); 499 } 500 } 501 return base; 502 } 503 504 @Override 505 public SubstructureIdentifier toCanonical() throws StructureException { 506 return getBaseIdentifier().toCanonical(); 507 } 508 509 @Override 510 public Structure reduce(Structure input) throws StructureException { 511 return getBaseIdentifier().reduce(input); 512 } 513 514 @Override 515 public Structure loadStructure(AtomCache cache) throws StructureException, 516 IOException { 517 return getBaseIdentifier().loadStructure(cache); 518 } 519 520 @Override 521 public int hashCode() { 522 final int prime = 31; 523 int result = 1; 524 result = prime * result + ((name == null) ? 0 : name.hashCode()); 525 return result; 526 } 527 528 @Override 529 public boolean equals(Object obj) { 530 if (this == obj) 531 return true; 532 if (obj == null) 533 return false; 534 if (getClass() != obj.getClass()) 535 return false; 536 StructureName other = (StructureName) obj; 537 if (name == null) { 538 if (other.name != null) 539 return false; 540 } else if (!name.equals(other.name)) 541 return false; 542 return true; 543 } 544 545 /** 546 * Orders identifiers lexicographically by PDB ID and then full Identifier 547 */ 548 @Override 549 public int compareTo(StructureName o) { 550 if ( this.equals(o)) 551 return 0; 552 553 PdbId pdb1 = null; 554 PdbId pdb2 = null; 555 try { 556 pdb1 = this.getPdbId(); 557 } catch (StructureException e) {} 558 try { 559 pdb2 = this.getPdbId(); 560 } catch (StructureException e) {} 561 562 int comp = 0; 563 564 // Sort those with PDBIDs before those without 565 if( pdb1 == null ) { 566 if( pdb2 != null) { 567 return 1; // this > o 568 } 569 // both null 570 } else if( pdb2 == null){ 571 return -1; // this < o 572 } else { 573 // neither null 574 comp = pdb1.compareTo(pdb2); 575 } 576 if( comp != 0 ) { 577 return comp; 578 } 579 580 // break tie with full identifiers 581 String pdb1Str = this.getIdentifier(); 582 String pdb2Str = o.getIdentifier(); 583 584 // Throws NPE for nulls 585 return pdb1Str.compareTo(pdb2Str); 586 } 587 588 /** 589 * <p> 590 * Guess a scop domain. If an exact match is found, return that. 591 * 592 * <p> 593 * Otherwise, return the first scop domain found for the specified protein such that 594 * <ul> 595 * <li>The chains match, or one of the chains is '_' or '.'. 596 * <li>The domains match, or one of the domains is '_'. 597 * </ul> 598 * 599 * In some cases there may be several valid matches. In this case a warning 600 * will be logged. 601 * 602 * @param name SCOP domain name, or a guess thereof 603 * @param scopDB SCOP domain provider 604 * @return The best match for name among the domains of scopDB, or null if none match. 605 */ 606 public static ScopDomain guessScopDomain(String name, ScopDatabase scopDB) { 607 List<ScopDomain> matches = new LinkedList<>(); 608 609 // Try exact match first 610 ScopDomain domain = scopDB.getDomainByScopID(name); 611 if (domain != null) { 612 return domain; 613 } 614 615 // Didn't work. Guess it! 616 logger.warn("Warning, could not find SCOP domain: " + name); 617 618 Matcher scopMatch = scopPattern.matcher(name); 619 if (scopMatch.matches()) { 620 String pdbID = scopMatch.group(1); 621 String chainName = scopMatch.group(2); 622 String domainID = scopMatch.group(3); 623 624 for (ScopDomain potentialSCOP : scopDB.getDomainsForPDB(pdbID)) { 625 Matcher potMatch = scopPattern.matcher(potentialSCOP.getScopId()); 626 if (potMatch.matches()) { 627 if (chainName.equals(potMatch.group(2)) || "_".equals(chainName) || ".".equals(chainName) 628 || "_".equals(potMatch.group(2)) || ".".equals(potMatch.group(2))) { 629 if (domainID.equals(potMatch.group(3)) || "_".equals(domainID) || "_".equals(potMatch.group(3))) { 630 // Match, or near match 631 matches.add(potentialSCOP); 632 } 633 } 634 } 635 } 636 } 637 638 Iterator<ScopDomain> match = matches.iterator(); 639 if (match.hasNext()) { 640 ScopDomain bestMatch = match.next(); 641 if(logger.isWarnEnabled()) { 642 StringBuilder warnMsg = new StringBuilder(); 643 warnMsg.append("Trying domain " + bestMatch.getScopId() + "."); 644 if (match.hasNext()) { 645 warnMsg.append(" Other possibilities: "); 646 while (match.hasNext()) { 647 warnMsg.append(match.next().getScopId()).append(" "); 648 } 649 } 650 warnMsg.append(System.getProperty("line.separator")); 651 logger.warn(warnMsg.toString()); 652 } 653 return bestMatch; 654 } else { 655 return null; 656 } 657 } 658 659 660 661}