/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 */

package org.biojava.nbio.structure.ecod;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.biojava.nbio.structure.PdbId;
import org.biojava.nbio.structure.align.util.UserConfiguration;
import org.biojava.nbio.core.util.FileDownloadUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Provides access to the Evolutionary Classification of Protein Domains (ECOD).
 *
 * The preferred mechanism for obtaining instances of this class is through the
 * {@link EcodFactory} class.
 *
 * Reference:
 * H. Cheng, R. D. Schaeffer, Y. Liao, L. N. Kinch, J. Pei, S. Shi, B. H.
 * Kim, N. V. Grishin. (2014) ECOD: An evolutionary classification of protein
 * domains. PLoS Comput Biol 10(12): e1003926.
 * http://prodata.swmed.edu/ecod/
 *
 * @author Spencer Bliven
 *
 */
public class EcodInstallation implements EcodDatabase {
	private static final Logger logger = LoggerFactory.getLogger(EcodInstallation.class);

	public static final String DEFAULT_VERSION = "latest";
	private static final String DOMAINS_FILENAME_FORMAT = "ecod.%s.domains.txt";

	public static final String ECOD_URL = "http://prodata.swmed.edu";
	public static final String DOMAINS_PATH = "/ecod/distributions/";

	// ECOD identifiers are e<pdbID><chain><domain>, where the PDB ID is four
	// characters long. Chain and domain can both be multi-letter (e.g. e2q7zA10)
	public static final Pattern ECOD_RE = Pattern.compile("^e(....).+\\d+$");


	private String cacheLocation;
	private String requestedVersion; // version requested, e.g. "latest". Used for the paths
	private String parsedVersion; // actual version parsed

	// lock to prevent multiple threads from downloading simultaneously
	// Should hold the lock when reading/writing allDomains or domainMap
	private final ReadWriteLock domainsFileLock;
	private List<EcodDomain> allDomains;
	private Map<PdbId,List<EcodDomain>> domainMap;//PDB ID -> domains, lazily constructed from allDomains

	private String url;

	// Frequency of ECOD updates, in days. If non-null, redownloads "latest" if older than this.
	private Integer updateFrequency = 14;

	/**
	 * Use EcodFactory to create instances. The instantiation of multiple
	 * installations at the same path can lead to race conditions when downloading
	 * files.
	 * @param cacheLocation Location to save files, typically from the PDB_CACHE_DIR parameter
	 * @param version ECOD requestedVersion to fetch
	 */
	public EcodInstallation(String cacheLocation, String version) {
		domainsFileLock = new ReentrantReadWriteLock();

		this.cacheLocation = cacheLocation;

		this.requestedVersion = version;
		this.url = ECOD_URL;

		allDomains = null; // null signals it needs to be parsed
		domainMap = null; // null signals it needs to be constructed from allDomains
	}

	/**
	 * Creates an installation for {@link #DEFAULT_VERSION} in the cache
	 * directory reported by a fresh {@link UserConfiguration}.
	 * @see EcodFactory#getEcodDatabase()
	 */
	public EcodInstallation() {
		this( new UserConfiguration().getCacheFilePath(), DEFAULT_VERSION );
	}

	// NOTE(review): the original source contained a single-argument constructor
	// (cacheLocation only, using DEFAULT_VERSION) that was disabled by an
	// unterminated "/**", which also leaked its text into the javadoc of
	// getDomainsForPdb. It is kept disabled here; confirm whether it should be
	// restored or removed for good.
	//
	// public EcodInstallation(String cacheLocation) {
	// 	this( cacheLocation, DEFAULT_VERSION );
	// }

	/**
	 * Get a list of all ECOD domains for a particular PDB ID.
	 *
	 * The domains file is downloaded/parsed and indexed on first use. The
	 * returned domains are copies, so callers may modify them freely.
	 * @param id PDB identifier
	 * @return the list of domains, or null if no matching domains were found
	 *  (including when {@code id} is rejected by {@link PdbId})
	 * @throws IOException if the domains file cannot be downloaded or parsed
	 */
	@Override
	public List<EcodDomain> getDomainsForPdb(String id) throws IOException {
		while( true ) {
			domainsFileLock.readLock().lock();
			logger.trace("LOCK readlock");
			try {
				if( domainMap != null ) {
					return copyDomainsForPdb(id);
				}
			} finally {
				logger.trace("UNLOCK readlock");
				domainsFileLock.readLock().unlock();
			}
			// The read lock is fully released here, so indexDomains can take the
			// write lock, and an exception from it is not masked by a second unlock.
			// Loop in case clear() ran before we re-acquire the read lock.
			indexDomains();
		}
	}

	/**
	 * Looks up a PDB ID in domainMap and deep-copies the result.
	 * The caller must hold the read lock and domainMap must be non-null.
	 * @param id PDB identifier
	 * @return cloned domains, or null for unknown or invalid IDs
	 */
	private List<EcodDomain> copyDomainsForPdb(String id) {
		PdbId pdbId;
		try {
			pdbId = new PdbId(id);
		} catch (IllegalArgumentException e) {
			return null;
		}
		List<EcodDomain> doms = domainMap.get(pdbId);
		if(doms == null) {
			return null;
		}
		// Deep clone
		List<EcodDomain> clonedDoms = new ArrayList<>(doms.size());
		for(EcodDomain d : doms) {
			clonedDoms.add( new EcodDomain(d) );
		}
		return clonedDoms;
	}

	/**
	 * Get a list of domains within a particular level of the hierarchy
	 * @param hierarchy A dot-separated list giving the X-group, H-group, and/or
	 *  T-group (e.g. "1.1" for all members of the RIFT-related H-group).
	 *  Parts beyond the T-group are ignored with a warning.
	 * @return all domains whose X/H/T groups equal the given parts; the
	 *  elements are the cached domain objects, not copies
	 * @throws IOException if the domains file cannot be downloaded or parsed
	 * @throws NumberFormatException if one of the first three parts is not an integer
	 */
	@Override
	public List<EcodDomain> filterByHierarchy(String hierarchy) throws IOException {
		String[] xhtGroup = hierarchy.split("\\.");
		// A null group means "no constraint at this level"
		Integer xGroup = xhtGroup.length>0 ? Integer.parseInt(xhtGroup[0]) : null;
		Integer hGroup = xhtGroup.length>1 ? Integer.parseInt(xhtGroup[1]) : null;
		Integer tGroup = xhtGroup.length>2 ? Integer.parseInt(xhtGroup[2]) : null;

		// Warn once per call rather than once per domain
		if(xhtGroup.length>3) {
			logger.warn("Ignoring unexpected additional parts of ECOD {}",hierarchy);
		}

		List<EcodDomain> filtered = new ArrayList<>();
		for(EcodDomain d: getAllDomains()) {
			if(xGroup != null && !xGroup.equals(d.getXGroup())) {
				continue;
			}
			if(hGroup != null && !hGroup.equals(d.getHGroup())) {
				continue;
			}
			if(tGroup != null && !tGroup.equals(d.getTGroup())) {
				continue;
			}
			filtered.add(d);
		}
		return filtered;
	}

	/**
	 * Get a particular ECOD domain by the domain ID (e.g. "e4hhbA1")
	 * @param ecodId ECOD domain identifier
	 * @return a copy of the matching domain, or null if the ID is null, empty,
	 *  does not match {@link #ECOD_RE}, or is not present in this ECOD version
	 * @throws IOException if the domains file cannot be downloaded or parsed
	 */
	@Override
	public EcodDomain getDomainsById(String ecodId) throws IOException {
		if(ecodId == null || ecodId.isEmpty()) {
			return null;
		}

		Matcher match = ECOD_RE.matcher(ecodId);
		if( !match.matches() ) {
			// Without a match there is no PDB ID to look up
			logger.debug("{} is not a valid ECOD domain ID",ecodId);
			return null;
		}
		String pdbId = match.group(1);

		List<EcodDomain> doms = getDomainsForPdb(pdbId);
		if(doms == null) {
			logger.debug("Null domains for {} from {}",pdbId,ecodId);
			return null;
		}
		logger.debug("Got {} domains from {}",doms.size(),pdbId);
		for(EcodDomain d: doms) {
			if(ecodId.equals(d.getDomainId())) {
				return d;
			}
		}
		return null;
	}

	/**
	 * Get all ECOD domains
	 * @return the list produced by the parser (unmodifiable; its elements are
	 *  the cached domain objects, not copies)
	 * @throws IOException if the domains file cannot be downloaded or parsed
	 */
	@Override
	public List<EcodDomain> getAllDomains() throws IOException {
		while( true ) {
			domainsFileLock.readLock().lock();
			logger.trace("LOCK readlock");
			try {
				if( allDomains != null ) {
					return allDomains;
				}
			} finally {
				logger.trace("UNLOCK readlock");
				domainsFileLock.readLock().unlock();
			}
			// Read lock released: ensureDomainsFileInstalled may take the write lock
			ensureDomainsFileInstalled();
		}
	}

	/**
	 * Clears all domains, requiring the file to be reparsed for subsequent accesses
	 */
	public void clear() {
		domainsFileLock.writeLock().lock();
		logger.trace("LOCK writelock");
		try {
			allDomains = null;
			domainMap = null;
		} finally {
			logger.trace("UNLOCK writelock");
			domainsFileLock.writeLock().unlock();
		}
	}

	/**
	 * Return the ECOD version, as parsed from the file.
	 *
	 * Note that this may differ from the version requested in the constructor
	 * for the special case of "latest"
	 * @return the ECOD version; falls back to the requested version if the
	 *  file did not contain a version comment
	 * @throws IOException If an error occurs while downloading or parsing the file
	 */
	@Override
	public String getVersion() throws IOException {
		ensureDomainsFileInstalled();

		domainsFileLock.readLock().lock();
		logger.trace("LOCK readlock");
		try {
			if( parsedVersion == null) {
				return requestedVersion;
			}
			return parsedVersion;
		} finally {
			logger.trace("UNLOCK readlock");
			domainsFileLock.readLock().unlock();
		}
	}

	/**
	 * Get the top-level ECOD server URL. Defaults to "http://prodata.swmed.edu"
	 * @return the url to the ecod server
	 */
	public String getUrl() {
		return url;
	}

	/**
	 * Specify a different mirror for the ECOD server.
	 * @param url the top-level server URL to use for downloads
	 */
	public void setUrl(String url) {
		this.url = url;
	}

	/**
	 * Get the location of the cache directory (usually set to the PDB_CACHE_DIR
	 * property). ECOD files will be downloaded to this directory
	 * @return the cache directory path
	 */
	public String getCacheLocation() {
		return cacheLocation;
	}

	/**
	 * Set an alternate download location for files.
	 *
	 * Domains that were already parsed stay cached; call {@link #clear()} to
	 * force a re-read from the new location.
	 * @param cacheLocation the new cache directory path
	 */
	public void setCacheLocation(String cacheLocation) {
		domainsFileLock.writeLock().lock();
		logger.trace("LOCK writelock");
		try {
			boolean unchanged = cacheLocation == null
					? this.cacheLocation == null
					: cacheLocation.equals(this.cacheLocation);
			if(unchanged) {
				return; //no change
			}
			// update location
			this.cacheLocation = cacheLocation;
		} finally {
			logger.trace("UNLOCK writelock");
			domainsFileLock.writeLock().unlock();
		}
	}

	/**
	 * Blocks until ECOD domains file has been downloaded and parsed.
	 *
	 * This may be useful in multithreaded environments.
	 * @throws IOException if the file cannot be downloaded, validated or parsed
	 */
	// Populates allDomains
	public void ensureDomainsFileInstalled() throws IOException{
		// Quick check for availability
		domainsFileLock.readLock().lock();
		logger.trace("LOCK readlock");
		try {
			if( allDomains != null ) {
				return;
			}
		} finally {
			logger.trace("UNLOCK readlock");
			domainsFileLock.readLock().unlock();
		}

		// Download domains
		domainsFileLock.writeLock().lock();
		logger.trace("LOCK writelock");
		try {
			// Re-check: another thread may have finished while we waited for the lock
			if( allDomains != null ) {
				return;
			}
			if( !domainsAvailable() ) {
				downloadDomains();
			}
			parseDomains();
		} finally {
			logger.trace("UNLOCK writelock");
			domainsFileLock.writeLock().unlock();
		}
	}

	/**
	 * Checks that the domains file has been downloaded, passes validation and,
	 * for the "latest" version, is not older than the update frequency.
	 * @return true if the local file can be used as is
	 */
	private boolean domainsAvailable() {
		domainsFileLock.readLock().lock();
		logger.trace("LOCK readlock");
		try {
			File f = getDomainFile();

			if (! (f.exists() && FileDownloadUtils.validateFile(f))) {
				return false;
			}

			// Re-download old copies of "latest"
			// Copy to a local so a concurrent setUpdateFrequency(null) cannot cause an NPE
			Integer frequency = updateFrequency;
			if(frequency != null && DEFAULT_VERSION.equals(requestedVersion)) {
				long mod = f.lastModified();
				// Oldest acceptable modification time: now minus 'frequency' days
				Calendar cal = Calendar.getInstance();
				cal.setTime(new Date());
				cal.add(Calendar.DAY_OF_YEAR, -frequency);
				long updateTime = cal.getTimeInMillis();
				// Check if file predates last update
				if( mod < updateTime ) {
					logger.info("{} is out of date.",f);
					return false;
				}
			}
			return true;
		} finally {
			logger.trace("UNLOCK readlock");
			domainsFileLock.readLock().unlock();
		}
	}

	/**
	 * Downloads the domains file +/- its validation metadata, overwriting any existing file
	 * @throws IOException in cases of file I/O, including a malformed server URL
	 *  and failure to download a healthy (non-corrupted) file.
	 */
	private void downloadDomains() throws IOException {
		domainsFileLock.writeLock().lock();
		logger.trace("LOCK writelock");
		try {
			URL domainsURL = new URL( url + DOMAINS_PATH + getDomainFilename());
			File localFile = getDomainFile();

			logger.info("Downloading {} to: {}",domainsURL, localFile);
			FileDownloadUtils.createValidationFiles(domainsURL, localFile, null, FileDownloadUtils.Hash.UNKNOWN);
			FileDownloadUtils.downloadFile(domainsURL, localFile);
			if(! FileDownloadUtils.validateFile(localFile)) {
				throw new IOException("Downloaded file invalid: "+ localFile);
			}
		} catch (MalformedURLException e) {
			// Report and propagate: continuing would only fail later while
			// parsing a missing (or silently stale) file
			logger.error("Malformed url: "+ url + DOMAINS_PATH + getDomainFilename(),e);
			throw e;
		} finally {
			logger.trace("UNLOCK writelock");
			domainsFileLock.writeLock().unlock();
		}
	}

	/**
	 * Basename for the domains file with the current requestedVersion.
	 * @return the file name, e.g. "ecod.latest.domains.txt"
	 */
	private String getDomainFilename() {
		return String.format(DOMAINS_FILENAME_FORMAT,requestedVersion);
	}

	/**
	 * Local location for the domain file
	 * @return the domains file inside the cache directory
	 */
	private File getDomainFile() {
		return new File(getCacheLocation(),getDomainFilename());
	}

	/**
	 * The expected ECOD update frequency determines whether the version
	 * "latest" should be re-downloaded
	 * @return the expected ECOD update frequency, in days
	 */
	public Integer getUpdateFrequency() {
		return updateFrequency;
	}

	/**
	 * The "latest" version will be re-downloaded if it is older than
	 * {@link #getUpdateFrequency()} days. Setting this to null disables
	 * re-downloading (delete $PDB_CACHE_DIR/ecod.latest.domains.txt manually
	 * to force updating). Setting to 0 will force downloading for every
	 * program execution.
	 * @param updateFrequency the updateFrequency to set
	 */
	public void setUpdateFrequency(Integer updateFrequency) {
		this.updateFrequency = updateFrequency;
	}

	/**
	 * Parses the domains from the local file into allDomains and records the
	 * version found in the file. Any existing index is discarded.
	 * @throws IOException if the file cannot be read
	 */
	private void parseDomains() throws IOException {
		domainsFileLock.writeLock().lock();
		logger.trace("LOCK writelock");
		try {
			EcodParser parser = new EcodParser(getDomainFile());
			allDomains = parser.getDomains();
			parsedVersion = parser.getVersion();
			// The index (if any) referred to the previous list
			domainMap = null;
		} finally {
			logger.trace("UNLOCK writelock");
			domainsFileLock.writeLock().unlock();
		}
	}

	/**
	 * Populates domainMap from allDomains. Does nothing if the index already
	 * exists. Domains for which no PDB ID can be determined are left out.
	 * @throws IOException if the domains file cannot be downloaded or parsed
	 */
	private void indexDomains() throws IOException {
		domainsFileLock.writeLock().lock();
		logger.trace("LOCK writelock");
		try {
			if( domainMap != null ) {
				// Another thread built the index while we waited for the lock
				return;
			}
			if( allDomains == null) {
				ensureDomainsFileInstalled();
			}

			// Leave enough space for all PDBs as of 2015
			// Build into a local map so a failure never publishes a partial index
			Map<PdbId,List<EcodDomain>> index = new HashMap<>((int) (150000/.85),.85f);

			for(EcodDomain d : allDomains) {
				PdbId pdbId = resolvePdbId(d);
				if( pdbId == null ) {
					logger.debug("Unable to determine PDB ID for ECOD domain {}",d.getDomainId());
					continue;
				}

				// Add current domain to the map
				List<EcodDomain> currDomains = index.get(pdbId);
				if( currDomains == null ) {
					currDomains = new LinkedList<>();
					index.put(pdbId,currDomains);
				}
				currDomains.add(d);
			}
			domainMap = index;
		} finally {
			logger.trace("UNLOCK writelock");
			domainsFileLock.writeLock().unlock();
		}

	}

	/**
	 * Get the PDB ID of a domain, either directly or from the domain ID.
	 * @param d the domain
	 * @return the PDB ID, or null if the domain has none and its domain ID is
	 *  missing, does not match {@link #ECOD_RE}, or is rejected by {@link PdbId}
	 */
	private static PdbId resolvePdbId(EcodDomain d) {
		PdbId pdbId = d.getPdbId();
		if( pdbId != null ) {
			return pdbId;
		}
		String ecodId = d.getDomainId();
		if( ecodId == null || ecodId.isEmpty() ) {
			return null;
		}
		Matcher match = ECOD_RE.matcher(ecodId);
		// group() may only be called after a successful match
		if( !match.matches() ) {
			return null;
		}
		try {
			return new PdbId(match.group(1));
		} catch (IllegalArgumentException e) {
			return null;
		}
	}


	/**
	 * Parser for ECOD domain files (tab-separated, with '#' comment lines).
	 * The whole input is parsed by the constructor, which also closes it.
	 */
	public static class EcodParser {
		/*
Version Notes

Current version (1.4) contains the following columns:

Column 1: ECOD uid - internal domain unique identifier
Column 2: ECOD domain id - domain identifier
Column 3: ECOD representative status - manual (curated) or automated nonrep
Column 4: ECOD hierarchy identifier - [X-group].[H-group].[T-group].[F-group]
	* In develop45-66 these also include single numbers in the range 1-265
Column 5: PDB identifier
Column 6: Chain identifier (note: case-sensitive)
Column 7: PDB residue number range
	* These are sometimes incorrect up to at least develop124. Examples are:
	  e4lxaA2 (should be A:184-385), e4lxmC3 (should be C:46P-183)
Column 8: seq_id number range (based on internal PDB indices)
Column 9: Architecture name
Column 10: X-group name
Column 11: H-group name
Column 12: T-group name
Column 13: F-group name (F_UNCLASSIFIED denotes that domain has not been assigned to an F-group)
Column 14: Domain assembly status (if domain is member of assembly, partners' ecod domain ids listed)
Column 15: Comma-separated value list of non-polymer entities within 4 A of at least one residue of domain

Notes older versions:
changelog:
v1.0 - original version (8/04/2014)
v1.1 - added rep/nonrep data (1/15/2015)
v1.2 - added f-group identifiers to fasta file, domain description file. ECODf identifiers now used when available for F-group name.
	Domain assemblies now represented by assembly uid in domain assembly status.
v1.4 - added seqid_range and headers (develop101)
		 */

		/** String for unclassified F-groups */
		public static final String F_UNCLASSIFIED = "F_UNCLASSIFIED";
		/** String for single-domain assemblies */
		public static final String NOT_DOMAIN_ASSEMBLY = "NOT_DOMAIN_ASSEMBLY";
		/** Deprecated way of indicating there is an assembly. replaced by the assembly id */
		public static final String IS_DOMAIN_ASSEMBLY = "IS_DOMAIN_ASSEMBLY";
		/** Indicates a manual representative */
		public static final String IS_REPRESENTATIVE = "MANUAL_REP";
		/** Indicates not a manual representative */
		public static final String NOT_REPRESENTATIVE = "AUTO_NONREP";

		// Comment line carrying the ECOD version, and comment lines in general
		private static final Pattern VERSION_RE = Pattern.compile("^\\s*#.*ECOD\\s*version\\s+(\\S+).*");
		private static final Pattern COMMENT_RE = Pattern.compile("^\\s*#.*");

		private List<EcodDomain> domains;
		private String version;

		// Warning budgets to prevent too many warnings. A positive value n prints
		// at most n warnings (the last one announces the suppression), 0 prints
		// none, and negative numbers print all warnings.
		private int warnIsDomainAssembly = 1;
		private int warnHierarchicalFormat = 5;
		private int warnNumberOfFields = 10;

		public EcodParser(String filename) throws IOException {
			this(new File(filename));
		}
		public EcodParser(File file) throws IOException {
			this(new FileReader(file));
		}
		public EcodParser(Reader reader) throws IOException {
			this(new BufferedReader(reader));
		}
		public EcodParser(BufferedReader reader) throws IOException {
			version = null;
			parse(reader);
		}

		/**
		 * Reads all lines, recording the version comment and one domain per
		 * well-formed data line. Malformed lines are logged and skipped.
		 * The reader is always closed.
		 */
		private void parse(BufferedReader in) throws IOException {
			try (BufferedReader reader = in) {
				// Allocate plenty of space for ECOD as of 2015
				ArrayList<EcodDomain> domainsList = new ArrayList<>(500000);

				int lineNum = 0;
				String line;
				while( (line = reader.readLine()) != null ) {
					lineNum++;

					// Check for requestedVersion string
					Matcher match = VERSION_RE.matcher(line);
					if(match.matches()) {
						// special requestedVersion comment
						this.version = match.group(1);
						continue;
					}
					if(COMMENT_RE.matcher(line).matches()) {
						// ignore comments
						continue;
					}

					// data line
					String[] fields = line.split("\t");
					if( fields.length < 13 || fields.length > 15 ) {
						warnNumberOfFields(lineNum);
						continue;
					}
					try {
						domainsList.add(parseDomain(fields, lineNum));
					} catch(NumberFormatException e) {
						logger.warn("Error in ECOD parsing at line "+lineNum,e);
					}
				}
				if(this.version == null) {
					logger.info("Parsed {} ECOD domains",domainsList.size());
				} else {
					logger.info("Parsed {} ECOD domains from version {}",domainsList.size(),this.version);
				}

				this.domains = Collections.unmodifiableList( domainsList );
			}
		}

		/**
		 * Builds a domain from the 13 (v1.0), 14 (v1.1) or 15 (v1.4) fields of a data line.
		 * @throws NumberFormatException if a numeric field cannot be parsed
		 */
		private EcodDomain parseDomain(String[] fields, int lineNum) {
			int i = 0; // field number, to allow future insertion of fields

			//Column 1: ECOD uid - internal domain unique identifier
			Long uid = Long.parseLong(fields[i++]);
			//Column 2: ECOD domain id - domain identifier
			String domainId = fields[i++];

			//Column 3: ECOD representative status - manual (curated) or automated nonrep
			// Manual column may be missing in version 1.0 files
			Boolean manual = null;
			if( fields.length >= 14) {
				String manualString = fields[i++];
				if(manualString.equalsIgnoreCase(IS_REPRESENTATIVE)) {
					manual = true;
				} else if(manualString.equalsIgnoreCase(NOT_REPRESENTATIVE)) {
					manual = false;
				} else {
					logger.warn("Unexpected value for manual field: {} in line {}",manualString,lineNum);
				}
			}

			//Column 4: ECOD hierarchy identifier - [X-group].[H-group].[T-group].[F-group]
			// hierarchical field, e.g. "1.1.4.1"
			String hierarchy = fields[i++];
			String[] xhtGroup = hierarchy.split("\\.");
			if(xhtGroup.length < 3 || 4 < xhtGroup.length) {
				if(warnHierarchicalFormat == 1) {
					logger.warn("Unexpected format for hierarchical field \"{}\" in line {}. Not printing future similar warnings.",hierarchy,lineNum);
				} else if(warnHierarchicalFormat != 0) {
					logger.warn("Unexpected format for hierarchical field \"{}\" in line {}",hierarchy,lineNum);
				}
				warnHierarchicalFormat = spend(warnHierarchicalFormat);
			}
			Integer xGroup = xhtGroup.length>0 ? Integer.parseInt(xhtGroup[0]) : null;
			Integer hGroup = xhtGroup.length>1 ? Integer.parseInt(xhtGroup[1]) : null;
			Integer tGroup = xhtGroup.length>2 ? Integer.parseInt(xhtGroup[2]) : null;
			Integer fGroup = xhtGroup.length>3 ? Integer.parseInt(xhtGroup[3]) : null;

			//Column 5: PDB identifier
			String pdbId = fields[i++];
			//Column 6: Chain identifier (note: case-sensitive)
			String chainId = fields[i++];
			//Column 7: PDB residue number range
			String range = fields[i++];

			//Column 8: seq_id number range (based on internal PDB indices)
			//Added in version 1.4
			String seqId = null;
			if( fields.length >= 15) {
				seqId = fields[i++];
			}

			//Column 9: Architecture name
			// Intern strings likely to be shared by many domains
			String architectureName = fields[i++].intern();
			// Quotes are stripped before interning so the stored value is the shared one
			//Column 10: X-group name
			String xGroupName = clearStringQuotes(fields[i++]).intern();
			//Column 11: H-group name
			String hGroupName = clearStringQuotes(fields[i++]).intern();
			//Column 12: T-group name
			String tGroupName = clearStringQuotes(fields[i++]).intern();
			//Column 13: F-group name (F_UNCLASSIFIED denotes that domain has not been assigned to an F-group)
			//Contents changed in version 1.3
			String fGroupName = clearStringQuotes(fields[i++]).intern();

			//Column 14: Domain assembly status (if domain is member of assembly, partners' ecod domain ids listed)
			Long assemblyId = null;
			String assemblyStr = fields[i++];
			if(assemblyStr.equals(NOT_DOMAIN_ASSEMBLY)) {
				assemblyId = uid;
			} else if(IS_DOMAIN_ASSEMBLY.equals(assemblyStr) ) {
				// deprecated marker; assemblyId stays null
				if(warnIsDomainAssembly == 1) {
					logger.info("Deprecated 'IS_DOMAIN_ASSEMBLY' value ignored in line {}. Not printing future similar warnings.",lineNum);
				} else if(warnIsDomainAssembly != 0) {
					logger.info("Deprecated 'IS_DOMAIN_ASSEMBLY' value ignored in line {}.",lineNum);
				}
				warnIsDomainAssembly = spend(warnIsDomainAssembly);
			} else {
				assemblyId = Long.parseLong(assemblyStr);
			}

			//Column 15: Comma-separated value list of non-polymer entities within 4 A of at least one residue of domain
			String ligandStr = fields[i++];
			Set<String> ligands;
			if( "NO_LIGANDS_4A".equals(ligandStr) || ligandStr.isEmpty() ) {
				ligands = Collections.emptySet();
			} else {
				String[] ligSplit = ligandStr.split(",");
				ligands = new LinkedHashSet<>(ligSplit.length);
				for(String s : ligSplit) {
					ligands.add(s.intern());
				}
			}

			return new EcodDomain(uid, domainId, manual, xGroup, hGroup, tGroup, fGroup,pdbId, chainId, range, seqId, architectureName, xGroupName, hGroupName, tGroupName, fGroupName, assemblyId, ligands);
		}

		/** Logs a rate-limited warning about a line with an unexpected number of fields. */
		private void warnNumberOfFields(int lineNum) {
			if(warnNumberOfFields == 1) {
				logger.warn("Unexpected number of fields in line {}. Not printing future similar warnings",lineNum);
			} else if(warnNumberOfFields != 0) {
				logger.warn("Unexpected number of fields in line {}.",lineNum);
			}
			warnNumberOfFields = spend(warnNumberOfFields);
		}

		/** Uses up one warning from a budget; zero and negative budgets are left unchanged. */
		private static int spend(int budget) {
			return budget > 0 ? budget - 1 : budget;
		}

		/** Removes one leading and one trailing double quote, if present. */
		private static String clearStringQuotes(String name) {
			if ( name.startsWith("\"")) {
				name = name.substring(1);
			}

			if ( name.endsWith("\"")) {
				name = name.substring(0,name.length()-1);
			}

			return name;
		}

		/**
		 * @return a list of all EcodDomains
		 */
		public List<EcodDomain> getDomains() {
			return domains;
		}

		/**
		 * @return the requestedVersion for this file, or null if none was parsed
		 */
		public String getVersion() {
			return version;
		}
	}


	/**
	 * Note that this calls {@link #getVersion()}, which downloads and parses
	 * the domains file if that has not happened yet.
	 */
	@Override
	public String toString() {
		String version = null;
		try {
			version = getVersion();
		} catch (IOException e) {
			// For parsing errors, use the requested version
			version = requestedVersion;
		}

		return "EcodInstallation [cacheLocation=" + cacheLocation
				+ ", version=" + version + "]";
	}

	/**
	 * Parses the ECOD domains file given as the only argument and prints the
	 * number of domains followed by the IDs of the first 10.
	 * @param args a single path to an ECOD domains file
	 */
	public static void main(String[] args) {
		if( args.length!= 1) {
			System.out.println("usage: ecod_domains.txt");
			System.exit(1); return;
		}

		String filename = args[0];

		try {
			EcodParser parser = new EcodParser(filename);

			List<EcodDomain> domains = parser.getDomains();

			System.out.format("Found %d ECOD domains.%n",domains.size());

			System.out.println("First 10 domains:");
			int i = 0;
			for(EcodDomain d: domains) {
				if( i>=10) break;

				System.out.println(d.getDomainId());
				i++;
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}