/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 */

package org.biojava.nbio.structure.ecod;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.biojava.nbio.structure.PdbId;
import org.biojava.nbio.structure.align.util.UserConfiguration;
import org.biojava.nbio.core.util.FileDownloadUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Provides access to the Evolutionary Classification of Protein Domains (ECOD).
 *
 * The preferred mechanism for obtaining instances of this class is through the
 * {@link EcodFactory} class.
 *
 * Reference:
 * H. Cheng, R. D. Schaeffer, Y. Liao, L. N. Kinch, J. Pei, S. Shi, B. H.
 * Kim, N. V. Grishin. (2014) ECOD: An evolutionary classification of protein
 * domains. PLoS Comput Biol 10(12): e1003926.
 * http://prodata.swmed.edu/ecod/
 *
 * @author Spencer Bliven
 *
 */
public class EcodInstallation implements EcodDatabase {
    private static final Logger logger = LoggerFactory.getLogger(EcodInstallation.class);

    public static final String DEFAULT_VERSION = "latest";
    private static final String DOMAINS_FILENAME_FORMAT = "ecod.%s.domains.txt";

    public static final String ECOD_URL = "http://prodata.swmed.edu";
    public static final String DOMAINS_PATH = "/ecod/distributions/";

    // ECOD identifiers are e<pdbID><chain><domain>, where the PDB ID is the
    // four characters following the leading 'e'.
    // Chain and domain can both be multi-letter (e.g. e2q7zA10)
    public static final Pattern ECOD_RE = Pattern.compile("^e(....).+\\d+$");


    private String cacheLocation;
    private String requestedVersion; // version requested, e.g. "latest". Used for the paths
    private String parsedVersion; // actual version parsed

    // lock to prevent multiple threads from downloading simultaneously
    // Should hold the lock when reading/writing allDomains or domainMap
    private final ReadWriteLock domainsFileLock;
    private List<EcodDomain> allDomains;
    private Map<PdbId,List<EcodDomain>> domainMap;//PDB ID -> domains, lazily constructed from allDomains

    private String url;

    // Frequency of ECOD updates, in days. If non-null, redownloads "latest" if older than this.
    private Integer updateFrequency = 14;

    /**
     * Use EcodFactory to create instances. The instantiation of multiple
     * installations at the same path can lead to race conditions when downloading
     * files.
     * @param cacheLocation Location to save files, typically from the PDB_CACHE_DIR parameter
     * @param version ECOD version to fetch (e.g. {@link #DEFAULT_VERSION})
     */
    public EcodInstallation(String cacheLocation, String version) {
        domainsFileLock = new ReentrantReadWriteLock();

        this.cacheLocation = cacheLocation;

        this.requestedVersion = version;
        this.url = ECOD_URL;

        allDomains = null; // null signals it needs to be parsed
        domainMap = null; // null signals it needs to be constructed from allDomains
    }

    /**
     * Creates an installation of the default version, cached at the path
     * given by a fresh {@link UserConfiguration}.
     * @see EcodFactory#getEcodDatabase()
     */
    public EcodInstallation() {
        this( new UserConfiguration().getCacheFilePath(), DEFAULT_VERSION );
    }

    /*
     * Disabled single-argument constructor, kept for reference:
     *
     * public EcodInstallation(String cacheLocation) {
     *     this( cacheLocation, DEFAULT_VERSION );
     * }
     */

    /**
     * Get a list of all ECOD domains for a particular PDB ID
     * @param id the PDB ID to look up
     * @return a deep copy of the list of domains, or null if no matching
     *  domains were found or {@code id} is not a valid PDB ID
     * @throws IOException if the domains file could not be downloaded or parsed
     */
    @Override
    public List<EcodDomain> getDomainsForPdb(String id) throws IOException {
        domainsFileLock.readLock().lock();
        try {
            logger.trace("LOCK readlock");
            while( domainMap == null ) {
                // unlock to allow indexDomains to get the write lock
                // (a read lock cannot be upgraded to a write lock)
                logger.trace("UNLOCK readlock");
                domainsFileLock.readLock().unlock();
                try {
                    indexDomains();
                } finally {
                    // Always re-acquire, even if indexing failed, so that the
                    // outer finally releases a lock this thread actually holds
                    domainsFileLock.readLock().lock();
                    logger.trace("LOCK readlock");
                }
            }

            PdbId pdbId;
            try {
                pdbId = new PdbId(id);
            } catch (IllegalArgumentException e) {
                // not a valid PDB ID, so there cannot be any domains for it
                return null;
            }
            List<EcodDomain> doms = domainMap.get(pdbId);
            if(doms == null) {
                return null;
            }
            // Deep clone, so callers cannot modify the cached domains
            List<EcodDomain> clonedDoms = new ArrayList<EcodDomain>(doms.size());
            for(EcodDomain d : doms) {
                clonedDoms.add( new EcodDomain(d) );
            }
            return clonedDoms;
        } finally {
            logger.trace("UNLOCK readlock");
            domainsFileLock.readLock().unlock();
        }
    }

    /**
     * Get a list of domains within a particular level of the hierarchy
     * @param hierarchy A dot-separated list giving the X-group, H-group, and/or
     *  T-group (e.g. "1.1" for all members of the RIFT-related H-group).
     *  Parts beyond the T-group are ignored with a warning.
     * @return all domains matching every specified level of the hierarchy
     * @throws IOException if the domains file could not be downloaded or parsed
     * @throws NumberFormatException if a specified level is not an integer
     */
    @Override
    public List<EcodDomain> filterByHierarchy(String hierarchy) throws IOException {
        String[] xhtGroup = hierarchy.split("\\.");
        Integer xGroup = xhtGroup.length>0 ? Integer.parseInt(xhtGroup[0]) : null;
        Integer hGroup = xhtGroup.length>1 ? Integer.parseInt(xhtGroup[1]) : null;
        Integer tGroup = xhtGroup.length>2 ? Integer.parseInt(xhtGroup[2]) : null;

        // Warn once per call rather than once per domain
        if(xhtGroup.length>3) {
            logger.warn("Ignoring unexpected additional parts of ECOD {}",hierarchy);
        }

        List<EcodDomain> filtered = new ArrayList<EcodDomain>();
        for(EcodDomain d: getAllDomains()) {
            boolean match = true;
            if(xGroup != null) {
                match = match && xGroup.equals(d.getXGroup());
            }
            if(hGroup != null) {
                match = match && hGroup.equals(d.getHGroup());
            }
            if(tGroup != null) {
                match = match && tGroup.equals(d.getTGroup());
            }
            if(match) {
                filtered.add(d);
            }
        }
        return filtered;
    }

    /**
     * Get a particular ECOD domain by the domain ID (e.g. "e4hhbA1")
     * @param ecodId the ECOD domain identifier
     * @return the matching domain, or null if {@code ecodId} is null, empty,
     *  not of the form matched by {@link #ECOD_RE}, or unknown
     * @throws IOException if the domains file could not be downloaded or parsed
     */
    @Override
    public EcodDomain getDomainsById(String ecodId) throws IOException {
        if(ecodId == null || ecodId.isEmpty()) {
            return null;
        }

        Matcher match = ECOD_RE.matcher(ecodId);
        if( !match.matches() ) {
            logger.debug("{} is not a valid ECOD domain identifier",ecodId);
            return null;
        }
        String pdbId = match.group(1);
        List<EcodDomain> doms = getDomainsForPdb(pdbId);
        if(doms == null) {
            logger.debug("Null domains for {} from {}",pdbId,ecodId);
            return null;
        }
        logger.debug("Got {} domains from {}",doms.size(),pdbId);
        for(EcodDomain d: doms) {
            if(ecodId.equals(d.getDomainId())) {
                return d;
            }
        }
        return null;
    }

    /**
     * Get all ECOD domains
     * @return the list of all domains, as produced by the parser
     * @throws IOException if the domains file could not be downloaded or parsed
     */
    @Override
    public List<EcodDomain> getAllDomains() throws IOException {
        domainsFileLock.readLock().lock();
        logger.trace("LOCK readlock");
        try {
            while( allDomains == null) {
                // unlock to allow ensureDomainsFileInstalled to get the write lock
                logger.trace("UNLOCK readlock");
                domainsFileLock.readLock().unlock();
                try {
                    ensureDomainsFileInstalled();
                } finally {
                    // Always re-acquire, even on failure, so that the outer
                    // finally releases a lock this thread actually holds
                    domainsFileLock.readLock().lock();
                    logger.trace("LOCK readlock");
                }
            }
            return allDomains;
        } finally {
            logger.trace("UNLOCK readlock");
            domainsFileLock.readLock().unlock();
        }

    }

    /**
     * Clears all domains, requiring the file to be reparsed for subsequent accesses
     */
    public void clear() {
        domainsFileLock.writeLock().lock();
        logger.trace("LOCK writelock");
        try {
            allDomains = null;
            domainMap = null;
        } finally {
            logger.trace("UNLOCK writelock");
            domainsFileLock.writeLock().unlock();
        }
    }

    /**
     * Return the ECOD version, as parsed from the file.
     *
     * Note that this may differ from the version requested in the constructor
     * for the special case of "latest"
     * @return the ECOD version, or the requested version if the file did not
     *  contain a version line
     * @throws IOException If an error occurs while downloading or parsing the file
     */
    @Override
    public String getVersion() throws IOException {
        ensureDomainsFileInstalled();

        domainsFileLock.readLock().lock();
        try {
            if( parsedVersion == null) {
                return requestedVersion;
            }
            return parsedVersion;
        } finally {
            domainsFileLock.readLock().unlock();
        }
    }

    /**
     * Get the top-level ECOD server URL. Defaults to "http://prodata.swmed.edu"
     * @return the url to the ecod server
     */
    public String getUrl() {
        return url;
    }

    /**
     * Specify a different mirror for the ECOD server.
     * @param url the top-level URL of the mirror; {@link #DOMAINS_PATH} and
     *  the domains filename are appended to it when downloading
     */
    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * Get the location of the cache directory (usually set to the PDB_CACHE_DIR
     * property). ECOD files will be downloaded to this directory
     * @return the cache directory
     */
    public String getCacheLocation() {
        return cacheLocation;
    }

    /**
     * Set an alternate download location for files
     * @param cacheLocation the new cache directory; must not be null
     */
    public void setCacheLocation(String cacheLocation) {
        if(cacheLocation.equals(this.cacheLocation)) {
            return; //no change
        }
        // update location
        domainsFileLock.writeLock().lock();
        logger.trace("LOCK writelock");
        try {
            this.cacheLocation = cacheLocation;
        } finally {
            logger.trace("UNLOCK writelock");
            domainsFileLock.writeLock().unlock();
        }
    }

    /**
     * Blocks until ECOD domains file has been downloaded and parsed.
     *
     * This may be useful in multithreaded environments.
     * @throws IOException if the domains file could not be downloaded or parsed
     */
    // Populates allDomains
    public void ensureDomainsFileInstalled() throws IOException{
        // Quick check for availability
        domainsFileLock.readLock().lock();
        logger.trace("LOCK readlock");
        try {
            if( allDomains != null ) {
                return;
            }
        } finally {
            logger.trace("UNLOCK readlock");
            domainsFileLock.readLock().unlock();
        }

        // Download domains
        domainsFileLock.writeLock().lock();
        logger.trace("LOCK writelock");
        try {
            // Re-check: another thread may have finished parsing while this
            // one was waiting for the write lock
            if( allDomains != null ) {
                return;
            }
            if( !domainsAvailable() ) {
                downloadDomains();
            }
            parseDomains();
        } finally {
            logger.trace("UNLOCK writelock");
            domainsFileLock.writeLock().unlock();
        }
    }

    /**
     * Checks that the domains file has been downloaded, and that a copy of
     * the "latest" version is not older than the update frequency.
     * @return true if a usable local copy of the domains file exists
     */
    private boolean domainsAvailable() {
        domainsFileLock.readLock().lock();
        logger.trace("LOCK readlock");
        try {
            File f = getDomainFile();

            if (!f.exists() || f.length() <= 0 ) {
                return false;
            }

            // Re-download old copies of "latest".
            // Compare by value: the requested version is often not the same
            // String instance as the DEFAULT_VERSION constant.
            Integer frequency = updateFrequency;
            if(frequency != null && DEFAULT_VERSION.equals(requestedVersion) ) {
                long mod = f.lastModified();
                // Oldest acceptable modification time
                Date now = new Date();
                Calendar cal = Calendar.getInstance();
                cal.setTime(now);
                cal.add(Calendar.DAY_OF_YEAR, -frequency);
                long updateTime = cal.getTimeInMillis();
                // Check if file predates last update
                if( mod < updateTime ) {
                    logger.info("{} is out of date.",f);
                    return false;
                }
            }
            return true;
        } finally {
            logger.trace("UNLOCK readlock");
            domainsFileLock.readLock().unlock();
        }
    }

    /**
     * Downloads the domains file, overwriting any existing file.
     *
     * A malformed URL is logged rather than rethrown, so the caller goes on
     * to parse whatever local file (if any) is present.
     * @throws IOException if the download fails
     */
    private void downloadDomains() throws IOException {
        domainsFileLock.writeLock().lock();
        logger.trace("LOCK writelock");
        try {
            URL domainsURL = new URL( url + DOMAINS_PATH + getDomainFilename());
            File localFile = getDomainFile();

            logger.info("Downloading {} to: {}",domainsURL, localFile);
            FileDownloadUtils.downloadFile(domainsURL, localFile);
        } catch (MalformedURLException e) {
            logger.error("Malformed url: "+ url + DOMAINS_PATH + getDomainFilename(),e);
        } finally {
            logger.trace("UNLOCK writelock");
            domainsFileLock.writeLock().unlock();
        }
    }

    /**
     * Basename for the domains file with the current requestedVersion.
     * @return the filename, e.g. "ecod.latest.domains.txt"
     */
    private String getDomainFilename() {
        return String.format(DOMAINS_FILENAME_FORMAT,requestedVersion);
    }

    /**
     * Local location for the domain file
     * @return the domains file inside the cache directory
     */
    private File getDomainFile() {
        return new File(getCacheLocation(),getDomainFilename());
    }

    /**
     * The expected ECOD update frequency determines whether the version
     * "latest" should be re-downloaded
     * @return the expected ECOD update frequency, in days
     */
    public Integer getUpdateFrequency() {
        return updateFrequency;
    }

    /**
     * The "latest" version will be re-downloaded if it is older than
     * {@link #getUpdateFrequency()} days. Setting this to null disables
     * re-downloading (delete $PDB_CACHE_DIR/ecod.latest.domains.txt manually
     * to force updating). Setting to 0 will force downloading for every
     * program execution.
     * @param updateFrequency the updateFrequency to set
     */
    public void setUpdateFrequency(Integer updateFrequency) {
        this.updateFrequency = updateFrequency;
    }

    /**
     * Parses the domains from the local file, populating allDomains and
     * parsedVersion
     * @throws IOException if the file cannot be read
     */
    private void parseDomains() throws IOException {
        domainsFileLock.writeLock().lock();
        logger.trace("LOCK writelock");
        try {
            EcodParser parser = new EcodParser(getDomainFile());
            allDomains = parser.getDomains();
            parsedVersion = parser.getVersion();
        } finally {
            logger.trace("UNLOCK writelock");
            domainsFileLock.writeLock().unlock();
        }
    }

    /**
     * Populates domainMap from allDomains
     * @throws IOException if the domains file could not be downloaded or parsed
     */
    private void indexDomains() throws IOException {
        domainsFileLock.writeLock().lock();
        logger.trace("LOCK writelock");
        try {
            if( domainMap != null ) {
                // Another thread built the index while this one was waiting
                return;
            }
            if( allDomains == null) {
                ensureDomainsFileInstalled();
            }

            // Leave enough space for all PDBs as of 2015
            Map<PdbId, List<EcodDomain>> index = new HashMap<PdbId, List<EcodDomain>>((int) (150000/.85),.85f);

            // Index with domainMap
            for(EcodDomain d : allDomains) {
                // Get the PDB ID, either directly or from the domain ID
                PdbId pdbId = d.getPdbId();
                if( pdbId == null ) {
                    String ecodId = d.getDomainId();
                    if( ecodId != null && !ecodId.isEmpty() ) {
                        Matcher match = ECOD_RE.matcher(ecodId);
                        // group() is only legal after a successful match
                        if( match.matches() ) {
                            pdbId = new PdbId(match.group(1));
                        }
                    }
                }
                if( pdbId == null ) {
                    // Such a domain could never be returned by a PDB ID lookup
                    logger.warn("Unable to determine the PDB ID of ECOD domain {}; not indexing it",d.getDomainId());
                    continue;
                }

                // Add current domain to the map
                List<EcodDomain> currDomains = index.get(pdbId);
                if( currDomains == null ) {
                    currDomains = new LinkedList<EcodDomain>();
                    index.put(pdbId,currDomains);
                }
                currDomains.add(d);
            }

            // Publish only the fully built index
            domainMap = index;
        } finally {
            logger.trace("UNLOCK writelock");
            domainsFileLock.writeLock().unlock();
        }

    }


    /**
     * Parser for the tab-delimited ECOD domains file. The file is parsed
     * completely by the constructor, which also closes the reader.
     */
    public static class EcodParser {
        /*
Version Notes

Current version (1.4) contains the following columns:

Column 1: ECOD uid - internal domain unique identifier
Column 2: ECOD domain id - domain identifier
Column 3: ECOD representative status - manual (curated) or automated nonrep
Column 4: ECOD hierachy identifier - [X-group].[H-group].[T-group].[F-group]
    * In develop45-66 these also include single numbers in the range 1-265
Column 5: PDB identifier
Column 6: Chain identifier (note: case-sensitive)
Column 7: PDB residue number range
    * These are sometimes incorrect up to at least develop124. Examples are:
      e4lxaA2 (should be A:184-385), e4lxmC3 (should be C:46P-183)
Column 8: seq_id number range (based on internal PDB indices)
Column 9: Architecture name
Column 10: X-group name
Column 11: H-group name
Column 12: T-group name
Column 13: F-group name (F_UNCLASSIFIED denotes that domain has not been assigned to an F-group)
Column 14: Domain assembly status (if domain is member of assembly, partners' ecod domain ids listed)
Column 15: Comma-separated value list of non-polymer entities within 4 A of at least one residue of domain

Notes older versions:
changelog:
v1.0 - original version (8/04/2014)
v1.1 - added rep/nonrep data (1/15/2015)
v1.2 - added f-group identifiers to fasta file, domain description file. ECODf identifiers now used when available for F-group name.
    Domain assemblies now represented by assembly uid in domain assembly status.
v1.4 - added seqid_range and headers (develop101)
         */

        /** String for unclassified F-groups */
        public static final String F_UNCLASSIFIED = "F_UNCLASSIFIED";
        /** String for single-domain assemblies */
        public static final String NOT_DOMAIN_ASSEMBLY = "NOT_DOMAIN_ASSEMBLY";
        /** Deprecated way of indicating there is an assembly. replaced by the assembly id */
        public static final String IS_DOMAIN_ASSEMBLY = "IS_DOMAIN_ASSEMBLY";
        /** Indicates a manual representative */
        public static final String IS_REPRESENTATIVE = "MANUAL_REP";
        /** Indicates not a manual representative */
        public static final String NOT_REPRESENTATIVE = "AUTO_NONREP";

        private static final Pattern VERSION_RE = Pattern.compile("^\\s*#.*ECOD\\s*version\\s+(\\S+).*");
        private static final Pattern COMMENT_RE = Pattern.compile("^\\s*#.*");

        // prevent too many warnings; negative limits print all warnings
        private final WarningLimiter isDomainAssemblyWarnings = new WarningLimiter(1);
        private final WarningLimiter hierarchicalFormatWarnings = new WarningLimiter(5);
        private final WarningLimiter numberOfFieldsWarnings = new WarningLimiter(10);

        private List<EcodDomain> domains;
        private String version;

        /**
         * @param filename path of the domains file to parse
         * @throws IOException if the file cannot be read
         */
        public EcodParser(String filename) throws IOException {
            this(new File(filename));
        }
        /**
         * @param file the domains file to parse
         * @throws IOException if the file cannot be read
         */
        public EcodParser(File file) throws IOException {
            this(new FileReader(file));
        }
        /**
         * @param reader source of the domains file; closed after parsing
         * @throws IOException if the reader cannot be read
         */
        public EcodParser(Reader reader) throws IOException {
            this(new BufferedReader(reader));
        }
        /**
         * @param reader source of the domains file; closed after parsing
         * @throws IOException if the reader cannot be read
         */
        public EcodParser(BufferedReader reader) throws IOException {
            version = null;
            parse(reader);
        }

        /**
         * Reads every line, recording the version from the header comment and
         * one domain per well-formed data line. Malformed data lines are
         * logged and skipped. Always closes the reader.
         */
        private void parse(BufferedReader in) throws IOException {
            try {
                // Allocate plenty of space for ECOD as of 2015
                ArrayList<EcodDomain> domainsList = new ArrayList<EcodDomain>(500000);

                String line = in.readLine();
                int lineNum = 1;
                while( line != null ) {
                    // Check for version string
                    Matcher match = VERSION_RE.matcher(line);
                    if(match.matches()) {
                        // special version comment
                        this.version = match.group(1);
                    } else if( !COMMENT_RE.matcher(line).matches() ) {
                        // data line (other comments are ignored)
                        String[] fields = line.split("\t");
                        if( 13 <= fields.length && fields.length <= 15) {
                            try {
                                domainsList.add( parseDataLine(fields, lineNum) );
                            } catch(NumberFormatException e) {
                                logger.warn("Error in ECOD parsing at line "+lineNum,e);
                            }
                        } else {
                            String suffix = numberOfFieldsWarnings.nextSuffix();
                            if(suffix != null) {
                                logger.warn("Unexpected number of fields in line {}.{}",lineNum,suffix);
                            }
                        }
                    }

                    line = in.readLine();
                    lineNum++;
                }
                if(this.version == null)
                    logger.info("Parsed {} ECOD domains",domainsList.size());
                else
                    logger.info("Parsed {} ECOD domains from version {}",domainsList.size(),this.version);


                this.domains = Collections.unmodifiableList( domainsList );

            } finally {
                if(in != null) {
                    in.close();
                }
            }
        }

        /**
         * Converts the 13, 14 or 15 tab-separated fields of one data line
         * into a domain.
         * @param fields the fields of the line
         * @param lineNum 1-based line number, used in log messages
         * @throws NumberFormatException if a numeric field cannot be parsed
         */
        private EcodDomain parseDataLine(String[] fields, int lineNum) {
            int i = 0; // field number, to allow future insertion of fields

            //Column 1: ECOD uid - internal domain unique identifier
            Long uid = Long.parseLong(fields[i++]);
            //Column 2: ECOD domain id - domain identifier
            String domainId = fields[i++];

            //Column 3: ECOD representative status - manual (curated) or automated nonrep
            // Manual column may be missing in version 1.0 files
            Boolean manual = null;
            if( fields.length >= 14) {
                String manualString = fields[i++];
                if(manualString.equalsIgnoreCase(IS_REPRESENTATIVE)) {
                    manual = true;
                } else if(manualString.equalsIgnoreCase(NOT_REPRESENTATIVE)) {
                    manual = false;
                } else {
                    logger.warn("Unexpected value for manual field: {} in line {}",manualString,lineNum);
                }
            }

            //Column 4: ECOD hierachy identifier - [X-group].[H-group].[T-group].[F-group]
            // hierarchical field, e.g. "1.1.4.1"
            String hierarchyField = fields[i++];
            String[] xhtGroup = hierarchyField.split("\\.");
            if(xhtGroup.length < 3 || 4 < xhtGroup.length) {
                String suffix = hierarchicalFormatWarnings.nextSuffix();
                if(suffix != null) {
                    logger.warn("Unexpected format for hierarchical field \"{}\" in line {}.{}",hierarchyField,lineNum,suffix);
                }
            }
            Integer xGroup = xhtGroup.length>0 ? Integer.parseInt(xhtGroup[0]) : null;
            Integer hGroup = xhtGroup.length>1 ? Integer.parseInt(xhtGroup[1]) : null;
            Integer tGroup = xhtGroup.length>2 ? Integer.parseInt(xhtGroup[2]) : null;
            Integer fGroup = xhtGroup.length>3 ? Integer.parseInt(xhtGroup[3]) : null;

            //Column 5: PDB identifier
            String pdbId = fields[i++];
            //Column 6: Chain identifier (note: case-sensitive)
            String chainId = fields[i++];
            //Column 7: PDB residue number range
            String range = fields[i++];

            //Column 8: seq_id number range (based on internal PDB indices)
            //Added in version 1.4
            String seqId = null;
            if( fields.length >= 15) {
                seqId = fields[i++];
            }

            // Intern strings likely to be shared by many domains. Quotes are
            // stripped first so that the stored strings are the interned ones.
            //Column 9: Architecture name
            String architectureName = fields[i++].intern();
            //Column 10: X-group name
            String xGroupName = clearStringQuotes(fields[i++]).intern();
            //Column 11: H-group name
            String hGroupName = clearStringQuotes(fields[i++]).intern();
            //Column 12: T-group name
            String tGroupName = clearStringQuotes(fields[i++]).intern();
            //Column 13: F-group name (F_UNCLASSIFIED denotes that domain has not been assigned to an F-group)
            //Contents changed in version 1.3
            String fGroupName = clearStringQuotes(fields[i++]).intern();

            //Column 14: Domain assembly status (if domain is member of assembly, partners' ecod domain ids listed)
            Long assemblyId = null;
            String assemblyStr = fields[i++];
            if(assemblyStr.equals(NOT_DOMAIN_ASSEMBLY)) {
                assemblyId = uid;
            } else if(assemblyStr.equals(IS_DOMAIN_ASSEMBLY) ) {
                // deprecated marker carries no assembly id; leave it null
                String suffix = isDomainAssemblyWarnings.nextSuffix();
                if(suffix != null) {
                    logger.info("Deprecated 'IS_DOMAIN_ASSEMBLY' value ignored in line {}.{}",lineNum,suffix);
                }
            } else {
                assemblyId = Long.parseLong(assemblyStr);
            }

            //Column 15: Comma-separated value list of non-polymer entities within 4 A of at least one residue of domain
            String ligandStr = fields[i++];
            Set<String> ligands;
            if( ligandStr.equals("NO_LIGANDS_4A") || ligandStr.isEmpty() ) {
                ligands = Collections.emptySet();
            } else {
                String[] ligSplit = ligandStr.split(",");
                ligands = new LinkedHashSet<String>(ligSplit.length);
                for(String s : ligSplit) {
                    ligands.add(s.intern());
                }
            }

            return new EcodDomain(uid, domainId, manual, xGroup, hGroup, tGroup, fGroup,pdbId, chainId, range, seqId, architectureName, xGroupName, hGroupName, tGroupName, fGroupName, assemblyId, ligands);
        }

        /** Removes one leading and one trailing double quote, if present. */
        private static String clearStringQuotes(String name) {
            if ( name.startsWith("\"")) {
                name = name.substring(1);
            }

            if ( name.endsWith("\"")) {
                name = name.substring(0,name.length()-1);
            }

            return name;
        }

        /**
         * @return a list of all EcodDomains
         */
        public List<EcodDomain> getDomains() {
            return domains;
        }

        /**
         * @return the version for this file, or null if none was parsed
         */
        public String getVersion() {
            return version;
        }

        /**
         * Limits how often a repetitive warning is logged. A positive limit
         * allows that many warnings, the last of which announces that further
         * ones are suppressed; zero suppresses all; a negative limit allows
         * every warning.
         */
        private static final class WarningLimiter {
            private static final String LAST_SUFFIX = " Not printing future similar warnings.";

            private int remaining;

            WarningLimiter(int limit) {
                this.remaining = limit;
            }

            /**
             * Consumes one warning from the budget.
             * @return null if the warning should be suppressed, otherwise the
             *  text to append to the message (empty unless this is the last
             *  warning that will be printed)
             */
            String nextSuffix() {
                if(remaining < 0) {
                    return "";
                }
                if(remaining == 0) {
                    return null;
                }
                remaining--;
                return remaining == 0 ? LAST_SUFFIX : "";
            }
        }
    }


    /**
     * Describes this installation by cache location and version. Note that
     * this calls {@link #getVersion()}, which may download and parse the
     * domains file.
     */
    @Override
    public String toString() {
        String version = null;
        try {
            version = getVersion();
        } catch (IOException e) {
            // For parsing errors, use the requested version
            version = requestedVersion;
        }

        return "EcodInstallation [cacheLocation=" + cacheLocation
                + ", version=" + version + "]";
    }

    /**
     * Parses the domains file given as the only argument and prints the
     * number of domains, followed by the IDs of the first 10.
     * @param args the path to an ECOD domains file
     */
    public static void main(String[] args) {
        if( args.length!= 1) {
            System.out.println("usage: ecod_domains.txt");
            System.exit(1); return;
        }

        String filename = args[0];

        try {
            EcodParser parser = new EcodParser(filename);

            List<EcodDomain> domains = parser.getDomains();

            System.out.format("Found %d ECOD domains.%n",domains.size());

            System.out.println("First 10 domains:");
            int i = 0;
            for(EcodDomain d: domains) {
                if( i>=10) break;

                System.out.println(d.getDomainId());
                i++;
            }
        } catch (IOException e) {
            logger.error("Unable to parse "+filename,e);
        }
    }
}