/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 */

package org.biojava.nbio.structure.ecod;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.biojava.nbio.structure.align.util.UserConfiguration;
import org.biojava.nbio.core.util.FileDownloadUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Provides access to the Evolutionary Classification of Protein Domains (ECOD).
 *
 * The preferred mechanism for obtaining instances of this class is through the
 * {@link EcodFactory} class.
 *
 * Reference:
 * H. Cheng, R. D. Schaeffer, Y. Liao, L. N. Kinch, J. Pei, S. Shi, B. H.
 * Kim, N. V. Grishin. (2014) ECOD: An evolutionary classification of protein
 * domains. PLoS Comput Biol 10(12): e1003926.
 * http://prodata.swmed.edu/ecod/
 *
 * @author Spencer Bliven
 *
 */
public class EcodInstallation implements EcodDatabase {
    private static final Logger logger = LoggerFactory.getLogger(EcodInstallation.class);

    public static final String DEFAULT_VERSION = "latest";
    private static final String DOMAINS_FILENAME_FORMAT = "ecod.%s.domains.txt";

    public static final String ECOD_URL = "http://prodata.swmed.edu";
    public static final String DOMAINS_PATH = "/ecod/distributions/";

    // ECOD identifiers are e<pdbID><chain><domain>, where the PDB ID is four
    // characters long. Chain and domain can both be multi-letter (e.g. e2q7zA10)
    public static final Pattern ECOD_RE = Pattern.compile("^e(....).+\\d+$");


    private String cacheLocation;
    private String requestedVersion; // version requested, e.g. "latest". Used for the paths
    private String parsedVersion; // actual version parsed

    // lock to prevent multiple threads from downloading simultaneously
    // Should hold the lock when reading/writing allDomains or domainMap
    private final ReadWriteLock domainsFileLock;
    private List<EcodDomain> allDomains;
    private Map<String,List<EcodDomain>> domainMap;//PDB ID -> domains, lazily constructed from allDomains

    private String url;

    // Frequency of ECOD updates, in days. If non-null, redownloads "latest" if older than this.
    private Integer updateFrequency = 14;

    /**
     * Use EcodFactory to create instances. The instantiation of multiple
     * installations at the same path can lead to race conditions when downloading
     * files.
     * @param cacheLocation Location to save files, typically from the PDB_CACHE_DIR parameter
     * @param version ECOD version to fetch (e.g. {@link #DEFAULT_VERSION})
     */
    public EcodInstallation(String cacheLocation, String version) {
        domainsFileLock = new ReentrantReadWriteLock();

        this.cacheLocation = cacheLocation;

        this.requestedVersion = version;
        this.url = ECOD_URL;

        allDomains = null; // null signals it needs to be parsed
        domainMap = null; // null signals it needs to be constructed from allDomains
    }

    /**
     * Creates an installation using the cache path of a fresh
     * {@link UserConfiguration} and the {@link #DEFAULT_VERSION}.
     * @see EcodFactory#getEcodDatabase()
     */
    public EcodInstallation() {
        this( new UserConfiguration().getCacheFilePath(), DEFAULT_VERSION );
    }

    /**
     * Creates an installation for the {@link #DEFAULT_VERSION} at the given
     * cache location.
     * @param cacheLocation Location to save files
     */
    public EcodInstallation(String cacheLocation) {
        this( cacheLocation, DEFAULT_VERSION );
    }

    /**
     * Get a list of all ECOD domains for a particular PDB ID
     * @param pdbId PDB identifier; compared case-insensitively
     * @return a list of copies of the matching domains, or null if no
     *  matching domains were found
     * @throws IOException if the domains file cannot be downloaded or parsed
     */
    @Override
    public List<EcodDomain> getDomainsForPdb(String pdbId) throws IOException {
        // Locale.ROOT keeps the result independent of the default locale
        String key = (pdbId == null) ? null : pdbId.toLowerCase(Locale.ROOT);

        while( true ) {
            domainsFileLock.readLock().lock();
            logger.trace("LOCK readlock");
            try {
                if( domainMap != null ) {
                    List<EcodDomain> doms = domainMap.get(key);
                    if(doms == null) {
                        return null;
                    }
                    // Deep clone
                    List<EcodDomain> clonedDoms = new ArrayList<EcodDomain>(doms.size());
                    for(EcodDomain d : doms) {
                        clonedDoms.add( new EcodDomain(d) );
                    }
                    return clonedDoms;
                }
            } finally {
                logger.trace("UNLOCK readlock");
                domainsFileLock.readLock().unlock();
            }

            // The read lock is released here so that indexDomains can take
            // the write lock. Each lock()/unlock() pair above is balanced, so
            // an IOException thrown below propagates unchanged.
            indexDomains();
        }
    }

    /**
     * Get a list of domains within a particular level of the hierarchy
     * @param hierarchy A dot-separated list giving the X-group, H-group, and/or
     *  T-group (e.g. "1.1" for all members of the RIFT-related H-group)
     * @return all domains whose X-, H- and T-group match the parts given;
     *  parts beyond the third are ignored with a warning
     * @throws IOException if the domains file cannot be downloaded or parsed
     * @throws NumberFormatException if a part of the hierarchy is not an integer
     */
    @Override
    public List<EcodDomain> filterByHierarchy(String hierarchy) throws IOException {
        String[] xhtGroup = hierarchy.split("\\.");
        Integer xGroup = xhtGroup.length>0 ? Integer.parseInt(xhtGroup[0]) : null;
        Integer hGroup = xhtGroup.length>1 ? Integer.parseInt(xhtGroup[1]) : null;
        Integer tGroup = xhtGroup.length>2 ? Integer.parseInt(xhtGroup[2]) : null;

        // Warn once per call rather than once per domain
        if(xhtGroup.length>3) {
            logger.warn("Ignoring unexpected additional parts of ECOD {}",hierarchy);
        }

        List<EcodDomain> filtered = new ArrayList<EcodDomain>();
        for(EcodDomain d: getAllDomains()) {
            boolean match = true;
            if(xGroup != null) {
                match = xGroup.equals(d.getXGroup());
            }
            if(match && hGroup != null) {
                match = hGroup.equals(d.getHGroup());
            }
            if(match && tGroup != null) {
                match = tGroup.equals(d.getTGroup());
            }
            if(match) {
                filtered.add(d);
            }
        }
        return filtered;
    }

    /**
     * Get a particular ECOD domain by the domain ID (e.g. "e4hhbA1")
     * @param ecodId the ECOD domain identifier
     * @return the domain with exactly this identifier, or null if ecodId is
     *  null or empty or no such domain is found
     * @throws IOException if the domains file cannot be downloaded or parsed
     */
    @Override
    public EcodDomain getDomainsById(String ecodId) throws IOException {
        if(ecodId == null || ecodId.isEmpty()) {
            return null;
        }

        // The PDB ID is the four characters following the leading 'e'.
        // If the identifier does not match, pdbId stays null.
        Matcher match = ECOD_RE.matcher(ecodId);
        String pdbId = null;
        if( match.matches() ) {
            pdbId = match.group(1);
        }
        List<EcodDomain> doms = getDomainsForPdb(pdbId);
        if(doms == null) {
            logger.debug("Null domains for {} from {}",pdbId,ecodId);
            return null;
        }
        logger.debug("Got {} domains from {}",doms.size(),pdbId);
        for(EcodDomain d: doms) {
            if(ecodId.equals(d.getDomainId())) {
                return d;
            }
        }
        return null;
    }

    /**
     * Get all ECOD domains
     * @return the list of all parsed domains
     * @throws IOException if the domains file cannot be downloaded or parsed
     */
    @Override
    public List<EcodDomain> getAllDomains() throws IOException {
        while( true ) {
            domainsFileLock.readLock().lock();
            logger.trace("LOCK readlock");
            try {
                if( allDomains != null ) {
                    return allDomains;
                }
            } finally {
                logger.trace("UNLOCK readlock");
                domainsFileLock.readLock().unlock();
            }

            // Read lock released so ensureDomainsFileInstalled can take the write lock
            ensureDomainsFileInstalled();
        }
    }

    /**
     * Clears all domains, requiring the file to be reparsed for subsequent accesses
     */
    public void clear() {
        domainsFileLock.writeLock().lock();
        logger.trace("LOCK writelock");
        try {
            allDomains = null;
            domainMap = null;
        } finally {
            logger.trace("UNLOCK writelock");
            domainsFileLock.writeLock().unlock();
        }
    }

    /**
     * Return the ECOD version, as parsed from the file.
     *
     * Note that this may differ from the version requested in the constructor
     * for the special case of "latest"
     * @return the ECOD version, or the requested version if the file did not
     *  contain a version line
     * @throws IOException If an error occurs while downloading or parsing the file
     */
    @Override
    public String getVersion() throws IOException {
        ensureDomainsFileInstalled();

        domainsFileLock.readLock().lock();
        try {
            if( parsedVersion == null) {
                return requestedVersion;
            }
            return parsedVersion;
        } finally {
            domainsFileLock.readLock().unlock();
        }
    }

    /**
     * Get the top-level ECOD server URL. Defaults to "http://prodata.swmed.edu"
     * @return the url to the ecod server
     */
    public String getUrl() {
        return url;
    }

    /**
     * Specify a different mirror for the ECOD server.
     * @param url the top-level server URL to set
     */
    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * Get the location of the cache directory (usually set to the PDB_CACHE_DIR
     * property). ECOD files will be downloaded to this directory
     * @return the cache directory path
     */
    public String getCacheLocation() {
        return cacheLocation;
    }

    /**
     * Set an alternate download location for files
     * @param cacheLocation the new cache directory; must not be null
     */
    public void setCacheLocation(String cacheLocation) {
        if(cacheLocation.equals(this.cacheLocation)) {
            return; //no change
        }
        // update location
        domainsFileLock.writeLock().lock();
        logger.trace("LOCK writelock");
        try {
            this.cacheLocation = cacheLocation;
        } finally {
            logger.trace("UNLOCK writelock");
            domainsFileLock.writeLock().unlock();
        }
    }

    /**
     * Blocks until ECOD domains file has been downloaded and parsed.
     *
     * This may be useful in multithreaded environments.
     * @throws IOException if the file cannot be downloaded or parsed
     */
    // Populates allDomains
    public void ensureDomainsFileInstalled() throws IOException{
        // Quick check for availability
        domainsFileLock.readLock().lock();
        logger.trace("LOCK readlock");
        try {
            if( allDomains != null ) {
                return;
            }
        } finally {
            logger.trace("UNLOCK readlock");
            domainsFileLock.readLock().unlock();
        }

        // Download domains
        domainsFileLock.writeLock().lock();
        logger.trace("LOCK writelock");
        try {
            // Another thread may have finished the work while this one was
            // waiting for the write lock
            if( allDomains != null ) {
                return;
            }
            if( !domainsAvailable() ) {
                downloadDomains();
            }
            parseDomains();
        } finally {
            logger.trace("UNLOCK writelock");
            domainsFileLock.writeLock().unlock();
        }
    }

    /**
     * Checks that the domains file has been downloaded
     * @return false if the local file is missing or empty, or if the
     *  "latest" version was requested and the file is older than
     *  {@link #getUpdateFrequency()} days; true otherwise
     */
    private boolean domainsAvailable() {
        domainsFileLock.readLock().lock();
        logger.trace("LOCK readlock");
        try {
            File f = getDomainFile();

            if (!f.exists() || f.length() <= 0 ) {
                return false;
            }

            // Re-download old copies of "latest"
            Integer frequency = updateFrequency;
            if(frequency != null && DEFAULT_VERSION.equals(requestedVersion) ) {
                // Oldest acceptable modification time: now minus the update frequency
                Calendar cal = Calendar.getInstance();
                cal.setTime(new Date());
                cal.add(Calendar.DAY_OF_YEAR, -frequency);
                long updateTime = cal.getTimeInMillis();
                // Check if file predates last update
                if( f.lastModified() < updateTime ) {
                    logger.info("{} is out of date.",f);
                    return false;
                }
            }
            return true;
        } finally {
            logger.trace("UNLOCK readlock");
            domainsFileLock.readLock().unlock();
        }
    }

    /**
     * Downloads the domains file, overwriting any existing file
     * @throws IOException if the URL is malformed or the download fails
     */
    private void downloadDomains() throws IOException {
        domainsFileLock.writeLock().lock();
        logger.trace("LOCK writelock");
        try {
            URL domainsURL = new URL( url + DOMAINS_PATH + getDomainFilename());
            File localFile = getDomainFile();

            logger.info("Downloading {} to: {}",domainsURL, localFile);
            FileDownloadUtils.downloadFile(domainsURL, localFile);
        } catch (MalformedURLException e) {
            logger.error("Malformed url: "+ url + DOMAINS_PATH + getDomainFilename(),e);
            // Propagate: nothing was downloaded, so the caller must not go on
            // to parse a missing or stale file as though the download succeeded
            throw e;
        } finally {
            logger.trace("UNLOCK writelock");
            domainsFileLock.writeLock().unlock();
        }
    }

    /**
     * Basename for the domains file with the current requestedVersion.
     * @return the file name, e.g. "ecod.latest.domains.txt"
     */
    private String getDomainFilename() {
        return String.format(DOMAINS_FILENAME_FORMAT,requestedVersion);
    }

    /**
     * Local location for the domain file
     * @return the domains file inside the cache directory
     */
    private File getDomainFile() {
        return new File(getCacheLocation(),getDomainFilename());
    }

    /**
     * The expected ECOD update frequency determines whether the version
     * "latest" should be re-downloaded
     * @return the expected ECOD update frequency, in days
     */
    public Integer getUpdateFrequency() {
        return updateFrequency;
    }

    /**
     * The "latest" version will be re-downloaded if it is older than
     * {@link #getUpdateFrequency()} days. Setting this to null disables
     * re-downloading (delete $PDB_CACHE_DIR/ecod.latest.domains.txt manually
     * to force updating). Setting to 0 will force downloading for every
     * program execution.
     * @param updateFrequency the updateFrequency to set
     */
    public void setUpdateFrequency(Integer updateFrequency) {
        this.updateFrequency = updateFrequency;
    }

    /**
     * Parses the domains from the local file
     * @throws IOException if the file cannot be read
     */
    private void parseDomains() throws IOException {
        domainsFileLock.writeLock().lock();
        logger.trace("LOCK writelock");
        try {
            EcodParser parser = new EcodParser(getDomainFile());
            allDomains = parser.getDomains();
            parsedVersion = parser.getVersion();
        } finally {
            logger.trace("UNLOCK writelock");
            domainsFileLock.writeLock().unlock();
        }
    }

    /**
     * Populates domainMap from allDomains
     * @throws IOException if the domains file cannot be downloaded or parsed
     */
    private void indexDomains() throws IOException {
        domainsFileLock.writeLock().lock();
        logger.trace("LOCK writelock");
        try {
            if( domainMap != null ) {
                // Another thread built the index while this one was waiting
                return;
            }
            if( allDomains == null) {
                ensureDomainsFileInstalled();
            }

            // Build into a local map and publish it only once complete, so a
            // failure part-way through never leaves a partial index behind.
            // Leave enough space for all PDBs as of 2015
            Map<String, List<EcodDomain>> index =
                    new HashMap<String, List<EcodDomain>>((int) (150000/.85),.85f);

            for(EcodDomain d : allDomains) {
                // Get the PDB ID, either directly or from the domain ID
                String pdbId = d.getPdbId();
                if( pdbId == null ) {
                    String ecodId = d.getDomainId();
                    if( ecodId != null && !ecodId.isEmpty() ) {
                        Matcher match = ECOD_RE.matcher(ecodId);
                        // group() may only be called after a successful match
                        if( match.matches() ) {
                            pdbId = match.group(1);
                        }
                    }
                }
                // Lookups in getDomainsForPdb use lower-case keys
                if( pdbId != null ) {
                    pdbId = pdbId.toLowerCase(Locale.ROOT);
                }

                // Add current domain to the map
                List<EcodDomain> currDomains = index.get(pdbId);
                if( currDomains == null ) {
                    currDomains = new LinkedList<EcodDomain>();
                    index.put(pdbId,currDomains);
                }
                currDomains.add(d);
            }

            domainMap = index;
        } finally {
            logger.trace("UNLOCK writelock");
            domainsFileLock.writeLock().unlock();
        }

    }


    /**
     * Parser for the tab-separated ECOD domains file.
     */
    public static class EcodParser {
        /*
Version Notes

Current version (1.4) contains the following columns:

Column 1: ECOD uid - internal domain unique identifier
Column 2: ECOD domain id - domain identifier
Column 3: ECOD representative status - manual (curated) or automated nonrep
Column 4: ECOD hierarchy identifier - [X-group].[H-group].[T-group].[F-group]
    * In develop45-66 these also include single numbers in the range 1-265
Column 5: PDB identifier
Column 6: Chain identifier (note: case-sensitive)
Column 7: PDB residue number range
    * These are sometimes incorrect up to at least develop124. Examples are:
      e4lxaA2 (should be A:184-385), e4lxmC3 (should be C:46P-183)
Column 8: seq_id number range (based on internal PDB indices)
Column 9: Architecture name
Column 10: X-group name
Column 11: H-group name
Column 12: T-group name
Column 13: F-group name (F_UNCLASSIFIED denotes that domain has not been assigned to an F-group)
Column 14: Domain assembly status (if domain is member of assembly, partners' ecod domain ids listed)
Column 15: Comma-separated value list of non-polymer entities within 4 A of at least one residue of domain

Notes older versions:
changelog:
v1.0 - original version (8/04/2014)
v1.1 - added rep/nonrep data (1/15/2015)
v1.2 - added f-group identifiers to fasta file, domain description file. ECODf identifiers now used when available for F-group name.
    Domain assemblies now represented by assembly uid in domain assembly status.
v1.4 - added seqid_range and headers (develop101)
         */

        /** String for unclassified F-groups */
        public static final String F_UNCLASSIFIED = "F_UNCLASSIFIED";
        /** String for single-domain assemblies */
        public static final String NOT_DOMAIN_ASSEMBLY = "NOT_DOMAIN_ASSEMBLY";
        /** Deprecated way of indicating there is an assembly. replaced by the assembly id */
        public static final String IS_DOMAIN_ASSEMBLY = "IS_DOMAIN_ASSEMBLY";
        /** Indicates a manual representative */
        public static final String IS_REPRESENTATIVE = "MANUAL_REP";
        /** Indicates not a manual representative */
        public static final String NOT_REPRESENTATIVE = "AUTO_NONREP";

        /** Comment line carrying the ECOD version; group 1 is the version */
        private static final Pattern VERSION_RE = Pattern.compile("^\\s*#.*ECOD\\s*version\\s+(\\S+).*");
        /** Any comment line */
        private static final Pattern COMMENT_RE = Pattern.compile("^\\s*#.*");

        private List<EcodDomain> domains;
        private String version;

        /**
         * Parses the named file.
         * @param filename path of the domains file
         * @throws IOException if the file cannot be read
         */
        public EcodParser(String filename) throws IOException {
            this(new File(filename));
        }

        /**
         * Parses the given file.
         * @param file the domains file
         * @throws IOException if the file cannot be read
         */
        public EcodParser(File file) throws IOException {
            this(new FileReader(file));
        }

        /**
         * Parses from the given reader, which is closed afterwards.
         * @param reader source of the domains file contents
         * @throws IOException if reading fails
         */
        public EcodParser(Reader reader) throws IOException {
            this(new BufferedReader(reader));
        }

        /**
         * Parses from the given reader, which is closed afterwards.
         * @param reader source of the domains file contents
         * @throws IOException if reading fails
         */
        public EcodParser(BufferedReader reader) throws IOException {
            version = null;
            parse(reader);
        }

        /**
         * Reads every line, recording the version comment and converting
         * each data line with 13, 14 or 15 tab-separated fields into an
         * {@link EcodDomain}. Lines with another field count, or with
         * unparseable numbers, are skipped with a warning. Closes the reader.
         */
        private void parse(BufferedReader in) throws IOException {
            try {
                // Allocate plenty of space for ECOD as of 2015
                ArrayList<EcodDomain> domainsList = new ArrayList<EcodDomain>(500000);

                // Remaining number of warnings to print for each category.
                // N > 0 prints N warnings (the last announces the silence),
                // 0 prints none, negative numbers print all warnings.
                int warnIsDomainAssembly = 1;
                int warnHierarchicalFormat = 5;
                int warnNumberOfFields = 10;

                String line = in.readLine();
                int lineNum = 1;
                while( line != null ) {
                    // Check for version string
                    Matcher match = VERSION_RE.matcher(line);
                    if(match.matches()) {
                        // special version comment
                        this.version = match.group(1);
                    } else if(COMMENT_RE.matcher(line).matches()) {
                        // ignore comments
                    } else {
                        // data line
                        String[] fields = line.split("\t");
                        if( fields.length == 13 || fields.length == 14 || fields.length == 15) {
                            try {
                                int i = 0; // field number, to allow future insertion of fields

                                //Column 1: ECOD uid - internal domain unique identifier
                                Long uid = Long.parseLong(fields[i++]);
                                //Column 2: ECOD domain id - domain identifier
                                String domainId = fields[i++];

                                //Column 3: ECOD representative status - manual (curated) or automated nonrep
                                // Manual column may be missing in version 1.0 files
                                Boolean manual = null;
                                if( fields.length >= 14) {
                                    String manualString = fields[i++];
                                    if(manualString.equalsIgnoreCase(IS_REPRESENTATIVE)) {
                                        manual = true;
                                    } else if(manualString.equalsIgnoreCase(NOT_REPRESENTATIVE)) {
                                        manual = false;
                                    } else {
                                        logger.warn("Unexpected value for manual field: {} in line {}",manualString,lineNum);
                                    }
                                }

                                //Column 4: ECOD hierarchy identifier - [X-group].[H-group].[T-group].[F-group]
                                // hierarchical field, e.g. "1.1.4.1"
                                String[] xhtGroup = fields[i++].split("\\.");
                                if(xhtGroup.length < 3 || 4 < xhtGroup.length) {
                                    if(warnHierarchicalFormat > 1 || warnHierarchicalFormat < 0) {
                                        logger.warn("Unexpected format for hierarchical field \"{}\" in line {}",fields[i-1],lineNum);
                                        if(warnHierarchicalFormat > 0) {
                                            warnHierarchicalFormat--;
                                        }
                                    } else if(warnHierarchicalFormat == 1) {
                                        logger.warn("Unexpected format for hierarchical field \"{}\" in line {}. Not printing future similar warnings.",fields[i-1],lineNum);
                                        warnHierarchicalFormat = 0;
                                    }
                                }
                                Integer xGroup = xhtGroup.length>0 ? Integer.parseInt(xhtGroup[0]) : null;
                                Integer hGroup = xhtGroup.length>1 ? Integer.parseInt(xhtGroup[1]) : null;
                                Integer tGroup = xhtGroup.length>2 ? Integer.parseInt(xhtGroup[2]) : null;
                                Integer fGroup = xhtGroup.length>3 ? Integer.parseInt(xhtGroup[3]) : null;

                                //Column 5: PDB identifier
                                String pdbId = fields[i++];
                                //Column 6: Chain identifier (note: case-sensitive)
                                String chainId = fields[i++];
                                //Column 7: PDB residue number range
                                String range = fields[i++];

                                //Column 8: seq_id number range (based on internal PDB indices)
                                //Added in version 1.4
                                String seqId = null;
                                if( fields.length >= 15) {
                                    seqId = fields[i++];
                                }

                                // Strings likely to be shared by many domains are interned.
                                // Quotes are stripped first so that the value kept is the
                                // interned one.
                                //Column 9: Architecture name
                                String architectureName = fields[i++].intern();
                                //Column 10: X-group name
                                String xGroupName = clearStringQuotes(fields[i++]).intern();
                                //Column 11: H-group name
                                String hGroupName = clearStringQuotes(fields[i++]).intern();
                                //Column 12: T-group name
                                String tGroupName = clearStringQuotes(fields[i++]).intern();
                                //Column 13: F-group name (F_UNCLASSIFIED denotes that domain has not been assigned to an F-group)
                                //Contents changed in version 1.3
                                String fGroupName = clearStringQuotes(fields[i++]).intern();

                                //Column 14: Domain assembly status (if domain is member of assembly, partners' ecod domain ids listed)
                                Long assemblyId = null;
                                String assemblyStr = fields[i++];
                                if(assemblyStr.equals(NOT_DOMAIN_ASSEMBLY)) {
                                    assemblyId = uid;
                                } else if(assemblyStr.equals(IS_DOMAIN_ASSEMBLY) ) {
                                    // Deprecated marker without an id: assemblyId stays null
                                    if(warnIsDomainAssembly > 1 || warnIsDomainAssembly < 0) {
                                        logger.info("Deprecated 'IS_DOMAIN_ASSEMBLY' value ignored in line {}.",lineNum);
                                        if(warnIsDomainAssembly > 0) {
                                            warnIsDomainAssembly--;
                                        }
                                    } else if(warnIsDomainAssembly == 1) {
                                        logger.info("Deprecated 'IS_DOMAIN_ASSEMBLY' value ignored in line {}. Not printing future similar warnings.",lineNum);
                                        warnIsDomainAssembly = 0;
                                    }
                                } else {
                                    assemblyId = Long.parseLong(assemblyStr);
                                }

                                //Column 15: Comma-separated value list of non-polymer entities within 4 A of at least one residue of domain
                                String ligandStr = fields[i++];
                                Set<String> ligands = null;
                                if( ligandStr.equals("NO_LIGANDS_4A") || ligandStr.isEmpty() ) {
                                    ligands = Collections.emptySet();
                                } else {
                                    String[] ligSplit = ligandStr.split(",");
                                    ligands = new LinkedHashSet<String>(ligSplit.length);
                                    for(String s : ligSplit) {
                                        ligands.add(s.intern());
                                    }
                                }


                                EcodDomain domain = new EcodDomain(uid, domainId, manual, xGroup, hGroup, tGroup, fGroup,pdbId, chainId, range, seqId, architectureName, xGroupName, hGroupName, tGroupName, fGroupName, assemblyId, ligands);
                                domainsList.add(domain);
                            } catch(NumberFormatException e) {
                                logger.warn("Error in ECOD parsing at line "+lineNum,e);
                            }
                        } else {
                            if(warnNumberOfFields > 1 || warnNumberOfFields < 0) {
                                logger.warn("Unexpected number of fields in line {}.",lineNum);
                                if(warnNumberOfFields > 0) {
                                    warnNumberOfFields--;
                                }
                            } else if(warnNumberOfFields == 1) {
                                logger.warn("Unexpected number of fields in line {}. Not printing future similar warnings",lineNum);
                                warnNumberOfFields = 0;
                            }
                        }
                    }

                    line = in.readLine();
                    lineNum++;
                }
                if(this.version == null) {
                    logger.info("Parsed {} ECOD domains",domainsList.size());
                } else {
                    logger.info("Parsed {} ECOD domains from version {}",domainsList.size(),this.version);
                }


                this.domains = Collections.unmodifiableList( domainsList );

            } finally {
                if(in != null) {
                    in.close();
                }
            }
        }

        /**
         * Removes one leading and one trailing double quote, if present.
         */
        private static String clearStringQuotes(String name) {
            if ( name.startsWith("\"")) {
                name = name.substring(1);
            }

            if ( name.endsWith("\"")) {
                name = name.substring(0,name.length()-1);
            }

            return name;
        }

        /**
         * @return a list of all EcodDomains
         */
        public List<EcodDomain> getDomains() {
            return domains;
        }

        /**
         * @return the version for this file, or null if none was parsed
         */
        public String getVersion() {
            return version;
        }
    }


    /**
     * Describes this installation by cache location and version.
     * Note that this calls {@link #getVersion()}, which downloads and parses
     * the domains file if that has not happened yet.
     */
    @Override
    public String toString() {
        String version = null;
        try {
            version = getVersion();
        } catch (IOException e) {
            // For parsing errors, use the requested version
            version = requestedVersion;
        }

        return "EcodInstallation [cacheLocation=" + cacheLocation
                + ", version=" + version + "]";
    }

    /**
     * Parses the domains file given as the single argument and prints the
     * number of domains and the identifiers of the first ten.
     * @param args exactly one element: the path of an ECOD domains file
     */
    public static void main(String[] args) {
        if( args.length!= 1) {
            System.out.println("usage: ecod_domains.txt");
            System.exit(1); return;
        }

        String filename = args[0];

        try {
            EcodParser parser = new EcodParser(filename);

            List<EcodDomain> domains = parser.getDomains();

            System.out.format("Found %d ECOD domains.%n",domains.size());

            System.out.println("First 10 domains:");
            int i = 0;
            for(EcodDomain d: domains) {
                if( i>=10) break;

                System.out.println(d.getDomainId());
                i++;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}