001/*
002 * BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 */
020
021package org.biojava.nbio.structure.ecod;
022
023import java.io.BufferedReader;
024import java.io.File;
025import java.io.FileReader;
026import java.io.IOException;
027import java.io.Reader;
028import java.net.MalformedURLException;
029import java.net.URL;
030import java.util.ArrayList;
031import java.util.Calendar;
032import java.util.Collections;
033import java.util.Date;
034import java.util.HashMap;
035import java.util.LinkedHashSet;
036import java.util.LinkedList;
037import java.util.List;
038import java.util.Map;
039import java.util.Set;
040import java.util.concurrent.locks.ReadWriteLock;
041import java.util.concurrent.locks.ReentrantReadWriteLock;
042import java.util.regex.Matcher;
043import java.util.regex.Pattern;
044
045import org.biojava.nbio.structure.align.util.UserConfiguration;
046import org.biojava.nbio.core.util.FileDownloadUtils;
047import org.slf4j.Logger;
048import org.slf4j.LoggerFactory;
049
050/**
051 * Provides access to the Evolutionary Classification of Protein Domains (ECOD).
052 *
053 * The preferred mechanism for obtaining instances of this class is through the
054 * {@link EcodFactory} class.
055 *
056 * Reference:
 * H. Cheng, R. D. Schaeffer, Y. Liao, L. N. Kinch, J. Pei, S. Shi, B. H.
058 *   Kim, N. V. Grishin. (2014) ECOD: An evolutionary classification of protein
059 *   domains. PLoS Comput Biol 10(12): e1003926.
060 * http://prodata.swmed.edu/ecod/
061 *
062 * @author Spencer Bliven
063 *
064 */
065public class EcodInstallation implements EcodDatabase {
066        private static final Logger logger = LoggerFactory.getLogger(EcodInstallation.class);
067
068        public static final String DEFAULT_VERSION = "latest";
069        private static final String DOMAINS_FILENAME_FORMAT = "ecod.%s.domains.txt";
070
071        public static final String ECOD_URL = "http://prodata.swmed.edu";
072        public static final String DOMAINS_PATH = "/ecod/distributions/";
073
        // ECOD identifiers are e<pdbID><chain><domain>.
        // Chain and domain can both be multi-letter (e.g. e2q7zA10)
076        public static final Pattern ECOD_RE = Pattern.compile("^e(....).+\\d+$");
077
078
079        private String cacheLocation;
080        private String requestedVersion; // version requested, e.g. "latest". Used for the paths
081        private String parsedVersion; // actual version parsed
082
083        // lock to prevent multiple threads from downloading simultaneously
084        // Should hold the lock when reading/writing allDomains or domainMap
085        private ReadWriteLock domainsFileLock;
086        private List<EcodDomain> allDomains;
087        private Map<String,List<EcodDomain>> domainMap;//PDB ID -> domains, lazily constructed from allDomains
088
089        private String url;
090
091        // Frequency of ECOD updates, in days. If non-null, redownloads "latest" if older than this.
092        private Integer updateFrequency = 14;
093        
094        /**
095         * Use EcodFactory to create instances. The instantiation of multiple
096         * installations at the same path can lead to race conditions when downloading
097         * files.
098         * @param cacheLocation Location to save files, typically from the PDB_CACHE_DIR parameter
099         * @param requestedVersion ECOD requestedVersion to fetch
100         */
101        public EcodInstallation(String cacheLocation, String version) {
102                domainsFileLock = new ReentrantReadWriteLock();
103
104                this.cacheLocation = cacheLocation;
105
106                this.requestedVersion = version;
107                this.url = ECOD_URL;
108
109                allDomains = null; // null signals it needs to be parsed
110                domainMap = null; // null signals it needs to be constructed from allDomains
111        }
112
113        /**
114         * @see EcodFactory#getEcodDatabase()
115         */
116        public EcodInstallation() {
117                this( new UserConfiguration().getCacheFilePath(), DEFAULT_VERSION );
118        }
        // Convenience constructor, currently disabled:
        // public EcodInstallation(String cacheLocation) {
        //         this( cacheLocation, DEFAULT_VERSION );
        // }
123
124        /**
125         * Get a list of all ECOD domains for a particular PDB ID
126         * @param pdbId
127         * @return the list of domains, or null if no matching domains were found
128         * @throws IOException
129         */
130        @Override
131        public List<EcodDomain> getDomainsForPdb(String pdbId) throws IOException {
132                domainsFileLock.readLock().lock();
133                try {
134                        logger.trace("LOCK readlock");
135                        while( domainMap == null ) {
136                                // unlock to allow ensureDomainsFileInstalled to get the write lock
137                                logger.trace("UNLOCK readlock");
138                                domainsFileLock.readLock().unlock();
139                                indexDomains();
140                                domainsFileLock.readLock().lock();
141                                logger.trace("LOCK readlock");
142                        }
143
144                        if(pdbId != null)
145                                pdbId = pdbId.toLowerCase();
146                        List<EcodDomain> doms = domainMap.get(pdbId);
147                        if(doms == null) {
148                                return null;
149                        }
150                        // Deep clone
151                        List<EcodDomain> clonedDoms = new ArrayList<EcodDomain>(doms.size());
152                        for(EcodDomain d : doms) {
153                                clonedDoms.add( new EcodDomain(d) );
154                        }
155                        return clonedDoms;
156                } finally {
157                        logger.trace("UNLOCK readlock");
158                        domainsFileLock.readLock().unlock();
159                }
160        }
161
162        /**
163         * Get a list of domains within a particular level of the hierarchy
164         * @param hierarchy A dot-separated list giving the X-group, H-group, and/or
165         *  T-group (e.g. "1.1" for all members of the RIFT-related H-group)
166         * @return
167         * @throws IOException
168         */
169        @Override
170        public List<EcodDomain> filterByHierarchy(String hierarchy) throws IOException {
171                String[] xhtGroup = hierarchy.split("\\.");
172                Integer xGroup = xhtGroup.length>0 ? Integer.parseInt(xhtGroup[0]) : null;
173                Integer hGroup = xhtGroup.length>1 ? Integer.parseInt(xhtGroup[1]) : null;
174                Integer tGroup = xhtGroup.length>2 ? Integer.parseInt(xhtGroup[2]) : null;
175
176                List<EcodDomain> filtered = new ArrayList<EcodDomain>();
177                for(EcodDomain d: getAllDomains()) {
178                        boolean match = true;
179                        if(xhtGroup.length>0) {
180                                match = match && xGroup.equals(d.getXGroup());
181                        }
182                        if(xhtGroup.length>1) {
183                                match = match && hGroup.equals(d.getHGroup());
184                        }
185                        if(xhtGroup.length>2) {
186                                match = match && tGroup.equals(d.getTGroup());
187                        }
188                        if(xhtGroup.length>3) {
189                                logger.warn("Ignoring unexpected additional parts of ECOD {}",hierarchy);
190                        }
191                        if(match) {
192                                filtered.add(d);
193                        }
194                }
195                return filtered;
196        }
197
198        /**
199         * Get a particular ECOD domain by the domain ID (e.g. "e4hhbA1")
200         * @param ecodId
201         * @return
202         * @throws IOException
203         */
204        @Override
205        public EcodDomain getDomainsById(String ecodId) throws IOException {
206                if(ecodId == null || ecodId.isEmpty()) {
207                        return null;
208                }
209
210                Matcher match = ECOD_RE.matcher(ecodId);
211                String pdbId = null;
212                if( match.matches() )
213                        pdbId = match.group(1);
214                List<EcodDomain> doms = getDomainsForPdb(pdbId);
215                if(doms == null) {
216                        logger.debug("Null domains for {} from {}",pdbId,ecodId);
217                        return null;
218                }
219                logger.debug("Got {} domains from {}",doms.size(),pdbId);
220                for(EcodDomain d: doms) {
221                        if(ecodId.equals(d.getDomainId())) {
222                                return d;
223                        }
224                }
225                return null;
226        }
227
228        /**
229         * Get all ECOD domains
230         * @return
231         * @throws IOException
232         */
233        @Override
234        public List<EcodDomain> getAllDomains() throws IOException {
235                domainsFileLock.readLock().lock();
236                logger.trace("LOCK readlock");
237                try {
238                        while( allDomains == null) {
239                                // unlock to allow ensureDomainsFileInstalled to get the write lock
240                                logger.trace("UNLOCK readlock");
241                                domainsFileLock.readLock().unlock();
242                                ensureDomainsFileInstalled();
243                                domainsFileLock.readLock().lock();
244                                logger.trace("LOCK readlock");
245                        }
246                        return allDomains;
247                } finally {
248                        logger.trace("UNLOCK readlock");
249                        domainsFileLock.readLock().unlock();
250                }
251
252        }
253
254        /**
255         * Clears all domains, requiring the file to be reparsed for subsequent accesses
256         */
257        public void clear() {
258                domainsFileLock.writeLock().lock();
259                logger.trace("LOCK writelock");
260                allDomains = null;
261                domainMap = null;
262                logger.trace("UNLOCK writelock");
263                domainsFileLock.writeLock().unlock();
264        }
265        /**
266         * Return the ECOD version, as parsed from the file.
267         *
268         * Note that this may differ from the version requested in the constructor
269         * for the special case of "latest"
270         * @return the ECOD version
271         * @throws IOException If an error occurs while downloading or parsing the file
272         */
273        @Override
274        public String getVersion() throws IOException {
275                ensureDomainsFileInstalled();
276
277                if( parsedVersion == null) {
278                        return requestedVersion;
279                }
280                return parsedVersion;
281        }
282
283        /**
284         * Get the top-level ECOD server URL. Defaults to "http://prodata.swmed.edu"
285         * @return the url to the ecod server
286         */
287        public String getUrl() {
288                return url;
289        }
290
291        /**
292         * Specify a different mirror for the ECOD server.
293         * @param urlFormat the urlFormat to set
294         */
295        public void setUrl(String url) {
296                this.url = url;
297        }
298
299        /**
300         * Get the location of the cache directory (usually set to the PDB_CACHE_DIR
301         * property). ECOD files will be downloaded to this directory
302         * @return
303         */
304        public String getCacheLocation() {
305                return cacheLocation;
306        }
307        /**
308         * Set an alternate download location for files
309         * @param cacheLocation
310         */
311        public void setCacheLocation(String cacheLocation) {
312                if(cacheLocation.equals(this.cacheLocation)) {
313                        return; //no change
314                }
315                // update location
316                domainsFileLock.writeLock().lock();
317                logger.trace("LOCK writelock");
318                this.cacheLocation = cacheLocation;
319                logger.trace("UNLOCK writelock");
320                domainsFileLock.writeLock().unlock();
321        }
322
323        /**
324         * Blocks until ECOD domains file has been downloaded and parsed.
325         *
326         * This may be useful in multithreaded environments.
327         * @throws IOException
328         */
329        // Populates allDomains
330        public void ensureDomainsFileInstalled() throws IOException{
331                // Quick check for availability
332                domainsFileLock.readLock().lock();
333                logger.trace("LOCK readlock");
334                try {
335                        if( allDomains != null ) {
336                                return;
337                        }
338                } finally {
339                        logger.trace("UNLOCK readlock");
340                        domainsFileLock.readLock().unlock();
341                }
342
343                // Download domains
344                domainsFileLock.writeLock().lock();
345                logger.trace("LOCK writelock");
346                try {
347                        if( !domainsAvailable() ) {
348                                downloadDomains();
349                        }
350                        parseDomains();
351                } finally {
352                        logger.trace("UNLOCK writelock");
353                        domainsFileLock.writeLock().unlock();
354                }
355        }
356
357        /**
358         * Checks that the domains file has been downloaded
359         * @return
360         */
361        private boolean domainsAvailable() {
362                domainsFileLock.readLock().lock();
363                logger.trace("LOCK readlock");
364                try {
365                        File f = getDomainFile();
366
367                        if (!f.exists() || f.length() <= 0 )
368                                return false;
369                        
370                        // Re-download old copies of "latest"
371                        if(updateFrequency != null && requestedVersion == DEFAULT_VERSION ) {
372                                long mod = f.lastModified();
373                                // Time of last update
374                                Date lastUpdate = new Date();
375                                Calendar cal = Calendar.getInstance();
376                                cal.setTime(lastUpdate);
377                                cal.add(Calendar.DAY_OF_WEEK, -updateFrequency);
378                                long updateTime = cal.getTimeInMillis();
379                                // Check if file predates last update
380                                if( mod < updateTime ) {
381                                        logger.info("{} is out of date.",f);
382                                        return false;
383                                }
384                        }
385                        return true;
386                } finally {
387                        logger.trace("UNLOCK readlock");
388                        domainsFileLock.readLock().unlock();
389                }
390        }
391
392        /**
393         * Downloads the domains file, overwriting any existing file
394         * @throws IOException
395         */
396        private void downloadDomains() throws IOException {
397                domainsFileLock.writeLock().lock();
398                logger.trace("LOCK writelock");
399                try {
400                        URL domainsURL = new URL( url + DOMAINS_PATH + getDomainFilename());
401                        File localFile = getDomainFile();
402
403                        logger.info("Downloading {} to: {}",domainsURL, localFile);
404                        FileDownloadUtils.downloadFile(domainsURL, localFile);
405                } catch (MalformedURLException e) {
406                        logger.error("Malformed url: "+ url + DOMAINS_PATH + getDomainFilename(),e);
407                } finally {
408                        logger.trace("UNLOCK writelock");
409                        domainsFileLock.writeLock().unlock();
410                }
411        }
412
413        /**
414         * Basename for the domains file with the current requestedVersion.
415         * @return
416         */
417        private String getDomainFilename() {
418                return  String.format(DOMAINS_FILENAME_FORMAT,requestedVersion);
419        }
420
421        /**
422         * Local location for the domain file
423         * @return
424         */
425        private File getDomainFile() {
426                return new File(getCacheLocation(),getDomainFilename());
427        }
428
429        /**
430         * The expected ECOD update frequency determines whether the version
431         * "latest" should be re-downloaded
432         * @return the expected ECOD update frequency, in days
433         */
434        public Integer getUpdateFrequency() {
435                return updateFrequency;
436        }
437
438        /**
439         * The "latest" version will be re-downloaded if it is older than
440         * {@link #getUpdateFrequency()} days. Setting this to null disables
441         * re-downloading (delete $PDB_CACHE_DIR/ecod.latest.domains.txt manually
442         * to force updating). Setting to 0 will force downloading for every
443         * program execution.
444         * @param updateFrequency the updateFrequency to set
445         */
446        public void setUpdateFrequency(Integer updateFrequency) {
447                this.updateFrequency = updateFrequency;
448        }
449
450        /**
451         * Parses the domains from the local file
452         * @throws IOException
453         */
454        private void parseDomains() throws IOException {
455                domainsFileLock.writeLock().lock();
456                logger.trace("LOCK writelock");
457                try {
458                        EcodParser parser = new EcodParser(getDomainFile());
459                        allDomains = parser.getDomains();
460                        parsedVersion = parser.getVersion();
461                } finally {
462                        logger.trace("UNLOCK writelock");
463                        domainsFileLock.writeLock().unlock();
464                }
465        }
466
467        /**
468         * Populates domainMap from allDomains
469         * @throws IOException
470         */
471        private void indexDomains() throws IOException {
472                domainsFileLock.writeLock().lock();
473                logger.trace("LOCK writelock");
474                try {
475                        if( allDomains == null) {
476                                ensureDomainsFileInstalled();
477                        }
478
479                        // Leave enough space for all PDBs as of 2015
480                        domainMap = new HashMap<String, List<EcodDomain>>((int) (150000/.85),.85f);
481
482                        // Index with domainMap
483                        for(EcodDomain d : allDomains) {
484                                // Get the PDB ID, either directly or from the domain ID
485                                String pdbId = d.getPdbId();
486                                if( pdbId == null ) {
487                                        String ecodId = d.getDomainId();
488                                        if( ecodId != null && !ecodId.isEmpty() ) {
489                                                Matcher match = ECOD_RE.matcher(ecodId);
490                                                pdbId = match.group(1);
491                                        }
492                                }
493
494                                // Add current domain to the map
495                                List<EcodDomain> currDomains;
496                                if( domainMap.containsKey(pdbId) ) {
497                                        currDomains = domainMap.get(pdbId);
498                                } else {
499                                        currDomains = new LinkedList<EcodDomain>();
500                                        domainMap.put(pdbId,currDomains);
501                                }
502                                currDomains.add(d);
503                        }
504                } finally {
505                        logger.trace("UNLOCK writelock");
506                        domainsFileLock.writeLock().unlock();
507                }
508
509        }
510
511
512        public static class EcodParser {
513                /*
514Version Notes
515
516Current version (1.4) contains the following columns:
517
518Column 1: ECOD uid - internal domain unique identifier
519Column 2: ECOD domain id - domain identifier
520Column 3: ECOD representative status - manual (curated) or automated nonrep
Column 4: ECOD hierarchy identifier - [X-group].[H-group].[T-group].[F-group]
522        * In develop45-66 these also include single numbers in the range 1-265
523Column 5: PDB identifier
524Column 6: Chain identifier (note: case-sensitive)
525Column 7: PDB residue number range
526        * These are sometimes incorrect up to at least develop124. Examples are:
527          e4lxaA2 (should be A:184-385), e4lxmC3 (should be C:46P-183)
528Column 8: seq_id number range (based on internal PDB indices)
529Column 9: Architecture name
530Column 10: X-group name
531Column 11: H-group name
532Column 12: T-group name
533Column 13: F-group name (F_UNCLASSIFIED denotes that domain has not been assigned to an F-group)
534Column 14: Domain assembly status (if domain is member of assembly, partners' ecod domain ids listed)
535Column 15: Comma-separated value list of non-polymer entities within 4 A of at least one residue of domain
536
537Notes older versions:
538changelog:
539v1.0 - original version (8/04/2014)
540v1.1 - added rep/nonrep data (1/15/2015)
541v1.2 - added f-group identifiers to fasta file, domain description file. ECODf identifiers now used when available for F-group name.
542        Domain assemblies now represented by assembly uid in domain assembly status.
543v1.4 - added seqid_range and headers (develop101)
544                 */
545
546                /** String for unclassified F-groups */
547                public static final String F_UNCLASSIFIED = "F_UNCLASSIFIED";
548                /** String for single-domain assemblies */
549                public static final String NOT_DOMAIN_ASSEMBLY = "NOT_DOMAIN_ASSEMBLY";
550                /** Deprecated way of indicating there is an assembly. replaced by the assembly id */
551                public static final String IS_DOMAIN_ASSEMBLY = "IS_DOMAIN_ASSEMBLY";
552                /** Indicates a manual representative */
553                public static final String IS_REPRESENTATIVE = "MANUAL_REP";
554                /** Indicates not a manual representative */
555                public static final String NOT_REPRESENTATIVE = "AUTO_NONREP";
556
557                private List<EcodDomain> domains;
558                private String version;
559
560                public EcodParser(String filename) throws IOException {
561                        this(new File(filename));
562                }
563                public EcodParser(File file) throws IOException {
564                        this(new FileReader(file));
565                }
566                public EcodParser(Reader reader) throws IOException {
567                        this(new BufferedReader(reader));
568                }
569                public EcodParser(BufferedReader reader) throws IOException {
570                        version = null;
571                        parse(reader);
572                }
573
574                private void parse(BufferedReader in) throws IOException {
575                        try {
576                                // Allocate plenty of space for ECOD as of 2015
577                                ArrayList<EcodDomain> domainsList = new ArrayList<EcodDomain>(500000);
578
579                                Pattern versionRE = Pattern.compile("^\\s*#.*ECOD\\s*version\\s+(\\S+).*");
580                                Pattern commentRE = Pattern.compile("^\\s*#.*");
581
582                                // prevent too many warnings; negative numbers print all warnings
583                                int warnIsDomainAssembly = 1;
584                                int warnHierarchicalFormat = 5;
585                                int warnNumberOfFields = 10;
586
587                                String line = in.readLine();
588                                int lineNum = 1;
589                                while( line != null ) {
590                                        // Check for requestedVersion string
591                                        Matcher match = versionRE.matcher(line);
592                                        if(match.matches()) {
593                                                // special requestedVersion comment
594                                                this.version = match.group(1);
595                                        } else {
596                                                match = commentRE.matcher(line);
597                                                if(match.matches()) {
598                                                        // ignore comments
599                                                } else {
600                                                        // data line
601                                                        String[] fields = line.split("\t");
602                                                        if( fields.length == 13 || fields.length == 14 || fields.length == 15) {
603                                                                try {
604                                                                        int i = 0; // field number, to allow future insertion of fields
605
606                                                                        //Column 1: ECOD uid - internal domain unique identifier
607                                                                        Long uid = Long.parseLong(fields[i++]);
608                                                                        //Column 2: ECOD domain id - domain identifier
609                                                                        String domainId = fields[i++];
610
611                                                                        //Column 3: ECOD representative status - manual (curated) or automated nonrep
612                                                                        // Manual column may be missing in version 1.0 files
613                                                                        Boolean manual = null;
614                                                                        if( fields.length >= 14) {
615                                                                                String manualString = fields[i++];
616                                                                                if(manualString.equalsIgnoreCase(IS_REPRESENTATIVE)) {
617                                                                                        manual = true;
618                                                                                } else if(manualString.equalsIgnoreCase(NOT_REPRESENTATIVE)) {
619                                                                                        manual = false;
620                                                                                } else {
621                                                                                        logger.warn("Unexpected value for manual field: {} in line {}",manualString,lineNum);
622                                                                                }
623                                                                        }
624
625                                                                        //Column 4: ECOD hierachy identifier - [X-group].[H-group].[T-group].[F-group]
626                                                                        // hierarchical field, e.g. "1.1.4.1"
627                                                                        String[] xhtGroup = fields[i++].split("\\.");
628                                                                        if(xhtGroup.length < 3 || 4 < xhtGroup.length) {
629                                                                                if(warnHierarchicalFormat > 1) {
630                                                                                        logger.warn("Unexpected format for hierarchical field \"{}\" in line {}",fields[i-1],lineNum);
631                                                                                        warnHierarchicalFormat--;
632                                                                                } else if(warnHierarchicalFormat != 0) {
633                                                                                        logger.warn("Unexpected format for hierarchical field \"{}\" in line {}. Not printing future similar warnings.",fields[i-1],lineNum);
634                                                                                        warnHierarchicalFormat--;
635                                                                                }
636                                                                        }
637                                                                        Integer xGroup = xhtGroup.length>0 ? Integer.parseInt(xhtGroup[0]) : null;
638                                                                        Integer hGroup = xhtGroup.length>1 ? Integer.parseInt(xhtGroup[1]) : null;
639                                                                        Integer tGroup = xhtGroup.length>2 ? Integer.parseInt(xhtGroup[2]) : null;
640                                                                        Integer fGroup = xhtGroup.length>3 ? Integer.parseInt(xhtGroup[3]) : null;
641
642                                                                        //Column 5: PDB identifier
643                                                                        String pdbId = fields[i++];
644                                                                        //Column 6: Chain identifier (note: case-sensitive)
645                                                                        String chainId = fields[i++];
646                                                                        //Column 7: PDB residue number range
647                                                                        String range = fields[i++];
648
649                                                                        //Column 8: seq_id number range (based on internal PDB indices)
650                                                                        //Added in version 1.4
651                                                                        String seqId = null;
652                                                                        if( fields.length >= 15) {
653                                                                                seqId = fields[i++];
654                                                                        }
655
656                                                                        //Column 9: Architecture name
657                                                                        // Intern strings likely to be shared by many domains
658                                                                        String architectureName = fields[i++].intern();
659                                                                        //Column 10: X-group name
660                                                                        String xGroupName = fields[i++].intern();
661                                                                        //Column 11: H-group name
662                                                                        String hGroupName = fields[i++].intern();
663                                                                        //Column 12: T-group name
664                                                                        String tGroupName = fields[i++].intern();
665                                                                        //Column 13: F-group name (F_UNCLASSIFIED denotes that domain has not been assigned to an F-group)
666                                                                        //Contents changed in version 1.3
667                                                                        String fGroupName = fields[i++].intern();
668
669
670                                                                        hGroupName = clearStringQuotes(hGroupName);
671                                                                        tGroupName = clearStringQuotes(tGroupName);
672                                                                        fGroupName = clearStringQuotes(fGroupName);
673                                                                        xGroupName = clearStringQuotes(xGroupName);
674
675                                                                        //Column 14: Domain assembly status (if domain is member of assembly, partners' ecod domain ids listed)
676                                                                        //Column 15: Comma-separated value list of non-polymer entities within 4 A of at least one residue of domain
677                                                                        Long assemblyId = null;
678                                                                        String assemblyStr = fields[i++];
679                                                                        if(assemblyStr.equals(NOT_DOMAIN_ASSEMBLY)) {
680                                                                                assemblyId = uid;
681                                                                        } else if(assemblyStr.equals("IS_DOMAIN_ASSEMBLY") ) {
682                                                                                if(warnIsDomainAssembly > 1) {
683                                                                                        logger.info("Deprecated 'IS_DOMAIN_ASSEMBLY' value ignored in line {}.",lineNum);
684                                                                                        warnIsDomainAssembly--;
685                                                                                } else if(warnIsDomainAssembly == 0) {
686                                                                                        logger.info("Deprecated 'IS_DOMAIN_ASSEMBLY' value ignored in line {}. Not printing future similar warnings.",lineNum);
687                                                                                        warnIsDomainAssembly--;
688                                                                                }
689                                                                                //assemblyId = null;
690                                                                        } else {
691                                                                                assemblyId = Long.parseLong(assemblyStr);
692                                                                        }
693
694                                                                        String ligandStr = fields[i++];
695                                                                        Set<String> ligands = null;
696                                                                        if( ligandStr.equals("NO_LIGANDS_4A") || ligandStr.isEmpty() ) {
697                                                                                ligands = Collections.emptySet();
698                                                                        } else {
699                                                                                String[] ligSplit = ligandStr.split(",");
700                                                                                ligands = new LinkedHashSet<String>(ligSplit.length);
701                                                                                for(String s : ligSplit) {
702                                                                                        ligands.add(s.intern());
703                                                                                }
704                                                                        }
705
706
707                                                                        EcodDomain domain = new EcodDomain(uid, domainId, manual, xGroup, hGroup, tGroup, fGroup,pdbId, chainId, range, seqId, architectureName, xGroupName, hGroupName, tGroupName, fGroupName, assemblyId, ligands);
708                                                                        domainsList.add(domain);
709                                                                } catch(NumberFormatException e) {
710                                                                        logger.warn("Error in ECOD parsing at line "+lineNum,e);
711                                                                }
712                                                        } else {
713                                                                if(warnNumberOfFields > 1) {
714                                                                        logger.warn("Unexpected number of fields in line {}.",lineNum);
715                                                                        warnNumberOfFields--;
716                                                                } else if(warnNumberOfFields == 0) {
717                                                                        logger.warn("Unexpected number of fields in line {}. Not printing future similar warnings",lineNum);
718                                                                        warnIsDomainAssembly--;
719                                                                }
720                                                        }
721                                                }
722                                        }
723
724                                        line = in.readLine();
725                                        lineNum++;
726                                }
727                                if(this.version == null)
728                                        logger.info("Parsed {} ECOD domains",domainsList.size());
729                                else
730                                        logger.info("Parsed {} ECOD domains from version {}",domainsList.size(),this.version);
731
732
733                                this.domains = Collections.unmodifiableList( domainsList );
734
735                        } finally {
736                                if(in != null) {
737                                        in.close();
738                                }
739                        }
740                }
741
742                private String clearStringQuotes(String name) {
743                        if ( name.startsWith("\""))
744                                name = name.substring(1);
745
746                        if ( name.endsWith("\""))
747                                name = name.substring(0,name.length()-1);
748
749                        return name;
750                }
751
752                /**
753                 * @return a list of all EcodDomains
754                 */
755                public List<EcodDomain> getDomains() {
756                        return domains;
757                }
758
759                /**
760                 * @return the requestedVersion for this file, or null if none was parsed
761                 */
762                public String getVersion() {
763                        return version;
764                }
765        }
766
767
768        @Override
769        public String toString() {
770                String version = null;
771                try {
772                        version = getVersion();
773                } catch (IOException e) {
774                        // For parsing errors, use the requested version
775                        version = requestedVersion;
776                }
777
778                return "EcodInstallation [cacheLocation=" + cacheLocation
779                                + ", version=" + version + "]";
780        }
781
782        public static void main(String[] args) {
783                if( args.length!= 1) {
784                        System.out.println("usage: ecod_domains.txt");
785                        System.exit(1); return;
786                }
787
788                String filename = args[0];
789
790                try {
791                        EcodParser parser = new EcodParser(filename);
792
793                        List<EcodDomain> domains = parser.getDomains();
794
795                        System.out.format("Found %d ECOD domains.%n",domains.size());
796
797                        System.out.println("First 10 domains:");
798                        int i = 0;
799                        for(EcodDomain d: domains) {
800                                if( i>10) break;
801
802                                System.out.println(d.getDomainId());
803                                i++;
804                        }
805                } catch (IOException e) {
806                        e.printStackTrace();
807                }
808        }
809}