001/*
002 * BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 */
020
021package org.biojava.nbio.structure.ecod;
022
023import java.io.BufferedReader;
024import java.io.File;
025import java.io.FileReader;
026import java.io.IOException;
027import java.io.Reader;
028import java.net.MalformedURLException;
029import java.net.URL;
030import java.util.ArrayList;
031import java.util.Calendar;
032import java.util.Collections;
033import java.util.Date;
034import java.util.HashMap;
035import java.util.LinkedHashSet;
036import java.util.LinkedList;
037import java.util.List;
038import java.util.Map;
039import java.util.Set;
040import java.util.concurrent.locks.ReadWriteLock;
041import java.util.concurrent.locks.ReentrantReadWriteLock;
042import java.util.regex.Matcher;
043import java.util.regex.Pattern;
044
045import org.biojava.nbio.structure.PdbId;
046import org.biojava.nbio.structure.align.util.UserConfiguration;
047import org.biojava.nbio.core.util.FileDownloadUtils;
048import org.slf4j.Logger;
049import org.slf4j.LoggerFactory;
050
051/**
052 * Provides access to the Evolutionary Classification of Protein Domains (ECOD).
053 *
054 * The preferred mechanism for obtaining instances of this class is through the
055 * {@link EcodFactory} class.
056 *
057 * Reference:
 * H. Cheng, R. D. Schaeffer, Y. Liao, L. N. Kinch, J. Pei, S. Shi, B. H.
059 *   Kim, N. V. Grishin. (2014) ECOD: An evolutionary classification of protein
060 *   domains. PLoS Comput Biol 10(12): e1003926.
061 * http://prodata.swmed.edu/ecod/
062 *
063 * @author Spencer Bliven
064 *
065 */
066public class EcodInstallation implements EcodDatabase {
067        private static final Logger logger = LoggerFactory.getLogger(EcodInstallation.class);
068
069        public static final String DEFAULT_VERSION = "latest";
070        private static final String DOMAINS_FILENAME_FORMAT = "ecod.%s.domains.txt";
071
072        public static final String ECOD_URL = "http://prodata.swmed.edu";
073        public static final String DOMAINS_PATH = "/ecod/distributions/";
074
075        // ECOD identifiers are e<pdbID><chain><domain>, where chain and domain
076        // Chain and domain can both be multi-letter (e.g. e2q7zA10)
077        public static final Pattern ECOD_RE = Pattern.compile("^e(....).+\\d+$");
078
079
080        private String cacheLocation;
081        private String requestedVersion; // version requested, e.g. "latest". Used for the paths
082        private String parsedVersion; // actual version parsed
083
084        // lock to prevent multiple threads from downloading simultaneously
085        // Should hold the lock when reading/writing allDomains or domainMap
086        private ReadWriteLock domainsFileLock;
087        private List<EcodDomain> allDomains;
088        private Map<PdbId,List<EcodDomain>> domainMap;//PDB ID -> domains, lazily constructed from allDomains
089
090        private String url;
091
092        // Frequency of ECOD updates, in days. If non-null, redownloads "latest" if older than this.
093        private Integer updateFrequency = 14;
094
/**
 * Creates an installation that stores its files in the given directory.
 * Nothing is downloaded or parsed here: the domain list and the PDB index
 * both start out as null and are filled in on first use.
 * <p>
 * Use EcodFactory to create instances. The instantiation of multiple
 * installations at the same path can lead to race conditions when downloading
 * files.
 * @param cacheLocation Location to save files, typically from the PDB_CACHE_DIR parameter
 * @param version ECOD version to fetch
 */
public EcodInstallation(String cacheLocation, String version) {
    this.cacheLocation = cacheLocation;
    this.requestedVersion = version;
    this.url = ECOD_URL;

    // null marks the domain list as "needs parsing" and the
    // index as "needs building from allDomains"
    this.allDomains = null;
    this.domainMap = null;

    this.domainsFileLock = new ReentrantReadWriteLock();
}
113
/**
 * Creates an installation of the {@link #DEFAULT_VERSION default version}
 * whose cache directory is the path reported by a newly created
 * {@link UserConfiguration}.
 * @see EcodFactory#getEcodDatabase()
 */
public EcodInstallation() {
    this(new UserConfiguration().getCacheFilePath(), DEFAULT_VERSION);
}
/**
 * Creates an installation of the {@link #DEFAULT_VERSION default version}
 * that stores its files in the given directory.
 * @param cacheLocation Location to save files
 */
public EcodInstallation(String cacheLocation) {
    this( cacheLocation, DEFAULT_VERSION );
}

/**
 * Get a list of all ECOD domains for a particular PDB ID.
 *
 * The PDB index is built on first use, which may require downloading and
 * parsing the domains file. The returned list is newly allocated and holds
 * copies created with the EcodDomain copy constructor.
 * @param id the PDB ID to look up
 * @return the list of domains, or null if no matching domains were found
 *  or if building a {@link PdbId} from id fails with an IllegalArgumentException
 * @throws IOException If an error occurs while downloading or parsing the file
 */
@Override
public List<EcodDomain> getDomainsForPdb(String id) throws IOException {
    // Reject unusable identifiers before touching the (possibly expensive) index
    final PdbId pdbId;
    try {
        pdbId = new PdbId(id);
    } catch (IllegalArgumentException e) {
        return null;
    }

    while( true ) {
        domainsFileLock.readLock().lock();
        logger.trace("LOCK readlock");
        try {
            if( domainMap != null ) {
                List<EcodDomain> doms = domainMap.get(pdbId);
                if(doms == null) {
                    return null;
                }
                // Deep clone
                List<EcodDomain> clonedDoms = new ArrayList<>(doms.size());
                for(EcodDomain d : doms) {
                    clonedDoms.add( new EcodDomain(d) );
                }
                return clonedDoms;
            }
        } finally {
            logger.trace("UNLOCK readlock");
            domainsFileLock.readLock().unlock();
        }

        // The read lock is released at this point, so indexDomains can take
        // the write lock. If it throws, no lock is held and nothing is
        // unlocked twice.
        indexDomains();
    }
}
166
167        /**
168         * Get a list of domains within a particular level of the hierarchy
169         * @param hierarchy A dot-separated list giving the X-group, H-group, and/or
170         *  T-group (e.g. "1.1" for all members of the RIFT-related H-group)
171         * @return
172         * @throws IOException
173         */
174        @Override
175        public List<EcodDomain> filterByHierarchy(String hierarchy) throws IOException {
176                String[] xhtGroup = hierarchy.split("\\.");
177                Integer xGroup = xhtGroup.length>0 ? Integer.parseInt(xhtGroup[0]) : null;
178                Integer hGroup = xhtGroup.length>1 ? Integer.parseInt(xhtGroup[1]) : null;
179                Integer tGroup = xhtGroup.length>2 ? Integer.parseInt(xhtGroup[2]) : null;
180
181                List<EcodDomain> filtered = new ArrayList<>();
182                for(EcodDomain d: getAllDomains()) {
183                        boolean match = true;
184                        if(xhtGroup.length>0) {
185                                match = match && xGroup.equals(d.getXGroup());
186                        }
187                        if(xhtGroup.length>1) {
188                                match = match && hGroup.equals(d.getHGroup());
189                        }
190                        if(xhtGroup.length>2) {
191                                match = match && tGroup.equals(d.getTGroup());
192                        }
193                        if(xhtGroup.length>3) {
194                                logger.warn("Ignoring unexpected additional parts of ECOD {}",hierarchy);
195                        }
196                        if(match) {
197                                filtered.add(d);
198                        }
199                }
200                return filtered;
201        }
202
203        /**
204         * Get a particular ECOD domain by the domain ID (e.g. "e4hhbA1")
205         * @param ecodId
206         * @return
207         * @throws IOException
208         */
209        @Override
210        public EcodDomain getDomainsById(String ecodId) throws IOException {
211                if(ecodId == null || ecodId.isEmpty()) {
212                        return null;
213                }
214
215                Matcher match = ECOD_RE.matcher(ecodId);
216                String pdbId = null;
217                if( match.matches() )
218                        pdbId = match.group(1);
219                List<EcodDomain> doms = getDomainsForPdb(pdbId);
220                if(doms == null) {
221                        logger.debug("Null domains for {} from {}",pdbId,ecodId);
222                        return null;
223                }
224                logger.debug("Got {} domains from {}",doms.size(),pdbId);
225                for(EcodDomain d: doms) {
226                        if(ecodId.equals(d.getDomainId())) {
227                                return d;
228                        }
229                }
230                return null;
231        }
232
233        /**
234         * Get all ECOD domains
235         * @return
236         * @throws IOException
237         */
238        @Override
239        public List<EcodDomain> getAllDomains() throws IOException {
240                domainsFileLock.readLock().lock();
241                logger.trace("LOCK readlock");
242                try {
243                        while( allDomains == null) {
244                                // unlock to allow ensureDomainsFileInstalled to get the write lock
245                                logger.trace("UNLOCK readlock");
246                                domainsFileLock.readLock().unlock();
247                                ensureDomainsFileInstalled();
248                                domainsFileLock.readLock().lock();
249                                logger.trace("LOCK readlock");
250                        }
251                        return allDomains;
252                } finally {
253                        logger.trace("UNLOCK readlock");
254                        domainsFileLock.readLock().unlock();
255                }
256
257        }
258
259        /**
260         * Clears all domains, requiring the file to be reparsed for subsequent accesses
261         */
262        public void clear() {
263                domainsFileLock.writeLock().lock();
264                logger.trace("LOCK writelock");
265                allDomains = null;
266                domainMap = null;
267                logger.trace("UNLOCK writelock");
268                domainsFileLock.writeLock().unlock();
269        }
270        /**
271         * Return the ECOD version, as parsed from the file.
272         *
273         * Note that this may differ from the version requested in the constructor
274         * for the special case of "latest"
275         * @return the ECOD version
276         * @throws IOException If an error occurs while downloading or parsing the file
277         */
278        @Override
279        public String getVersion() throws IOException {
280                ensureDomainsFileInstalled();
281
282                if( parsedVersion == null) {
283                        return requestedVersion;
284                }
285                return parsedVersion;
286        }
287
288        /**
289         * Get the top-level ECOD server URL. Defaults to "http://prodata.swmed.edu"
290         * @return the url to the ecod server
291         */
292        public String getUrl() {
293                return url;
294        }
295
296        /**
297         * Specify a different mirror for the ECOD server.
298         * @param url the urlFormat to set
299         */
300        public void setUrl(String url) {
301                this.url = url;
302        }
303
304        /**
305         * Get the location of the cache directory (usually set to the PDB_CACHE_DIR
306         * property). ECOD files will be downloaded to this directory
307         * @return
308         */
309        public String getCacheLocation() {
310                return cacheLocation;
311        }
312        /**
313         * Set an alternate download location for files
314         * @param cacheLocation
315         */
316        public void setCacheLocation(String cacheLocation) {
317                if(cacheLocation.equals(this.cacheLocation)) {
318                        return; //no change
319                }
320                // update location
321                domainsFileLock.writeLock().lock();
322                logger.trace("LOCK writelock");
323                this.cacheLocation = cacheLocation;
324                logger.trace("UNLOCK writelock");
325                domainsFileLock.writeLock().unlock();
326        }
327
328        /**
329         * Blocks until ECOD domains file has been downloaded and parsed.
330         *
331         * This may be useful in multithreaded environments.
332         * @throws IOException
333         */
334        // Populates allDomains
335        public void ensureDomainsFileInstalled() throws IOException{
336                // Quick check for availability
337                domainsFileLock.readLock().lock();
338                logger.trace("LOCK readlock");
339                try {
340                        if( allDomains != null ) {
341                                return;
342                        }
343                } finally {
344                        logger.trace("UNLOCK readlock");
345                        domainsFileLock.readLock().unlock();
346                }
347
348                // Download domains
349                domainsFileLock.writeLock().lock();
350                logger.trace("LOCK writelock");
351                try {
352                        if( !domainsAvailable() ) {
353                                downloadDomains();
354                        }
355                        parseDomains();
356                } finally {
357                        logger.trace("UNLOCK writelock");
358                        domainsFileLock.writeLock().unlock();
359                }
360        }
361
362        /**
363         * Checks that the domains file has been downloaded
364         * @return
365         */
366        private boolean domainsAvailable() {
367                domainsFileLock.readLock().lock();
368                logger.trace("LOCK readlock");
369                try {
370                        File f = getDomainFile();
371
372                        if (! (f.exists() && FileDownloadUtils.validateFile(f)))
373                                return false;
374
375                        // Re-download old copies of "latest"
376                        if(updateFrequency != null && requestedVersion.equals(DEFAULT_VERSION)) {
377                                long mod = f.lastModified();
378                                // Time of last update
379                                Date lastUpdate = new Date();
380                                Calendar cal = Calendar.getInstance();
381                                cal.setTime(lastUpdate);
382                                cal.add(Calendar.DAY_OF_WEEK, -updateFrequency);
383                                long updateTime = cal.getTimeInMillis();
384                                // Check if file predates last update
385                                if( mod < updateTime ) {
386                                        logger.info("{} is out of date.",f);
387                                        return false;
388                                }
389                        }
390                        return true;
391                } finally {
392                        logger.trace("UNLOCK readlock");
393                        domainsFileLock.readLock().unlock();
394                }
395        }
396
397        /**
398         * Downloads the domains file +/- its validation metadata, overwriting any existing file
399         * @throws IOException in cases of file I/O, including failure to download a healthy (non-corrupted) file.
400         */
401        private void downloadDomains() throws IOException {
402                domainsFileLock.writeLock().lock();
403                logger.trace("LOCK writelock");
404                try {
405                        URL domainsURL = new URL( url + DOMAINS_PATH + getDomainFilename());
406                        File localFile = getDomainFile();
407
408                        logger.info("Downloading {} to: {}",domainsURL, localFile);
409                        FileDownloadUtils.createValidationFiles(domainsURL, localFile, null, FileDownloadUtils.Hash.UNKNOWN);
410                        FileDownloadUtils.downloadFile(domainsURL, localFile);
411                        if(! FileDownloadUtils.validateFile(localFile))
412                                throw new IOException("Downloaded file invalid: "+ localFile);
413                } catch (MalformedURLException e) {
414                        logger.error("Malformed url: "+ url + DOMAINS_PATH + getDomainFilename(),e);
415                } finally {
416                        logger.trace("UNLOCK writelock");
417                        domainsFileLock.writeLock().unlock();
418                }
419        }
420
421        /**
422         * Basename for the domains file with the current requestedVersion.
423         * @return
424         */
425        private String getDomainFilename() {
426                return  String.format(DOMAINS_FILENAME_FORMAT,requestedVersion);
427        }
428
429        /**
430         * Local location for the domain file
431         * @return
432         */
433        private File getDomainFile() {
434                return new File(getCacheLocation(),getDomainFilename());
435        }
436
437        /**
438         * The expected ECOD update frequency determines whether the version
439         * "latest" should be re-downloaded
440         * @return the expected ECOD update frequency, in days
441         */
442        public Integer getUpdateFrequency() {
443                return updateFrequency;
444        }
445
446        /**
447         * The "latest" version will be re-downloaded if it is older than
448         * {@link #getUpdateFrequency()} days. Setting this to null disables
449         * re-downloading (delete $PDB_CACHE_DIR/ecod.latest.domains.txt manually
450         * to force updating). Setting to 0 will force downloading for every
451         * program execution.
452         * @param updateFrequency the updateFrequency to set
453         */
454        public void setUpdateFrequency(Integer updateFrequency) {
455                this.updateFrequency = updateFrequency;
456        }
457
458        /**
459         * Parses the domains from the local file
460         * @throws IOException
461         */
462        private void parseDomains() throws IOException {
463                domainsFileLock.writeLock().lock();
464                logger.trace("LOCK writelock");
465                try {
466                        EcodParser parser = new EcodParser(getDomainFile());
467                        allDomains = parser.getDomains();
468                        parsedVersion = parser.getVersion();
469                } finally {
470                        logger.trace("UNLOCK writelock");
471                        domainsFileLock.writeLock().unlock();
472                }
473        }
474
475        /**
476         * Populates domainMap from allDomains
477         * @throws IOException
478         */
479        private void indexDomains() throws IOException {
480                domainsFileLock.writeLock().lock();
481                logger.trace("LOCK writelock");
482                try {
483                        if( allDomains == null) {
484                                ensureDomainsFileInstalled();
485                        }
486
487                        // Leave enough space for all PDBs as of 2015
488                        domainMap = new HashMap<>((int) (150000/.85),.85f);
489
490                        // Index with domainMap
491                        for(EcodDomain d : allDomains) {
492                                // Get the PDB ID, either directly or from the domain ID
493                                PdbId pdbId = d.getPdbId();
494                                if( pdbId == null ) {
495                                        String ecodId = d.getDomainId();
496                                        if( ecodId != null && !ecodId.isEmpty() ) {
497                                                Matcher match = ECOD_RE.matcher(ecodId);
498                                                pdbId = new PdbId(match.group(1));
499                                        }
500                                }
501
502                                // Add current domain to the map
503                                List<EcodDomain> currDomains;
504                                if( domainMap.containsKey(pdbId) ) {
505                                        currDomains = domainMap.get(pdbId);
506                                } else {
507                                        currDomains = new LinkedList<>();
508                                        domainMap.put(pdbId,currDomains);
509                                }
510                                currDomains.add(d);
511                        }
512                } finally {
513                        logger.trace("UNLOCK writelock");
514                        domainsFileLock.writeLock().unlock();
515                }
516
517        }
518
519
520        public static class EcodParser {
521                /*
522Version Notes
523
524Current version (1.4) contains the following columns:
525
526Column 1: ECOD uid - internal domain unique identifier
527Column 2: ECOD domain id - domain identifier
528Column 3: ECOD representative status - manual (curated) or automated nonrep
Column 4: ECOD hierarchy identifier - [X-group].[H-group].[T-group].[F-group]
530        * In develop45-66 these also include single numbers in the range 1-265
531Column 5: PDB identifier
532Column 6: Chain identifier (note: case-sensitive)
533Column 7: PDB residue number range
534        * These are sometimes incorrect up to at least develop124. Examples are:
535          e4lxaA2 (should be A:184-385), e4lxmC3 (should be C:46P-183)
536Column 8: seq_id number range (based on internal PDB indices)
537Column 9: Architecture name
538Column 10: X-group name
539Column 11: H-group name
540Column 12: T-group name
541Column 13: F-group name (F_UNCLASSIFIED denotes that domain has not been assigned to an F-group)
542Column 14: Domain assembly status (if domain is member of assembly, partners' ecod domain ids listed)
543Column 15: Comma-separated value list of non-polymer entities within 4 A of at least one residue of domain
544
545Notes older versions:
546changelog:
547v1.0 - original version (8/04/2014)
548v1.1 - added rep/nonrep data (1/15/2015)
549v1.2 - added f-group identifiers to fasta file, domain description file. ECODf identifiers now used when available for F-group name.
550        Domain assemblies now represented by assembly uid in domain assembly status.
551v1.4 - added seqid_range and headers (develop101)
552                 */
553
/** String for unclassified F-groups */
public static final String F_UNCLASSIFIED = "F_UNCLASSIFIED";
/** String for single-domain assemblies */
public static final String NOT_DOMAIN_ASSEMBLY = "NOT_DOMAIN_ASSEMBLY";
/** Deprecated way of indicating there is an assembly. replaced by the assembly id */
public static final String IS_DOMAIN_ASSEMBLY = "IS_DOMAIN_ASSEMBLY";
/** Indicates a manual representative */
public static final String IS_REPRESENTATIVE = "MANUAL_REP";
/** Indicates not a manual representative */
public static final String NOT_REPRESENTATIVE = "AUTO_NONREP";

// Parsing results
private List<EcodDomain> domains;
private String version;

/**
 * Parses the domains file with the given name.
 * @param filename path of the file to parse
 * @throws IOException if the file cannot be opened or parsing fails
 */
public EcodParser(String filename) throws IOException {
    this(new File(filename));
}

/**
 * Parses the given domains file, read through a FileReader.
 * @param file the file to parse
 * @throws IOException if the file cannot be opened or parsing fails
 */
public EcodParser(File file) throws IOException {
    this(new FileReader(file));
}

/**
 * Parses domains from the given reader, which is wrapped in a
 * BufferedReader first.
 * @param reader source of the domains data
 * @throws IOException if parsing fails
 */
public EcodParser(Reader reader) throws IOException {
    this(new BufferedReader(reader));
}

/**
 * Parses domains from the given reader. Parsing happens right here in the
 * constructor; the version starts out as null.
 * NOTE(review): whether the reader is closed afterwards depends on
 * parse() - confirm.
 * @param reader source of the domains data
 * @throws IOException if parsing fails
 */
public EcodParser(BufferedReader reader) throws IOException {
    this.version = null;
    parse(reader);
}
581
582                private void parse(BufferedReader in) throws IOException {
583                        try {
584                                // Allocate plenty of space for ECOD as of 2015
585                                ArrayList<EcodDomain> domainsList = new ArrayList<>(500000);
586
587                                Pattern versionRE = Pattern.compile("^\\s*#.*ECOD\\s*version\\s+(\\S+).*");
588                                Pattern commentRE = Pattern.compile("^\\s*#.*");
589
590                                // prevent too many warnings; negative numbers print all warnings
591                                int warnIsDomainAssembly = 1;
592                                int warnHierarchicalFormat = 5;
593                                int warnNumberOfFields = 10;
594
595                                String line = in.readLine();
596                                int lineNum = 1;
597                                while( line != null ) {
598                                        // Check for requestedVersion string
599                                        Matcher match = versionRE.matcher(line);
600                                        if(match.matches()) {
601                                                // special requestedVersion comment
602                                                this.version = match.group(1);
603                                        } else {
604                                                match = commentRE.matcher(line);
605                                                if(match.matches()) {
606                                                        // ignore comments
607                                                } else {
608                                                        // data line
609                                                        String[] fields = line.split("\t");
610                                                        if( fields.length == 13 || fields.length == 14 || fields.length == 15) {
611                                                                try {
612                                                                        int i = 0; // field number, to allow future insertion of fields
613
614                                                                        //Column 1: ECOD uid - internal domain unique identifier
615                                                                        Long uid = Long.parseLong(fields[i++]);
616                                                                        //Column 2: ECOD domain id - domain identifier
617                                                                        String domainId = fields[i++];
618
619                                                                        //Column 3: ECOD representative status - manual (curated) or automated nonrep
620                                                                        // Manual column may be missing in version 1.0 files
621                                                                        Boolean manual = null;
622                                                                        if( fields.length >= 14) {
623                                                                                String manualString = fields[i++];
624                                                                                if(manualString.equalsIgnoreCase(IS_REPRESENTATIVE)) {
625                                                                                        manual = true;
626                                                                                } else if(manualString.equalsIgnoreCase(NOT_REPRESENTATIVE)) {
627                                                                                        manual = false;
628                                                                                } else {
629                                                                                        logger.warn("Unexpected value for manual field: {} in line {}",manualString,lineNum);
630                                                                                }
631                                                                        }
632
633                                                                        //Column 4: ECOD hierachy identifier - [X-group].[H-group].[T-group].[F-group]
634                                                                        // hierarchical field, e.g. "1.1.4.1"
635                                                                        String[] xhtGroup = fields[i++].split("\\.");
636                                                                        if(xhtGroup.length < 3 || 4 < xhtGroup.length) {
637                                                                                if(warnHierarchicalFormat > 1) {
638                                                                                        logger.warn("Unexpected format for hierarchical field \"{}\" in line {}",fields[i-1],lineNum);
639                                                                                        warnHierarchicalFormat--;
640                                                                                } else if(warnHierarchicalFormat != 0) {
641                                                                                        logger.warn("Unexpected format for hierarchical field \"{}\" in line {}. Not printing future similar warnings.",fields[i-1],lineNum);
642                                                                                        warnHierarchicalFormat--;
643                                                                                }
644                                                                        }
645                                                                        Integer xGroup = xhtGroup.length>0 ? Integer.parseInt(xhtGroup[0]) : null;
646                                                                        Integer hGroup = xhtGroup.length>1 ? Integer.parseInt(xhtGroup[1]) : null;
647                                                                        Integer tGroup = xhtGroup.length>2 ? Integer.parseInt(xhtGroup[2]) : null;
648                                                                        Integer fGroup = xhtGroup.length>3 ? Integer.parseInt(xhtGroup[3]) : null;
649
650                                                                        //Column 5: PDB identifier
651                                                                        String pdbId = fields[i++];
652                                                                        //Column 6: Chain identifier (note: case-sensitive)
653                                                                        String chainId = fields[i++];
654                                                                        //Column 7: PDB residue number range
655                                                                        String range = fields[i++];
656
657                                                                        //Column 8: seq_id number range (based on internal PDB indices)
658                                                                        //Added in version 1.4
659                                                                        String seqId = null;
660                                                                        if( fields.length >= 15) {
661                                                                                seqId = fields[i++];
662                                                                        }
663
664                                                                        //Column 9: Architecture name
665                                                                        // Intern strings likely to be shared by many domains
666                                                                        String architectureName = fields[i++].intern();
667                                                                        //Column 10: X-group name
668                                                                        String xGroupName = fields[i++].intern();
669                                                                        //Column 11: H-group name
670                                                                        String hGroupName = fields[i++].intern();
671                                                                        //Column 12: T-group name
672                                                                        String tGroupName = fields[i++].intern();
673                                                                        //Column 13: F-group name (F_UNCLASSIFIED denotes that domain has not been assigned to an F-group)
674                                                                        //Contents changed in version 1.3
675                                                                        String fGroupName = fields[i++].intern();
676
677
678                                                                        hGroupName = clearStringQuotes(hGroupName);
679                                                                        tGroupName = clearStringQuotes(tGroupName);
680                                                                        fGroupName = clearStringQuotes(fGroupName);
681                                                                        xGroupName = clearStringQuotes(xGroupName);
682
683                                                                        //Column 14: Domain assembly status (if domain is member of assembly, partners' ecod domain ids listed)
684                                                                        //Column 15: Comma-separated value list of non-polymer entities within 4 A of at least one residue of domain
685                                                                        Long assemblyId = null;
686                                                                        String assemblyStr = fields[i++];
687                                                                        if(assemblyStr.equals(NOT_DOMAIN_ASSEMBLY)) {
688                                                                                assemblyId = uid;
689                                                                        } else if("IS_DOMAIN_ASSEMBLY".equals(assemblyStr) ) {
690                                                                                if(warnIsDomainAssembly > 1) {
691                                                                                        logger.info("Deprecated 'IS_DOMAIN_ASSEMBLY' value ignored in line {}.",lineNum);
692                                                                                        warnIsDomainAssembly--;
693                                                                                } else if(warnIsDomainAssembly == 0) {
694                                                                                        logger.info("Deprecated 'IS_DOMAIN_ASSEMBLY' value ignored in line {}. Not printing future similar warnings.",lineNum);
695                                                                                        warnIsDomainAssembly--;
696                                                                                }
697                                                                                //assemblyId = null;
698                                                                        } else {
699                                                                                assemblyId = Long.parseLong(assemblyStr);
700                                                                        }
701
702                                                                        String ligandStr = fields[i++];
703                                                                        Set<String> ligands = null;
704                                                                        if( "NO_LIGANDS_4A".equals(ligandStr) || ligandStr.isEmpty() ) {
705                                                                                ligands = Collections.emptySet();
706                                                                        } else {
707                                                                                String[] ligSplit = ligandStr.split(",");
708                                                                                ligands = new LinkedHashSet<>(ligSplit.length);
709                                                                                for(String s : ligSplit) {
710                                                                                        ligands.add(s.intern());
711                                                                                }
712                                                                        }
713
714
715                                                                        EcodDomain domain = new EcodDomain(uid, domainId, manual, xGroup, hGroup, tGroup, fGroup,pdbId, chainId, range, seqId, architectureName, xGroupName, hGroupName, tGroupName, fGroupName, assemblyId, ligands);
716                                                                        domainsList.add(domain);
717                                                                } catch(NumberFormatException e) {
718                                                                        logger.warn("Error in ECOD parsing at line "+lineNum,e);
719                                                                }
720                                                        } else {
721                                                                if(warnNumberOfFields > 1) {
722                                                                        logger.warn("Unexpected number of fields in line {}.",lineNum);
723                                                                        warnNumberOfFields--;
724                                                                } else if(warnNumberOfFields == 0) {
725                                                                        logger.warn("Unexpected number of fields in line {}. Not printing future similar warnings",lineNum);
726                                                                        warnIsDomainAssembly--;
727                                                                }
728                                                        }
729                                                }
730                                        }
731
732                                        line = in.readLine();
733                                        lineNum++;
734                                }
735                                if(this.version == null)
736                                        logger.info("Parsed {} ECOD domains",domainsList.size());
737                                else
738                                        logger.info("Parsed {} ECOD domains from version {}",domainsList.size(),this.version);
739
740
741                                this.domains = Collections.unmodifiableList( domainsList );
742
743                        } finally {
744                                if(in != null) {
745                                        in.close();
746                                }
747                        }
748                }
749
750                private String clearStringQuotes(String name) {
751                        if ( name.startsWith("\""))
752                                name = name.substring(1);
753
754                        if ( name.endsWith("\""))
755                                name = name.substring(0,name.length()-1);
756
757                        return name;
758                }
759
760                /**
761                 * @return a list of all EcodDomains
762                 */
763                public List<EcodDomain> getDomains() {
764                        return domains;
765                }
766
767                /**
768                 * @return the requestedVersion for this file, or null if none was parsed
769                 */
770                public String getVersion() {
771                        return version;
772                }
773        }
774
775
776        @Override
777        public String toString() {
778                String version = null;
779                try {
780                        version = getVersion();
781                } catch (IOException e) {
782                        // For parsing errors, use the requested version
783                        version = requestedVersion;
784                }
785
786                return "EcodInstallation [cacheLocation=" + cacheLocation
787                                + ", version=" + version + "]";
788        }
789
790        public static void main(String[] args) {
791                if( args.length!= 1) {
792                        System.out.println("usage: ecod_domains.txt");
793                        System.exit(1); return;
794                }
795
796                String filename = args[0];
797
798                try {
799                        EcodParser parser = new EcodParser(filename);
800
801                        List<EcodDomain> domains = parser.getDomains();
802
803                        System.out.format("Found %d ECOD domains.%n",domains.size());
804
805                        System.out.println("First 10 domains:");
806                        int i = 0;
807                        for(EcodDomain d: domains) {
808                                if( i>10) break;
809
810                                System.out.println(d.getDomainId());
811                                i++;
812                        }
813                } catch (IOException e) {
814                        e.printStackTrace();
815                }
816        }
817}