001/*
002 * BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 */
020
021package org.biojava.nbio.structure.ecod;
022
023import java.io.BufferedReader;
024import java.io.File;
025import java.io.FileReader;
026import java.io.IOException;
027import java.io.Reader;
028import java.net.MalformedURLException;
029import java.net.URL;
030import java.util.ArrayList;
031import java.util.Calendar;
032import java.util.Collections;
033import java.util.Date;
034import java.util.HashMap;
035import java.util.LinkedHashSet;
036import java.util.LinkedList;
037import java.util.List;
038import java.util.Map;
039import java.util.Set;
040import java.util.concurrent.locks.ReadWriteLock;
041import java.util.concurrent.locks.ReentrantReadWriteLock;
042import java.util.regex.Matcher;
043import java.util.regex.Pattern;
044
045import org.biojava.nbio.structure.PdbId;
046import org.biojava.nbio.structure.align.util.UserConfiguration;
047import org.biojava.nbio.core.util.FileDownloadUtils;
048import org.slf4j.Logger;
049import org.slf4j.LoggerFactory;
050
051/**
052 * Provides access to the Evolutionary Classification of Protein Domains (ECOD).
053 *
054 * The preferred mechanism for obtaining instances of this class is through the
055 * {@link EcodFactory} class.
056 *
057 * Reference:
 * H. Cheng, R. D. Schaeffer, Y. Liao, L. N. Kinch, J. Pei, S. Shi, B. H.
059 *   Kim, N. V. Grishin. (2014) ECOD: An evolutionary classification of protein
060 *   domains. PLoS Comput Biol 10(12): e1003926.
061 * http://prodata.swmed.edu/ecod/
062 *
063 * @author Spencer Bliven
064 *
065 */
066public class EcodInstallation implements EcodDatabase {
        /** Class-wide SLF4J logger. */
        private static final Logger logger = LoggerFactory.getLogger(EcodInstallation.class);

        /** Version name used by the no-argument constructor. */
        public static final String DEFAULT_VERSION = "latest";
        /** Template for the domains file basename; the placeholder is filled with the requested version. */
        private static final String DOMAINS_FILENAME_FORMAT = "ecod.%s.domains.txt";

        /** Default ECOD server; the initial value returned by {@link #getUrl()}. */
        public static final String ECOD_URL = "http://prodata.swmed.edu";
        /** Path appended to the server URL when building the download URL of the domains file. */
        public static final String DOMAINS_PATH = "/ecod/distributions/";

        // ECOD identifiers are e<pdbID><chain><domain>, where the PDB ID is the
        // four characters captured by group 1 of this pattern.
        // Chain and domain can both be multi-letter (e.g. e2q7zA10)
        public static final Pattern ECOD_RE = Pattern.compile("^e(....).+\\d+$");


        private String cacheLocation; // directory holding the downloaded domains file
        private String requestedVersion; // version requested, e.g. "latest". Used for the paths
        private String parsedVersion; // actual version parsed

        // lock to prevent multiple threads from downloading simultaneously
        // Should hold the lock when reading/writing allDomains or domainMap
        private ReadWriteLock domainsFileLock;
        private List<EcodDomain> allDomains; // null until the domains file has been parsed
        private Map<PdbId,List<EcodDomain>> domainMap;//PDB ID -> domains, lazily constructed from allDomains

        private String url; // server prefix used when downloading; starts out as ECOD_URL

        // Frequency of ECOD updates, in days. If non-null, redownloads "latest" if older than this.
        private Integer updateFrequency = 14;
094
095        /**
096         * Use EcodFactory to create instances. The instantiation of multiple
097         * installations at the same path can lead to race conditions when downloading
098         * files.
099         * @param cacheLocation Location to save files, typically from the PDB_CACHE_DIR parameter
100         * @param requestedVersion ECOD requestedVersion to fetch
101         */
102        public EcodInstallation(String cacheLocation, String version) {
103                domainsFileLock = new ReentrantReadWriteLock();
104
105                this.cacheLocation = cacheLocation;
106
107                this.requestedVersion = version;
108                this.url = ECOD_URL;
109
110                allDomains = null; // null signals it needs to be parsed
111                domainMap = null; // null signals it needs to be constructed from allDomains
112        }
113
        /**
         * Creates an installation for the {@link #DEFAULT_VERSION} of ECOD, using
         * the cache path reported by a newly created {@link UserConfiguration}.
         *
         * @see EcodFactory#getEcodDatabase()
         */
        public EcodInstallation() {
                this( new UserConfiguration().getCacheFilePath(), DEFAULT_VERSION );
        }
        /* Disabled convenience constructor, kept for reference:
        public EcodInstallation(String cacheLocation) {
                this( cacheLocation, DEFAULT_VERSION );
        }
        */
124
125        /**
126         * Get a list of all ECOD domains for a particular PDB ID
127         * @param id
128         * @return the list of domains, or null if no matching domains were found
129         * @throws IOException
130         */
131        @Override
132        public List<EcodDomain> getDomainsForPdb(String id) throws IOException {
133                domainsFileLock.readLock().lock();
134                try {
135                        logger.trace("LOCK readlock");
136                        while( domainMap == null ) {
137                                // unlock to allow ensureDomainsFileInstalled to get the write lock
138                                logger.trace("UNLOCK readlock");
139                                domainsFileLock.readLock().unlock();
140                                indexDomains();
141                                domainsFileLock.readLock().lock();
142                                logger.trace("LOCK readlock");
143                        }
144
145                        PdbId pdbId = null;
146                        try {
147                                pdbId = new PdbId(id);
148                        } catch (IllegalArgumentException e) {
149                                return null;
150                        }
151                        List<EcodDomain> doms = domainMap.get(pdbId);
152                        if(doms == null) {
153                                return null;
154                        }
155                        // Deep clone
156                        List<EcodDomain> clonedDoms = new ArrayList<EcodDomain>(doms.size());
157                        for(EcodDomain d : doms) {
158                                clonedDoms.add( new EcodDomain(d) );
159                        }
160                        return clonedDoms;
161                } finally {
162                        logger.trace("UNLOCK readlock");
163                        domainsFileLock.readLock().unlock();
164                }
165        }
166
167        /**
168         * Get a list of domains within a particular level of the hierarchy
169         * @param hierarchy A dot-separated list giving the X-group, H-group, and/or
170         *  T-group (e.g. "1.1" for all members of the RIFT-related H-group)
171         * @return
172         * @throws IOException
173         */
174        @Override
175        public List<EcodDomain> filterByHierarchy(String hierarchy) throws IOException {
176                String[] xhtGroup = hierarchy.split("\\.");
177                Integer xGroup = xhtGroup.length>0 ? Integer.parseInt(xhtGroup[0]) : null;
178                Integer hGroup = xhtGroup.length>1 ? Integer.parseInt(xhtGroup[1]) : null;
179                Integer tGroup = xhtGroup.length>2 ? Integer.parseInt(xhtGroup[2]) : null;
180
181                List<EcodDomain> filtered = new ArrayList<EcodDomain>();
182                for(EcodDomain d: getAllDomains()) {
183                        boolean match = true;
184                        if(xhtGroup.length>0) {
185                                match = match && xGroup.equals(d.getXGroup());
186                        }
187                        if(xhtGroup.length>1) {
188                                match = match && hGroup.equals(d.getHGroup());
189                        }
190                        if(xhtGroup.length>2) {
191                                match = match && tGroup.equals(d.getTGroup());
192                        }
193                        if(xhtGroup.length>3) {
194                                logger.warn("Ignoring unexpected additional parts of ECOD {}",hierarchy);
195                        }
196                        if(match) {
197                                filtered.add(d);
198                        }
199                }
200                return filtered;
201        }
202
203        /**
204         * Get a particular ECOD domain by the domain ID (e.g. "e4hhbA1")
205         * @param ecodId
206         * @return
207         * @throws IOException
208         */
209        @Override
210        public EcodDomain getDomainsById(String ecodId) throws IOException {
211                if(ecodId == null || ecodId.isEmpty()) {
212                        return null;
213                }
214
215                Matcher match = ECOD_RE.matcher(ecodId);
216                String pdbId = null;
217                if( match.matches() )
218                        pdbId = match.group(1);
219                List<EcodDomain> doms = getDomainsForPdb(pdbId);
220                if(doms == null) {
221                        logger.debug("Null domains for {} from {}",pdbId,ecodId);
222                        return null;
223                }
224                logger.debug("Got {} domains from {}",doms.size(),pdbId);
225                for(EcodDomain d: doms) {
226                        if(ecodId.equals(d.getDomainId())) {
227                                return d;
228                        }
229                }
230                return null;
231        }
232
233        /**
234         * Get all ECOD domains
235         * @return
236         * @throws IOException
237         */
238        @Override
239        public List<EcodDomain> getAllDomains() throws IOException {
240                domainsFileLock.readLock().lock();
241                logger.trace("LOCK readlock");
242                try {
243                        while( allDomains == null) {
244                                // unlock to allow ensureDomainsFileInstalled to get the write lock
245                                logger.trace("UNLOCK readlock");
246                                domainsFileLock.readLock().unlock();
247                                ensureDomainsFileInstalled();
248                                domainsFileLock.readLock().lock();
249                                logger.trace("LOCK readlock");
250                        }
251                        return allDomains;
252                } finally {
253                        logger.trace("UNLOCK readlock");
254                        domainsFileLock.readLock().unlock();
255                }
256
257        }
258
259        /**
260         * Clears all domains, requiring the file to be reparsed for subsequent accesses
261         */
262        public void clear() {
263                domainsFileLock.writeLock().lock();
264                logger.trace("LOCK writelock");
265                allDomains = null;
266                domainMap = null;
267                logger.trace("UNLOCK writelock");
268                domainsFileLock.writeLock().unlock();
269        }
270        /**
271         * Return the ECOD version, as parsed from the file.
272         *
273         * Note that this may differ from the version requested in the constructor
274         * for the special case of "latest"
275         * @return the ECOD version
276         * @throws IOException If an error occurs while downloading or parsing the file
277         */
278        @Override
279        public String getVersion() throws IOException {
280                ensureDomainsFileInstalled();
281
282                if( parsedVersion == null) {
283                        return requestedVersion;
284                }
285                return parsedVersion;
286        }
287
288        /**
289         * Get the top-level ECOD server URL. Defaults to "http://prodata.swmed.edu"
290         * @return the url to the ecod server
291         */
292        public String getUrl() {
293                return url;
294        }
295
        /**
         * Specify a different mirror for the ECOD server. The value is used as the
         * prefix of the download URL, so it only affects downloads started after
         * this call.
         * @param url the top-level server URL to use, e.g. {@link #ECOD_URL}
         */
        public void setUrl(String url) {
                this.url = url;
        }
303
304        /**
305         * Get the location of the cache directory (usually set to the PDB_CACHE_DIR
306         * property). ECOD files will be downloaded to this directory
307         * @return
308         */
309        public String getCacheLocation() {
310                return cacheLocation;
311        }
312        /**
313         * Set an alternate download location for files
314         * @param cacheLocation
315         */
316        public void setCacheLocation(String cacheLocation) {
317                if(cacheLocation.equals(this.cacheLocation)) {
318                        return; //no change
319                }
320                // update location
321                domainsFileLock.writeLock().lock();
322                logger.trace("LOCK writelock");
323                this.cacheLocation = cacheLocation;
324                logger.trace("UNLOCK writelock");
325                domainsFileLock.writeLock().unlock();
326        }
327
328        /**
329         * Blocks until ECOD domains file has been downloaded and parsed.
330         *
331         * This may be useful in multithreaded environments.
332         * @throws IOException
333         */
334        // Populates allDomains
335        public void ensureDomainsFileInstalled() throws IOException{
336                // Quick check for availability
337                domainsFileLock.readLock().lock();
338                logger.trace("LOCK readlock");
339                try {
340                        if( allDomains != null ) {
341                                return;
342                        }
343                } finally {
344                        logger.trace("UNLOCK readlock");
345                        domainsFileLock.readLock().unlock();
346                }
347
348                // Download domains
349                domainsFileLock.writeLock().lock();
350                logger.trace("LOCK writelock");
351                try {
352                        if( !domainsAvailable() ) {
353                                downloadDomains();
354                        }
355                        parseDomains();
356                } finally {
357                        logger.trace("UNLOCK writelock");
358                        domainsFileLock.writeLock().unlock();
359                }
360        }
361
362        /**
363         * Checks that the domains file has been downloaded
364         * @return
365         */
366        private boolean domainsAvailable() {
367                domainsFileLock.readLock().lock();
368                logger.trace("LOCK readlock");
369                try {
370                        File f = getDomainFile();
371
372                        if (!f.exists() || f.length() <= 0 )
373                                return false;
374
375                        // Re-download old copies of "latest"
376                        if(updateFrequency != null && requestedVersion == DEFAULT_VERSION ) {
377                                long mod = f.lastModified();
378                                // Time of last update
379                                Date lastUpdate = new Date();
380                                Calendar cal = Calendar.getInstance();
381                                cal.setTime(lastUpdate);
382                                cal.add(Calendar.DAY_OF_WEEK, -updateFrequency);
383                                long updateTime = cal.getTimeInMillis();
384                                // Check if file predates last update
385                                if( mod < updateTime ) {
386                                        logger.info("{} is out of date.",f);
387                                        return false;
388                                }
389                        }
390                        return true;
391                } finally {
392                        logger.trace("UNLOCK readlock");
393                        domainsFileLock.readLock().unlock();
394                }
395        }
396
397        /**
398         * Downloads the domains file, overwriting any existing file
399         * @throws IOException
400         */
401        private void downloadDomains() throws IOException {
402                domainsFileLock.writeLock().lock();
403                logger.trace("LOCK writelock");
404                try {
405                        URL domainsURL = new URL( url + DOMAINS_PATH + getDomainFilename());
406                        File localFile = getDomainFile();
407
408                        logger.info("Downloading {} to: {}",domainsURL, localFile);
409                        FileDownloadUtils.downloadFile(domainsURL, localFile);
410                } catch (MalformedURLException e) {
411                        logger.error("Malformed url: "+ url + DOMAINS_PATH + getDomainFilename(),e);
412                } finally {
413                        logger.trace("UNLOCK writelock");
414                        domainsFileLock.writeLock().unlock();
415                }
416        }
417
418        /**
419         * Basename for the domains file with the current requestedVersion.
420         * @return
421         */
422        private String getDomainFilename() {
423                return  String.format(DOMAINS_FILENAME_FORMAT,requestedVersion);
424        }
425
426        /**
427         * Local location for the domain file
428         * @return
429         */
430        private File getDomainFile() {
431                return new File(getCacheLocation(),getDomainFilename());
432        }
433
434        /**
435         * The expected ECOD update frequency determines whether the version
436         * "latest" should be re-downloaded
437         * @return the expected ECOD update frequency, in days
438         */
439        public Integer getUpdateFrequency() {
440                return updateFrequency;
441        }
442
        /**
         * The "latest" version will be re-downloaded if it is older than
         * {@link #getUpdateFrequency()} days. Setting this to null disables
         * re-downloading (delete $PDB_CACHE_DIR/ecod.latest.domains.txt manually
         * to force updating). Setting to 0 will force downloading for every
         * program execution.
         * @param updateFrequency the update frequency to set, in days, or null
         *  to disable re-downloading
         */
        public void setUpdateFrequency(Integer updateFrequency) {
                this.updateFrequency = updateFrequency;
        }
454
455        /**
456         * Parses the domains from the local file
457         * @throws IOException
458         */
459        private void parseDomains() throws IOException {
460                domainsFileLock.writeLock().lock();
461                logger.trace("LOCK writelock");
462                try {
463                        EcodParser parser = new EcodParser(getDomainFile());
464                        allDomains = parser.getDomains();
465                        parsedVersion = parser.getVersion();
466                } finally {
467                        logger.trace("UNLOCK writelock");
468                        domainsFileLock.writeLock().unlock();
469                }
470        }
471
472        /**
473         * Populates domainMap from allDomains
474         * @throws IOException
475         */
476        private void indexDomains() throws IOException {
477                domainsFileLock.writeLock().lock();
478                logger.trace("LOCK writelock");
479                try {
480                        if( allDomains == null) {
481                                ensureDomainsFileInstalled();
482                        }
483
484                        // Leave enough space for all PDBs as of 2015
485                        domainMap = new HashMap<PdbId, List<EcodDomain>>((int) (150000/.85),.85f);
486
487                        // Index with domainMap
488                        for(EcodDomain d : allDomains) {
489                                // Get the PDB ID, either directly or from the domain ID
490                                PdbId pdbId = d.getPdbId();
491                                if( pdbId == null ) {
492                                        String ecodId = d.getDomainId();
493                                        if( ecodId != null && !ecodId.isEmpty() ) {
494                                                Matcher match = ECOD_RE.matcher(ecodId);
495                                                pdbId = new PdbId(match.group(1));
496                                        }
497                                }
498
499                                // Add current domain to the map
500                                List<EcodDomain> currDomains;
501                                if( domainMap.containsKey(pdbId) ) {
502                                        currDomains = domainMap.get(pdbId);
503                                } else {
504                                        currDomains = new LinkedList<EcodDomain>();
505                                        domainMap.put(pdbId,currDomains);
506                                }
507                                currDomains.add(d);
508                        }
509                } finally {
510                        logger.trace("UNLOCK writelock");
511                        domainsFileLock.writeLock().unlock();
512                }
513
514        }
515
516
517        public static class EcodParser {
518                /*
519Version Notes
520
521Current version (1.4) contains the following columns:
522
523Column 1: ECOD uid - internal domain unique identifier
524Column 2: ECOD domain id - domain identifier
525Column 3: ECOD representative status - manual (curated) or automated nonrep
Column 4: ECOD hierarchy identifier - [X-group].[H-group].[T-group].[F-group]
527        * In develop45-66 these also include single numbers in the range 1-265
528Column 5: PDB identifier
529Column 6: Chain identifier (note: case-sensitive)
530Column 7: PDB residue number range
531        * These are sometimes incorrect up to at least develop124. Examples are:
532          e4lxaA2 (should be A:184-385), e4lxmC3 (should be C:46P-183)
533Column 8: seq_id number range (based on internal PDB indices)
534Column 9: Architecture name
535Column 10: X-group name
536Column 11: H-group name
537Column 12: T-group name
538Column 13: F-group name (F_UNCLASSIFIED denotes that domain has not been assigned to an F-group)
539Column 14: Domain assembly status (if domain is member of assembly, partners' ecod domain ids listed)
540Column 15: Comma-separated value list of non-polymer entities within 4 A of at least one residue of domain
541
542Notes older versions:
543changelog:
544v1.0 - original version (8/04/2014)
545v1.1 - added rep/nonrep data (1/15/2015)
546v1.2 - added f-group identifiers to fasta file, domain description file. ECODf identifiers now used when available for F-group name.
547        Domain assemblies now represented by assembly uid in domain assembly status.
548v1.4 - added seqid_range and headers (develop101)
549                 */
550
                /** String for unclassified F-groups */
                public static final String F_UNCLASSIFIED = "F_UNCLASSIFIED";
                /** String for single-domain assemblies */
                public static final String NOT_DOMAIN_ASSEMBLY = "NOT_DOMAIN_ASSEMBLY";
                /** Deprecated way of indicating there is an assembly. replaced by the assembly id */
                public static final String IS_DOMAIN_ASSEMBLY = "IS_DOMAIN_ASSEMBLY";
                /** Indicates a manual representative */
                public static final String IS_REPRESENTATIVE = "MANUAL_REP";
                /** Indicates not a manual representative */
                public static final String NOT_REPRESENTATIVE = "AUTO_NONREP";

                // Domains read from the input
                private List<EcodDomain> domains;
                // Version taken from the "ECOD version" comment line; null if none was seen
                private String version;
564
                /**
                 * Parses the ECOD domains file at the given path.
                 * @param filename path of the domains file
                 * @throws IOException if the file cannot be opened or read
                 */
                public EcodParser(String filename) throws IOException {
                        this(new File(filename));
                }
                /**
                 * Parses the given ECOD domains file.
                 * @param file the domains file
                 * @throws IOException if the file cannot be opened or read
                 */
                public EcodParser(File file) throws IOException {
                        this(new FileReader(file));
                }
                /**
                 * Parses ECOD domains from the given reader, which is wrapped in a
                 * {@link BufferedReader}.
                 * @param reader source of the domains file contents
                 * @throws IOException if reading fails
                 */
                public EcodParser(Reader reader) throws IOException {
                        this(new BufferedReader(reader));
                }
                /**
                 * Parses ECOD domains from the given reader. Parsing happens right
                 * here in the constructor.
                 * NOTE(review): whether the reader is closed afterwards depends on
                 * parse(); confirm before relying on it.
                 * @param reader source of the domains file contents
                 * @throws IOException if reading fails
                 */
                public EcodParser(BufferedReader reader) throws IOException {
                        version = null;
                        parse(reader);
                }
578
579                private void parse(BufferedReader in) throws IOException {
580                        try {
581                                // Allocate plenty of space for ECOD as of 2015
582                                ArrayList<EcodDomain> domainsList = new ArrayList<EcodDomain>(500000);
583
584                                Pattern versionRE = Pattern.compile("^\\s*#.*ECOD\\s*version\\s+(\\S+).*");
585                                Pattern commentRE = Pattern.compile("^\\s*#.*");
586
587                                // prevent too many warnings; negative numbers print all warnings
588                                int warnIsDomainAssembly = 1;
589                                int warnHierarchicalFormat = 5;
590                                int warnNumberOfFields = 10;
591
592                                String line = in.readLine();
593                                int lineNum = 1;
594                                while( line != null ) {
595                                        // Check for requestedVersion string
596                                        Matcher match = versionRE.matcher(line);
597                                        if(match.matches()) {
598                                                // special requestedVersion comment
599                                                this.version = match.group(1);
600                                        } else {
601                                                match = commentRE.matcher(line);
602                                                if(match.matches()) {
603                                                        // ignore comments
604                                                } else {
605                                                        // data line
606                                                        String[] fields = line.split("\t");
607                                                        if( fields.length == 13 || fields.length == 14 || fields.length == 15) {
608                                                                try {
609                                                                        int i = 0; // field number, to allow future insertion of fields
610
611                                                                        //Column 1: ECOD uid - internal domain unique identifier
612                                                                        Long uid = Long.parseLong(fields[i++]);
613                                                                        //Column 2: ECOD domain id - domain identifier
614                                                                        String domainId = fields[i++];
615
616                                                                        //Column 3: ECOD representative status - manual (curated) or automated nonrep
617                                                                        // Manual column may be missing in version 1.0 files
618                                                                        Boolean manual = null;
619                                                                        if( fields.length >= 14) {
620                                                                                String manualString = fields[i++];
621                                                                                if(manualString.equalsIgnoreCase(IS_REPRESENTATIVE)) {
622                                                                                        manual = true;
623                                                                                } else if(manualString.equalsIgnoreCase(NOT_REPRESENTATIVE)) {
624                                                                                        manual = false;
625                                                                                } else {
626                                                                                        logger.warn("Unexpected value for manual field: {} in line {}",manualString,lineNum);
627                                                                                }
628                                                                        }
629
630                                                                        //Column 4: ECOD hierachy identifier - [X-group].[H-group].[T-group].[F-group]
631                                                                        // hierarchical field, e.g. "1.1.4.1"
632                                                                        String[] xhtGroup = fields[i++].split("\\.");
633                                                                        if(xhtGroup.length < 3 || 4 < xhtGroup.length) {
634                                                                                if(warnHierarchicalFormat > 1) {
635                                                                                        logger.warn("Unexpected format for hierarchical field \"{}\" in line {}",fields[i-1],lineNum);
636                                                                                        warnHierarchicalFormat--;
637                                                                                } else if(warnHierarchicalFormat != 0) {
638                                                                                        logger.warn("Unexpected format for hierarchical field \"{}\" in line {}. Not printing future similar warnings.",fields[i-1],lineNum);
639                                                                                        warnHierarchicalFormat--;
640                                                                                }
641                                                                        }
642                                                                        Integer xGroup = xhtGroup.length>0 ? Integer.parseInt(xhtGroup[0]) : null;
643                                                                        Integer hGroup = xhtGroup.length>1 ? Integer.parseInt(xhtGroup[1]) : null;
644                                                                        Integer tGroup = xhtGroup.length>2 ? Integer.parseInt(xhtGroup[2]) : null;
645                                                                        Integer fGroup = xhtGroup.length>3 ? Integer.parseInt(xhtGroup[3]) : null;
646
647                                                                        //Column 5: PDB identifier
648                                                                        String pdbId = fields[i++];
649                                                                        //Column 6: Chain identifier (note: case-sensitive)
650                                                                        String chainId = fields[i++];
651                                                                        //Column 7: PDB residue number range
652                                                                        String range = fields[i++];
653
654                                                                        //Column 8: seq_id number range (based on internal PDB indices)
655                                                                        //Added in version 1.4
656                                                                        String seqId = null;
657                                                                        if( fields.length >= 15) {
658                                                                                seqId = fields[i++];
659                                                                        }
660
661                                                                        //Column 9: Architecture name
662                                                                        // Intern strings likely to be shared by many domains
663                                                                        String architectureName = fields[i++].intern();
664                                                                        //Column 10: X-group name
665                                                                        String xGroupName = fields[i++].intern();
666                                                                        //Column 11: H-group name
667                                                                        String hGroupName = fields[i++].intern();
668                                                                        //Column 12: T-group name
669                                                                        String tGroupName = fields[i++].intern();
670                                                                        //Column 13: F-group name (F_UNCLASSIFIED denotes that domain has not been assigned to an F-group)
671                                                                        //Contents changed in version 1.3
672                                                                        String fGroupName = fields[i++].intern();
673
674
675                                                                        hGroupName = clearStringQuotes(hGroupName);
676                                                                        tGroupName = clearStringQuotes(tGroupName);
677                                                                        fGroupName = clearStringQuotes(fGroupName);
678                                                                        xGroupName = clearStringQuotes(xGroupName);
679
680                                                                        //Column 14: Domain assembly status (if domain is member of assembly, partners' ecod domain ids listed)
681                                                                        //Column 15: Comma-separated value list of non-polymer entities within 4 A of at least one residue of domain
682                                                                        Long assemblyId = null;
683                                                                        String assemblyStr = fields[i++];
684                                                                        if(assemblyStr.equals(NOT_DOMAIN_ASSEMBLY)) {
685                                                                                assemblyId = uid;
686                                                                        } else if(assemblyStr.equals("IS_DOMAIN_ASSEMBLY") ) {
687                                                                                if(warnIsDomainAssembly > 1) {
688                                                                                        logger.info("Deprecated 'IS_DOMAIN_ASSEMBLY' value ignored in line {}.",lineNum);
689                                                                                        warnIsDomainAssembly--;
690                                                                                } else if(warnIsDomainAssembly == 0) {
691                                                                                        logger.info("Deprecated 'IS_DOMAIN_ASSEMBLY' value ignored in line {}. Not printing future similar warnings.",lineNum);
692                                                                                        warnIsDomainAssembly--;
693                                                                                }
694                                                                                //assemblyId = null;
695                                                                        } else {
696                                                                                assemblyId = Long.parseLong(assemblyStr);
697                                                                        }
698
699                                                                        String ligandStr = fields[i++];
700                                                                        Set<String> ligands = null;
701                                                                        if( ligandStr.equals("NO_LIGANDS_4A") || ligandStr.isEmpty() ) {
702                                                                                ligands = Collections.emptySet();
703                                                                        } else {
704                                                                                String[] ligSplit = ligandStr.split(",");
705                                                                                ligands = new LinkedHashSet<String>(ligSplit.length);
706                                                                                for(String s : ligSplit) {
707                                                                                        ligands.add(s.intern());
708                                                                                }
709                                                                        }
710
711
712                                                                        EcodDomain domain = new EcodDomain(uid, domainId, manual, xGroup, hGroup, tGroup, fGroup,pdbId, chainId, range, seqId, architectureName, xGroupName, hGroupName, tGroupName, fGroupName, assemblyId, ligands);
713                                                                        domainsList.add(domain);
714                                                                } catch(NumberFormatException e) {
715                                                                        logger.warn("Error in ECOD parsing at line "+lineNum,e);
716                                                                }
717                                                        } else {
718                                                                if(warnNumberOfFields > 1) {
719                                                                        logger.warn("Unexpected number of fields in line {}.",lineNum);
720                                                                        warnNumberOfFields--;
721                                                                } else if(warnNumberOfFields == 0) {
722                                                                        logger.warn("Unexpected number of fields in line {}. Not printing future similar warnings",lineNum);
723                                                                        warnIsDomainAssembly--;
724                                                                }
725                                                        }
726                                                }
727                                        }
728
729                                        line = in.readLine();
730                                        lineNum++;
731                                }
732                                if(this.version == null)
733                                        logger.info("Parsed {} ECOD domains",domainsList.size());
734                                else
735                                        logger.info("Parsed {} ECOD domains from version {}",domainsList.size(),this.version);
736
737
738                                this.domains = Collections.unmodifiableList( domainsList );
739
740                        } finally {
741                                if(in != null) {
742                                        in.close();
743                                }
744                        }
745                }
746
747                private String clearStringQuotes(String name) {
748                        if ( name.startsWith("\""))
749                                name = name.substring(1);
750
751                        if ( name.endsWith("\""))
752                                name = name.substring(0,name.length()-1);
753
754                        return name;
755                }
756
757                /**
758                 * @return a list of all EcodDomains
759                 */
760                public List<EcodDomain> getDomains() {
761                        return domains;
762                }
763
764                /**
765                 * @return the requestedVersion for this file, or null if none was parsed
766                 */
767                public String getVersion() {
768                        return version;
769                }
770        }
771
772
773        @Override
774        public String toString() {
775                String version = null;
776                try {
777                        version = getVersion();
778                } catch (IOException e) {
779                        // For parsing errors, use the requested version
780                        version = requestedVersion;
781                }
782
783                return "EcodInstallation [cacheLocation=" + cacheLocation
784                                + ", version=" + version + "]";
785        }
786
787        public static void main(String[] args) {
788                if( args.length!= 1) {
789                        System.out.println("usage: ecod_domains.txt");
790                        System.exit(1); return;
791                }
792
793                String filename = args[0];
794
795                try {
796                        EcodParser parser = new EcodParser(filename);
797
798                        List<EcodDomain> domains = parser.getDomains();
799
800                        System.out.format("Found %d ECOD domains.%n",domains.size());
801
802                        System.out.println("First 10 domains:");
803                        int i = 0;
804                        for(EcodDomain d: domains) {
805                                if( i>10) break;
806
807                                System.out.println(d.getDomainId());
808                                i++;
809                        }
810                } catch (IOException e) {
811                        e.printStackTrace();
812                }
813        }
814}