001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure.align.client;
022
023
024import java.io.File;
025import java.io.IOException;
026import java.io.Serializable;
027import java.net.MalformedURLException;
028import java.net.URL;
029import java.util.Iterator;
030import java.util.LinkedList;
031import java.util.List;
032import java.util.Set;
033import java.util.TreeSet;
034import java.util.regex.Matcher;
035import java.util.regex.Pattern;
036
037import org.biojava.nbio.structure.BioAssemblyIdentifier;
038import org.biojava.nbio.structure.ResidueRange;
039import org.biojava.nbio.structure.Structure;
040import org.biojava.nbio.structure.StructureException;
041import org.biojava.nbio.structure.StructureIdentifier;
042import org.biojava.nbio.structure.SubstructureIdentifier;
043import org.biojava.nbio.structure.URLIdentifier;
044import org.biojava.nbio.structure.align.util.AtomCache;
045import org.biojava.nbio.structure.cath.CathDomain;
046import org.biojava.nbio.structure.cath.CathFactory;
047import org.biojava.nbio.structure.domain.PDPDomain;
048import org.biojava.nbio.structure.domain.PDPProvider;
049import org.biojava.nbio.structure.domain.RemotePDPProvider;
050import org.biojava.nbio.structure.ecod.EcodFactory;
051import org.biojava.nbio.structure.io.util.FileDownloadUtils;
052import org.biojava.nbio.structure.scop.ScopDatabase;
053import org.biojava.nbio.structure.scop.ScopDomain;
054import org.biojava.nbio.structure.scop.ScopFactory;
055import org.slf4j.Logger;
056import org.slf4j.LoggerFactory;
057
058
059/**
060 * A utility class that makes working with names of structures, domains and ranges easier.
061 *
062 * Accepts a wide range of identifier formats, including {@link ScopDomain},
063 * {@link CathDomain}, PDP domains, and {@link SubstructureIdentifier} residue
064 * ranges.
065 *
066 * Where possible, data is extracted from the input string. Otherwise, range
067 * information may be loaded from one of the factory classes:
068 * {@link CathFactory},{@link ScopFactory}, etc.
069 *
070 * @see #getName the name. e.g. 4hhb, 4hhb.A, d4hhba_, PDP:4HHBAa etc.
071 */
072
073public class StructureName implements Comparable<StructureName>, Serializable, StructureIdentifier {
074        private static final long serialVersionUID = 4021229518711762957L;
075        private static final Logger logger = LoggerFactory.getLogger(StructureName.class);
076
077        protected String name;
078        protected String pdbId;
079        protected String chainId;
080
081        private static final Pattern cathPattern = Pattern.compile("^(?:CATH:)?([0-9][a-z0-9]{3})(\\w)([0-9]{2})$",Pattern.CASE_INSENSITIVE);
082        // ds046__ is a special case with no PDB entry
083        private static final Pattern scopPattern = Pattern.compile("^(?:SCOP:)?d([0-9][a-z0-9]{3}|s046)(\\w|\\.)(\\w)$",Pattern.CASE_INSENSITIVE);
084        // ECOD chains and domains can't be automatically distinguished. Ex: e3j9zS13 is chain 'S1', e1wz2B14 is chain 'B'
085        private static final Pattern ecodPattern = Pattern.compile("^(?:ECOD:)?e([0-9][a-z0-9]{3})(?:\\w|\\.)\\w+$",Pattern.CASE_INSENSITIVE);
086
087        // Names are automatically used as prefixes
088        public enum Source {
089                PDB,
090                SCOP,
091                PDP,
092                CATH,
093                URL,
094                FILE,
095                ECOD,
096                BIO,
097        };
098
099        private Source mySource = null;
100
101        // cache for getBaseIdentifier() method
102        private StructureIdentifier base = null;
103
104        /**
105         * Create a new StructureName from the given identifier, which may be a
106         * domain name, a substructure identifier, etc.
107         * <p>
108         * The source and PDB-Id are extracted at compile time, but fully
109         * interpreting the ID, which may require additional parsing or remote
110         * calls, is done lazily.
111         * <p>
112         * The following sources are supported. Any may be prefixed by the source
113         * name followed by a colon (e.g. PDB:4HHB). In this case, that source will be used
114         * unequivocally. If no source is specified, StructureName will make a
115         * (usually reliable) guess as to which source was intended.
116         * <ul>
117         * <li><b>PDB</b>PDB identifier, optionally followed by chain and/or residue
118         *     ranges. Internally represented by a {@link SubstructureIdentifier};
119         *     see that class for the full format specification.
120         *     Examples: 4hhb, 4hhb.A, 4hhb.A:1-50.
121         * <li><b>SCOP</b> SCOP domain (or SCOPe, depending on the
122         *     {@link ScopFactory#getSCOP()} version). Example: d1h6w.2
123         * <li><b>PDP</b> Protein Domain Parser domain. PDP domains are not guessed,
124         *     making the PDP: prefix obligatory. Example: PDP:4HHBAa
125         * <li><b>CATH</b> Cath domains. Example: 1qvrC03
126         * <li><b>URL</b> Arbitrary URLs. Most common protocols are handled,
127         *     including http://, ftp://, and file://. Some parsing information can
128         *     be passed as custom query parameters. Example:
129         *     http://www.rcsb.org/pdb/files/1B8G.pdb.gz
130         * <li><b>FILE</b> A file path. Supports relative paths and expands ~ to
131         *     the user's home directory. Only existing files will be automatically
132         *     detected; to refer to a potentially not-yet existing file, prepend
133         *     the prefix. Internally represented as a {@link URLIdentifier}
134         *     after path expansion. Example: ~/custom_protein.pdb
135         * <li><b>ECOD</b> ECOD domain. Example: e1lyw.1
136         * <li><b>BIO</b> Biological assembly. These are not guessed, making
137         *     the BIO: prefix obligatory. Example: BIO:2ehz:1
138         * </ul>
139         * @param name An identifier string
140         * @throws IllegalArgumentException if the name has a recognizable source but is semantically invalid
141         */
142        public StructureName(String name){
143                this.name = name;
144
145                init();//sets pdbId and mySource
146        }
147
148
149        /**
150         * Tries to determine the source and pdbId without fully realizing the identifier,
151         * which could require I/O depending on the source
152         * @throws IllegalArgumentException if the source is recognizable but invalid
153         */
154        private void init(){
155
156                // First try identifying a prefix
157                String[] prefix = name.split(":", 2);
158                mySource = null;
159                if(prefix.length > 1) {
160                        // Match Source prefixes
161                        String suffix = prefix[1];
162                        try {
163                                mySource = Source.valueOf(prefix[0].toUpperCase());
164                        } catch( IllegalArgumentException e ) {
165                                // unrecognized prefix; fall back on guessing
166                                mySource = null;
167                        }
168                        if(mySource != null) {
169                                switch( mySource) {
170                                case SCOP:
171                                        if( ! initFromScop(suffix) )
172                                                throw new IllegalArgumentException("Malformed SCOP domain name:"+suffix);
173                                        return;
174                                case PDP:
175                                        if( ! initFromPDP(name) )
176                                                throw new IllegalArgumentException("Malformed PDP domain name:"+suffix);
177                                        return;
178                                case CATH:
179                                        if( ! initFromCATH(suffix) )
180                                                throw new IllegalArgumentException("Malformed CATH domain name:"+suffix);
181                                        return;
182                                case BIO:
183                                        if( ! initFromBIO(name) )
184                                                throw new IllegalArgumentException("Malformed BIO name:"+suffix);
185                                        return;
186                                case ECOD:
187                                        if( ! initFromECOD(suffix) )
188                                                throw new IllegalArgumentException("Malformed ECOD domain name:"+suffix);
189                                        return;
190                                case PDB:
191                                        if( ! initFromPDB(suffix) )
192                                                throw new IllegalArgumentException("Malformed PDB specification:"+suffix);
193                                        return;
194                                case FILE:
195                                        // Treat file:/ prefixes as URLs
196                                        if( ! suffix.startsWith("/")) {
197                                                // Otherwise, treat as file
198                                                initFromFile();
199                                                return;
200                                        }
201                                        // fall through to URL case
202                                case URL:
203                                        if( ! initFromURL(name))
204                                                throw new IllegalArgumentException("Malformed URL specification:"+suffix);
205                                        return;
206                                default:
207                                        throw new IllegalStateException("Unimplemented Source "+mySource);
208                                }
209                        }
210                }
211
212                // No known prefix, so revert to guessing
213
214                // First guess regex-based identifiers
215                // SCOP domain
216                if( initFromScop(name) )
217                        return;
218                // CATH
219                if( initFromCATH(name) )
220                        return;
221                // ECOD
222                if( initFromECOD(name) )
223                        return;
224                // Never guess BIO or PDP
225
226                // URL
227                if( initFromURL(name) )
228                        return;
229
230                // Guess FILE based on file existence
231                File file = new File(FileDownloadUtils.expandUserHome(name));
232                if( file.canRead() && !file.isDirectory() ) {
233                        // an attempt to mitigate issue #398. It doesn't fix it but it catches the most common case of passing a pdb id and finding a file in working dir matching it
234                        if (name.matches("\\d\\w\\w\\w")) {
235                                // the plain pdb id case, this is unlikely to be what the user wants: let's let it through but warn about it
236                                logger.warn("Provided 4-letter structure name '{}' matches "
237                                                + "file name in directory {}. Will read structure "
238                                                + "data from file {} and not consider the name as a "
239                                                + "structure identifier. If this is not what you "
240                                                + "want, use 'FILE:{}'",
241                                                name, file.getAbsoluteFile().getParent(),
242                                                file.getAbsolutePath(), name);
243                        } else {
244                                logger.info("Provided structure name '{}' matches "
245                                                + "file name in directory {}. Will read structure "
246                                                + "data from file {}.",
247                                                name, file.getAbsoluteFile().getParent(),
248                                                file.getAbsolutePath());
249                        }
250
251                        initFromFile();
252                        return;
253                }
254
255                // Default to PDB
256                initFromPDB( name );
257        }
258
259        private boolean initFromScop(String name) {
260                Matcher matcher = scopPattern.matcher(name);
261                if ( matcher.matches() ) {
262                        mySource = Source.SCOP;
263                        pdbId = matcher.group(1).toUpperCase();
264                        chainId = matcher.group(2);
265                        return true;
266                }
267                return false;
268        }
269        private boolean initFromPDP(String name) {
270                Matcher matcher = PDPDomain.PDP_NAME_PATTERN.matcher(name);
271                if( matcher.matches() ) {
272                        pdbId = matcher.group(1).toUpperCase();
273                        chainId = matcher.group(2);
274                        return true;
275                }
276                return false;
277        }
278        private boolean initFromCATH(String name) {
279                Matcher matcher = cathPattern.matcher(name);
280                if ( matcher.matches() ){
281                        mySource = Source.CATH;
282                        pdbId = matcher.group(1).toUpperCase();
283                        chainId = matcher.group(2);
284                        return true;
285                }
286                return false;
287        }
288        private boolean initFromECOD(String name) {
289                Matcher matcher = ecodPattern.matcher(name);
290                if ( matcher.matches() ){
291                        mySource = Source.ECOD;
292                        pdbId = matcher.group(1).toUpperCase();
293                        chainId = null;
294                        return true;
295                }
296                return false;
297        }
298        private boolean initFromBIO(String name) {
299                Matcher matcher = BioAssemblyIdentifier.BIO_NAME_PATTERN.matcher(name);
300                if( matcher.matches() ) {
301                        pdbId = matcher.group(1).toUpperCase();
302                        return true;
303                }
304                return false;
305        }
306        private boolean initFromPDB(String suffix) {
307                mySource = Source.PDB;
308                SubstructureIdentifier si = new SubstructureIdentifier(suffix);
309                base = si; // Safe to realize immediately
310
311                pdbId = si.getPdbId();
312                // Set chainId if unique
313                Set<String> chains = getChainIds(si);
314                if(chains.size() == 1) {
315                        this.chainId = chains.iterator().next();
316                } else if(chains.size() > 1) {
317                        this.chainId = ".";
318                } else {
319                        this.chainId = null;
320                }
321                return true;
322        }
323        private boolean initFromURL(String suffix) {
324                try {
325                        URL url = new URL(suffix);
326                        String path = url.getPath();
327                        mySource = Source.URL;
328                        pdbId = URLIdentifier.guessPDBID( path.substring(path.lastIndexOf('/')+1) );
329                        chainId = null; // Don't bother checking query params here
330                        return true;
331                } catch(MalformedURLException e) {
332                        return false;
333                }
334        }
335        private boolean initFromFile() {
336                mySource = Source.FILE;
337                pdbId = null;
338                chainId = null;
339                return true;
340        }
341
342        private static Set<String> getChainIds(SubstructureIdentifier si) {
343                Set<String> chains = new TreeSet<String>();
344                List<ResidueRange> ranges = si.getResidueRanges();
345                for(ResidueRange range : ranges) {
346                        String chain = range.getChainId();
347                        if(chain != null) {
348                                chains.add(chain);
349                        }
350                }
351                return chains;
352        }
353
354        /**
355         * Get the PDB ID for this name, if any.
356         *
357         * Equivalent to {@link SubstructureIdentifier#getPdbId()
358         * toCanonical().getPdbId()}
359         * @return The upper-case PDB Name, or null if not applicable
360         * @throws StructureException Wraps errors which occur when converting to canonical form
361         */
362        public String getPdbId() throws StructureException {
363                if( pdbId == null) {
364                        pdbId = toCanonical().getPdbId();
365                }
366                return pdbId;
367        }
368
369        /**
370         * Gets the chain ID, for structures where it is unique and well-defined.
371         * May return '.' for multi-chain ranges, '_' for wildcard chains, or
372         * null if the information is unavailable.
373         *
374         * <p>This method should only be used casually. For precise chainIds, it
375         * is better to use {@link #toCanonical()} and iterate through the
376         * residue ranges.
377         * @return
378         */
379        public String getChainId() {
380                return chainId;
381        }
382        /**
383         *
384         * @return the identifier string
385         * @deprecated use {@link #getIdentifier()}
386         */
387        @Deprecated
388        public String getName(){
389
390                return getIdentifier();
391        }
392
393        /**
394         * Get the original form of the identifier
395         */
396        @Override
397        public String getIdentifier() {
398                return name;
399        }
400
401        @Override
402        public String toString(){
403
404                return name;
405        }
406
407
408        public boolean isScopName() {
409                return mySource == Source.SCOP;
410        }
411
412        public boolean isPDPDomain(){
413                return mySource == Source.PDP;
414        }
415
416        public boolean isCathID(){
417                return mySource == Source.CATH;
418        }
419
420        public boolean isPdbId(){
421                return mySource == Source.PDB;
422        }
423
424        public boolean isURL() {
425                return mySource == Source.URL;
426        }
427
428        /**
429         * Indicates that the identifier was determined to correspond to a file.
430         * Note that some file identifiers may also be valid URLs; in that case,
431         * the URL source is preferred.
432         * @return
433         */
434        public boolean isFile() {
435                return mySource == Source.FILE;
436        }
437
438        public boolean isEcodDomain() {
439                return mySource == Source.ECOD;
440        }
441
442        public boolean isBioAssembly() {
443                return mySource == Source.BIO;
444        }
445
446        public Source getSource() {
447                return mySource;
448        }
449
450        /**
451         * StructureName wraps another StructureIdentifier. The type of the base
452         * identifier depends on the {@link #getSource() source}. Most StructureName
453         * methods deligate to the base identifier.
454         *
455         * <p>It is possible that future versions of StructureName might change the
456         * return type. Except for some specialized uses, it is probably better
457         * to create the correct type of identifier directly, rather than creating
458         * a StructureName and casting the result of this method.
459         * @return A Str
460         * @throws StructureException Wraps exceptions that may be thrown by
461         *  individual implementations. For example, a SCOP identifier may require
462         *  that the domain definitions be available for download.
463         */
464        public StructureIdentifier getBaseIdentifier() throws StructureException {
465                if( base == null ) {
466
467                        switch(mySource) {
468                        case CATH:
469                                base = CathFactory.getCathDatabase().getDescriptionByCathId(getIdentifier());
470                                break;
471                        case ECOD:
472                                try {
473                                        base = EcodFactory.getEcodDatabase().getDomainsById(name);
474                                } catch (IOException e) {
475                                        throw new StructureException("Unable to get ECOD domain "+name,e);
476                                }
477                                break;
478                        case SCOP:
479                                // Fuzzy matching of the domain name to the current default factory
480                                base = guessScopDomain(getIdentifier(),ScopFactory.getSCOP());
481                                if(base == null) {
482                                        // Guessing didn't work, so just use the PDBID and Chain from name
483                                        // Guess that '_' means 'whole structure'
484                                        if (chainId.equals("_")) {
485                                                base = new SubstructureIdentifier(pdbId);
486                                        } else {
487                                                base = new SubstructureIdentifier(pdbId,ResidueRange.parseMultiple(chainId));
488                                        }
489                                        logger.error("Unable to find {}, so using {}",name,base);
490                                }
491                                break;
492                        case FILE:
493                                try {
494                                        String[] prefix = name.split(":", 2);
495                                        String filename;
496                                        if(prefix.length > 1) {
497                                                filename = prefix[1];
498                                        } else {
499                                                filename = name;
500                                        }
501                                        filename = FileDownloadUtils.expandUserHome(filename);
502                                        base = new URLIdentifier(new File(filename).toURI().toURL());
503                                } catch (MalformedURLException e) {
504                                        // Should never happen
505                                        throw new StructureException("Unable to get URL for file: "+name,e);
506                                }
507                                break;
508                        case URL:
509                                try {
510                                        base = new URLIdentifier(name);
511                                } catch (MalformedURLException e) {
512                                        throw new StructureException("Invalid URL: "+name,e);
513                                }
514                                break;
515                        case PDP:
516                                try {
517                                        PDPProvider provider = new RemotePDPProvider(false);
518                                        base = provider.getPDPDomain(name);
519                                } catch (IOException e) {
520                                        throw new StructureException("Unable to fetch PDP domain "+name, e);
521                                }
522                                break;
523                        case BIO:
524                                base = new BioAssemblyIdentifier(name);
525                                break;
526                        case PDB:
527                                base = new SubstructureIdentifier(getIdentifier());
528                                break;
529                        default:
530                                throw new IllegalStateException("Unimplemented source: "+mySource);
531                        }
532                }
533                return base;
534        }
535
536        @Override
537        public SubstructureIdentifier toCanonical() throws StructureException {
538                return getBaseIdentifier().toCanonical();
539        }
540
541        @Override
542        public Structure reduce(Structure input) throws StructureException {
543                return getBaseIdentifier().reduce(input);
544        }
545
546        @Override
547        public Structure loadStructure(AtomCache cache) throws StructureException,
548        IOException {
549                return getBaseIdentifier().loadStructure(cache);
550        }
551
552        @Override
553        public int hashCode() {
554                final int prime = 31;
555                int result = 1;
556                result = prime * result + ((name == null) ? 0 : name.hashCode());
557                return result;
558        }
559
560        @Override
561        public boolean equals(Object obj) {
562                if (this == obj)
563                        return true;
564                if (obj == null)
565                        return false;
566                if (getClass() != obj.getClass())
567                        return false;
568                StructureName other = (StructureName) obj;
569                if (name == null) {
570                        if (other.name != null)
571                                return false;
572                } else if (!name.equals(other.name))
573                        return false;
574                return true;
575        }
576
577        /**
578         * Orders identifiers lexicographically by PDB ID and then full Identifier
579         */
580        @Override
581        public int compareTo(StructureName o) {
582                if ( this.equals(o))
583                        return 0;
584
585                String pdb1 = null;
586                String pdb2 = null;
587                try {
588                        pdb1 = this.getPdbId();
589                } catch (StructureException e) {}
590                try {
591                        pdb2 = this.getPdbId();
592                } catch (StructureException e) {}
593
594                int comp = 0;
595
596                // Sort those with PDBIDs before those without
597                if( pdb1 == null ) {
598                        if( pdb2 != null) {
599                                return 1; // this > o
600                        }
601                        // both null
602                } else if( pdb2 == null){
603                        return -1; // this < o
604                } else {
605                        // neither null
606                        comp = pdb1.compareTo(pdb2);
607                }
608                if( comp != 0 ) {
609                        return comp;
610                }
611
612                // break tie with full identifiers
613                pdb1 = this.getIdentifier();
614                pdb2 = o.getIdentifier();
615
616                // Throws NPE for nulls
617                return pdb1.compareTo(pdb2);
618        }
619
620        /**
621         * <p>
622         * Guess a scop domain. If an exact match is found, return that.
623         *
624         * <p>
625         * Otherwise, return the first scop domain found for the specified protein such that
626         * <ul>
627         * <li>The chains match, or one of the chains is '_' or '.'.
628         * <li>The domains match, or one of the domains is '_'.
629         * </ul>
630         *
631         * In some cases there may be several valid matches. In this case a warning
632         * will be logged.
633         *
634         * @param name SCOP domain name, or a guess thereof
635         * @param scopDB SCOP domain provider
636         * @return The best match for name among the domains of scopDB, or null if none match.
637         */
638        public static ScopDomain guessScopDomain(String name, ScopDatabase scopDB) {
639                List<ScopDomain> matches = new LinkedList<ScopDomain>();
640
641                // Try exact match first
642                ScopDomain domain = scopDB.getDomainByScopID(name);
643                if (domain != null) {
644                        return domain;
645                }
646
647                // Didn't work. Guess it!
648                logger.warn("Warning, could not find SCOP domain: " + name);
649
650                Matcher scopMatch = scopPattern.matcher(name);
651                if (scopMatch.matches()) {
652                        String pdbID = scopMatch.group(1);
653                        String chainID = scopMatch.group(2);
654                        String domainID = scopMatch.group(3);
655
656                        for (ScopDomain potentialSCOP : scopDB.getDomainsForPDB(pdbID)) {
657                                Matcher potMatch = scopPattern.matcher(potentialSCOP.getScopId());
658                                if (potMatch.matches()) {
659                                        if (chainID.equals(potMatch.group(2)) || chainID.equals("_") || chainID.equals(".")
660                                                        || potMatch.group(2).equals("_") || potMatch.group(2).equals(".")) {
661                                                if (domainID.equals(potMatch.group(3)) || domainID.equals("_") || potMatch.group(3).equals("_")) {
662                                                        // Match, or near match
663                                                        matches.add(potentialSCOP);
664                                                }
665                                        }
666                                }
667                        }
668                }
669
670                Iterator<ScopDomain> match = matches.iterator();
671                if (match.hasNext()) {
672                        ScopDomain bestMatch = match.next();
673                        if(logger.isWarnEnabled()) {
674                                StringBuilder warnMsg = new StringBuilder();
675                                warnMsg.append("Trying domain " + bestMatch.getScopId() + ".");
676                                if (match.hasNext()) {
677                                        warnMsg.append(" Other possibilities: ");
678                                        while (match.hasNext()) {
679                                                warnMsg.append(match.next().getScopId() + " ");
680                                        }
681                                }
682                                warnMsg.append(System.getProperty("line.separator"));
683                                logger.warn(warnMsg.toString());
684                        }
685                        return bestMatch;
686                } else {
687                        return null;
688                }
689        }
690
691
692
693}