001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure.align.client;
022
023
024import java.io.File;
025import java.io.IOException;
026import java.io.Serializable;
027import java.net.MalformedURLException;
028import java.net.URL;
029import java.util.Iterator;
030import java.util.LinkedList;
031import java.util.List;
032import java.util.Set;
033import java.util.TreeSet;
034import java.util.regex.Matcher;
035import java.util.regex.Pattern;
036
037import org.biojava.nbio.structure.BioAssemblyIdentifier;
038import org.biojava.nbio.structure.ResidueRange;
039import org.biojava.nbio.structure.Structure;
040import org.biojava.nbio.structure.StructureException;
041import org.biojava.nbio.structure.StructureIdentifier;
042import org.biojava.nbio.structure.SubstructureIdentifier;
043import org.biojava.nbio.structure.URLIdentifier;
044import org.biojava.nbio.structure.align.util.AtomCache;
045import org.biojava.nbio.structure.cath.CathDomain;
046import org.biojava.nbio.structure.cath.CathFactory;
047import org.biojava.nbio.structure.domain.PDPDomain;
048import org.biojava.nbio.structure.domain.PDPProvider;
049import org.biojava.nbio.structure.domain.RemotePDPProvider;
050import org.biojava.nbio.structure.ecod.EcodFactory;
051import org.biojava.nbio.core.util.FileDownloadUtils;
052import org.biojava.nbio.structure.scop.ScopDatabase;
053import org.biojava.nbio.structure.scop.ScopDomain;
054import org.biojava.nbio.structure.scop.ScopFactory;
055import org.slf4j.Logger;
056import org.slf4j.LoggerFactory;
057
058
059/**
060 * A utility class that makes working with names of structures, domains and ranges easier.
061 *
062 * Accepts a wide range of identifier formats, including {@link ScopDomain},
063 * {@link CathDomain}, PDP domains, and {@link SubstructureIdentifier} residue
064 * ranges.
065 *
066 * Where possible, data is extracted from the input string. Otherwise, range
067 * information may be loaded from one of the factory classes:
068 * {@link CathFactory},{@link ScopFactory}, etc.
069 *
070 * @see #getName the name. e.g. 4hhb, 4hhb.A, d4hhba_, PDP:4HHBAa etc.
071 */
072
073public class StructureName implements Comparable<StructureName>, Serializable, StructureIdentifier {
074        private static final long serialVersionUID = 4021229518711762957L;
075        private static final Logger logger = LoggerFactory.getLogger(StructureName.class);
076
077        protected String name;
078        protected String pdbId;
079        protected String chainName;
080
081        private static final Pattern cathPattern = Pattern.compile("^(?:CATH:)?([0-9][a-z0-9]{3})(\\w)([0-9]{2})$",Pattern.CASE_INSENSITIVE);
082        // ds046__ is a special case with no PDB entry
083        private static final Pattern scopPattern = Pattern.compile("^(?:SCOP:)?d([0-9][a-z0-9]{3}|s046)(\\w|\\.)(\\w)$",Pattern.CASE_INSENSITIVE);
084        // ECOD chains and domains can't be automatically distinguished. Ex: e3j9zS13 is chain 'S1', e1wz2B14 is chain 'B'
085        private static final Pattern ecodPattern = Pattern.compile("^(?:ECOD:)?e([0-9][a-z0-9]{3})(?:\\w|\\.)\\w+$",Pattern.CASE_INSENSITIVE);
086
087        // Names are automatically used as prefixes
088        public enum Source {
089                PDB,
090                SCOP,
091                PDP,
092                CATH,
093                URL,
094                FILE,
095                ECOD,
096                BIO,
097        };
098
099        private Source mySource = null;
100
101        // cache for getBaseIdentifier() method
102        private StructureIdentifier base = null;
103
104        /**
105         * Create a new StructureName from the given identifier, which may be a
106         * domain name, a substructure identifier, etc.
107         * <p>
108         * The source and PDB-Id are extracted at compile time, but fully
109         * interpreting the ID, which may require additional parsing or remote
110         * calls, is done lazily.
111         * <p>
112         * The following sources are supported. Any may be prefixed by the source
113         * name followed by a colon (e.g. PDB:4HHB). In this case, that source will be used
114         * unequivocally. If no source is specified, StructureName will make a
115         * (usually reliable) guess as to which source was intended.
116         * <ul>
117         * <li><b>PDB</b>PDB identifier, optionally followed by chain and/or residue
118         *     ranges. Internally represented by a {@link SubstructureIdentifier};
119         *     see that class for the full format specification.
120         *     Examples: 4hhb, 4hhb.A, 4hhb.A:1-50.
121         * <li><b>SCOP</b> SCOP domain (or SCOPe, depending on the
122         *     {@link ScopFactory#getSCOP()} version). Example: d1h6w.2
123         * <li><b>PDP</b> Protein Domain Parser domain. PDP domains are not guessed,
124         *     making the PDP: prefix obligatory. Example: PDP:4HHBAa
125         * <li><b>CATH</b> Cath domains. Example: 1qvrC03
126         * <li><b>URL</b> Arbitrary URLs. Most common protocols are handled,
127         *     including http://, ftp://, and file://. Some parsing information can
128         *     be passed as custom query parameters. Example:
129         *     http://www.rcsb.org/pdb/files/1B8G.pdb.gz
130         * <li><b>FILE</b> A file path. Supports relative paths and expands ~ to
131         *     the user's home directory. Only existing files will be automatically
132         *     detected; to refer to a potentially not-yet existing file, prepend
133         *     the prefix. Internally represented as a {@link URLIdentifier}
134         *     after path expansion. Example: ~/custom_protein.pdb
135         * <li><b>ECOD</b> ECOD domain. Example: e1lyw.1
136         * <li><b>BIO</b> Biological assembly. These are not guessed, making
137         *     the BIO: prefix obligatory. Example: BIO:2ehz:1
138         * </ul>
139         * @param name An identifier string
140         * @throws IllegalArgumentException if the name has a recognizable source but is semantically invalid
141         */
142        public StructureName(String name){
143                this.name = name;
144
145                init();//sets pdbId and mySource
146        }
147
148
149        /**
150         * Tries to determine the source and pdbId without fully realizing the identifier,
151         * which could require I/O depending on the source
152         * @throws IllegalArgumentException if the source is recognizable but invalid
153         */
154        private void init(){
155
156                // First try identifying a prefix
157                String[] prefix = name.split(":", 2);
158                mySource = null;
159                if(prefix.length > 1) {
160                        // Match Source prefixes
161                        String suffix = prefix[1];
162                        try {
163                                mySource = Source.valueOf(prefix[0].toUpperCase());
164                        } catch( IllegalArgumentException e ) {
165                                // unrecognized prefix; fall back on guessing
166                                mySource = null;
167                        }
168                        if(mySource != null) {
169                                switch( mySource) {
170                                case SCOP:
171                                        if( ! initFromScop(suffix) )
172                                                throw new IllegalArgumentException("Malformed SCOP domain name:"+suffix);
173                                        return;
174                                case PDP:
175                                        if( ! initFromPDP(name) )
176                                                throw new IllegalArgumentException("Malformed PDP domain name:"+suffix);
177                                        return;
178                                case CATH:
179                                        if( ! initFromCATH(suffix) )
180                                                throw new IllegalArgumentException("Malformed CATH domain name:"+suffix);
181                                        return;
182                                case BIO:
183                                        if( ! initFromBIO(name) )
184                                                throw new IllegalArgumentException("Malformed BIO name:"+suffix);
185                                        return;
186                                case ECOD:
187                                        if( ! initFromECOD(suffix) )
188                                                throw new IllegalArgumentException("Malformed ECOD domain name:"+suffix);
189                                        return;
190                                case PDB:
191                                        if( ! initFromPDB(suffix) )
192                                                throw new IllegalArgumentException("Malformed PDB specification:"+suffix);
193                                        return;
194                                case FILE:
195                                        // Treat file:/ prefixes as URLs
196                                        if( ! suffix.startsWith("/")) {
197                                                // Otherwise, treat as file
198                                                initFromFile();
199                                                return;
200                                        }
201                                        // fall through to URL case
202                                case URL:
203                                        if( ! initFromURL(name))
204                                                throw new IllegalArgumentException("Malformed URL specification:"+suffix);
205                                        return;
206                                default:
207                                        throw new IllegalStateException("Unimplemented Source "+mySource);
208                                }
209                        }
210                }
211
212                // No known prefix, so revert to guessing
213
214                // First guess regex-based identifiers
215                // SCOP domain
216                if( initFromScop(name) )
217                        return;
218                // CATH
219                if( initFromCATH(name) )
220                        return;
221                // ECOD
222                if( initFromECOD(name) )
223                        return;
224                // Never guess BIO or PDP
225
226                // URL
227                if( initFromURL(name) )
228                        return;
229
230                // Guess FILE based on file existence
231                File file = new File(FileDownloadUtils.expandUserHome(name));
232                if( file.canRead() && !file.isDirectory() ) {
233                        // an attempt to mitigate issue #398. It doesn't fix it but it catches the most common case of passing a pdb id and finding a file in working dir matching it
234                        if (name.matches("\\d\\w\\w\\w")) {
235                                // the plain pdb id case, this is unlikely to be what the user wants: let's let it through but warn about it
236                                logger.warn("Provided 4-letter structure name '{}' matches "
237                                                + "file name in directory {}. Will read structure "
238                                                + "data from file {} and not consider the name as a "
239                                                + "structure identifier. If this is not what you "
240                                                + "want, use 'FILE:{}'",
241                                                name, file.getAbsoluteFile().getParent(),
242                                                file.getAbsolutePath(), name);
243                        } else {
244                                logger.info("Provided structure name '{}' matches "
245                                                + "file name in directory {}. Will read structure "
246                                                + "data from file {}.",
247                                                name, file.getAbsoluteFile().getParent(),
248                                                file.getAbsolutePath());
249                        }
250
251                        initFromFile();
252                        return;
253                }
254
255                // Default to PDB
256                initFromPDB( name );
257        }
258
259        private boolean initFromScop(String name) {
260                Matcher matcher = scopPattern.matcher(name);
261                if ( matcher.matches() ) {
262                        mySource = Source.SCOP;
263                        pdbId = matcher.group(1).toUpperCase();
264                        chainName = matcher.group(2);
265                        return true;
266                }
267                return false;
268        }
269        private boolean initFromPDP(String name) {
270                Matcher matcher = PDPDomain.PDP_NAME_PATTERN.matcher(name);
271                if( matcher.matches() ) {
272                        pdbId = matcher.group(1).toUpperCase();
273                        chainName = matcher.group(2);
274                        return true;
275                }
276                return false;
277        }
278        private boolean initFromCATH(String name) {
279                Matcher matcher = cathPattern.matcher(name);
280                if ( matcher.matches() ){
281                        mySource = Source.CATH;
282                        pdbId = matcher.group(1).toUpperCase();
283                        chainName = matcher.group(2);
284                        return true;
285                }
286                return false;
287        }
288        private boolean initFromECOD(String name) {
289                Matcher matcher = ecodPattern.matcher(name);
290                if ( matcher.matches() ){
291                        mySource = Source.ECOD;
292                        pdbId = matcher.group(1).toUpperCase();
293                        chainName = null;
294                        return true;
295                }
296                return false;
297        }
298        private boolean initFromBIO(String name) {
299                Matcher matcher = BioAssemblyIdentifier.BIO_NAME_PATTERN.matcher(name);
300                if( matcher.matches() ) {
301                        pdbId = matcher.group(1).toUpperCase();
302                        return true;
303                }
304                return false;
305        }
306        private boolean initFromPDB(String suffix) {
307                mySource = Source.PDB;
308                SubstructureIdentifier si = new SubstructureIdentifier(suffix);
309                base = si; // Safe to realize immediately
310
311                pdbId = si.getPdbId();
312                // Set chainName if unique
313                Set<String> chains = getChainNames(si);
314                if(chains.size() == 1) {
315                        this.chainName = chains.iterator().next();
316                } else if(chains.size() > 1) {
317                        this.chainName = ".";
318                } else {
319                        this.chainName = null;
320                }
321                return true;
322        }
323        private boolean initFromURL(String suffix) {
324                try {
325                        URL url = new URL(suffix);
326                        String path = url.getPath();
327                        mySource = Source.URL;
328                        pdbId = URLIdentifier.guessPDBID( path.substring(path.lastIndexOf('/')+1) );
329                        chainName = null; // Don't bother checking query params here
330                        return true;
331                } catch(MalformedURLException e) {
332                        return false;
333                }
334        }
335        private boolean initFromFile() {
336                mySource = Source.FILE;
337                pdbId = null;
338                chainName = null;
339                return true;
340        }
341
342        private static Set<String> getChainNames(SubstructureIdentifier si) {
343                Set<String> chains = new TreeSet<String>();
344                List<ResidueRange> ranges = si.getResidueRanges();
345                for(ResidueRange range : ranges) {
346                        String chainName = range.getChainName();
347                        if(chainName != null) {
348                                chains.add(chainName);
349                        }
350                }
351                return chains;
352        }
353
354        /**
355         * Get the PDB ID for this name, if any.
356         *
357         * Equivalent to {@link SubstructureIdentifier#getPdbId()
358         * toCanonical().getPdbId()}
359         * @return The upper-case PDB Name, or null if not applicable
360         * @throws StructureException Wraps errors which occur when converting to canonical form
361         */
362        public String getPdbId() throws StructureException {
363                if( pdbId == null) {
364                        pdbId = toCanonical().getPdbId();
365                }
366                return pdbId;
367        }
368
369        /**
370         * Gets the chain ID, for structures where it is unique and well-defined.
371         * May return '.' for multi-chain ranges, '_' for wildcard chains, or
372         * null if the information is unavailable.
373         *
374         * <p>This method should only be used casually. For precise chainIds, it
375         * is better to use {@link #toCanonical()} and iterate through the
376         * residue ranges.
377         * @return
378         */
379        public String getChainId() {
380                return chainName;
381        }
382        
383        /**
384         * Get the original form of the identifier
385         */
386        @Override
387        public String getIdentifier() {
388                return name;
389        }
390
391        @Override
392        public String toString(){
393
394                return name;
395        }
396
397
398        public boolean isScopName() {
399                return mySource == Source.SCOP;
400        }
401
402        public boolean isPDPDomain(){
403                return mySource == Source.PDP;
404        }
405
406        public boolean isCathID(){
407                return mySource == Source.CATH;
408        }
409
410        public boolean isPdbId(){
411                return mySource == Source.PDB;
412        }
413
414        public boolean isURL() {
415                return mySource == Source.URL;
416        }
417
418        /**
419         * Indicates that the identifier was determined to correspond to a file.
420         * Note that some file identifiers may also be valid URLs; in that case,
421         * the URL source is preferred.
422         * @return
423         */
424        public boolean isFile() {
425                return mySource == Source.FILE;
426        }
427
428        public boolean isEcodDomain() {
429                return mySource == Source.ECOD;
430        }
431
432        public boolean isBioAssembly() {
433                return mySource == Source.BIO;
434        }
435
436        public Source getSource() {
437                return mySource;
438        }
439
440        /**
441         * StructureName wraps another StructureIdentifier. The type of the base
442         * identifier depends on the {@link #getSource() source}. Most StructureName
443         * methods deligate to the base identifier.
444         *
445         * <p>It is possible that future versions of StructureName might change the
446         * return type. Except for some specialized uses, it is probably better
447         * to create the correct type of identifier directly, rather than creating
448         * a StructureName and casting the result of this method.
449         * @return A Str
450         * @throws StructureException Wraps exceptions that may be thrown by
451         *  individual implementations. For example, a SCOP identifier may require
452         *  that the domain definitions be available for download.
453         */
454        public StructureIdentifier getBaseIdentifier() throws StructureException {
455                if( base == null ) {
456
457                        switch(mySource) {
458                        case CATH:
459                                base = CathFactory.getCathDatabase().getDescriptionByCathId(getIdentifier());
460                                break;
461                        case ECOD:
462                                try {
463                                        base = EcodFactory.getEcodDatabase().getDomainsById(name);
464                                } catch (IOException e) {
465                                        throw new StructureException("Unable to get ECOD domain "+name,e);
466                                }
467                                break;
468                        case SCOP:
469                                // Fuzzy matching of the domain name to the current default factory
470                                base = guessScopDomain(getIdentifier(),ScopFactory.getSCOP());
471                                if(base == null) {
472                                        // Guessing didn't work, so just use the PDBID and Chain from name
473                                        // Guess that '_' means 'whole structure'
474                                        if (chainName.equals("_")) {
475                                                base = new SubstructureIdentifier(pdbId);
476                                        } else {
477                                                base = new SubstructureIdentifier(pdbId,ResidueRange.parseMultiple(chainName));
478                                        }
479                                        logger.error("Unable to find {}, so using {}",name,base);
480                                }
481                                break;
482                        case FILE:
483                                try {
484                                        String[] prefix = name.split(":", 2);
485                                        String filename;
486                                        if(prefix.length > 1) {
487                                                filename = prefix[1];
488                                        } else {
489                                                filename = name;
490                                        }
491                                        filename = FileDownloadUtils.expandUserHome(filename);
492                                        base = new URLIdentifier(new File(filename).toURI().toURL());
493                                } catch (MalformedURLException e) {
494                                        // Should never happen
495                                        throw new StructureException("Unable to get URL for file: "+name,e);
496                                }
497                                break;
498                        case URL:
499                                try {
500                                        base = new URLIdentifier(name);
501                                } catch (MalformedURLException e) {
502                                        throw new StructureException("Invalid URL: "+name,e);
503                                }
504                                break;
505                        case PDP:
506                                try {
507                                        PDPProvider provider = new RemotePDPProvider(false);
508                                        base = provider.getPDPDomain(name);
509                                } catch (IOException e) {
510                                        throw new StructureException("Unable to fetch PDP domain "+name, e);
511                                }
512                                break;
513                        case BIO:
514                                base = new BioAssemblyIdentifier(name);
515                                break;
516                        case PDB:
517                                base = new SubstructureIdentifier(getIdentifier());
518                                break;
519                        default:
520                                throw new IllegalStateException("Unimplemented source: "+mySource);
521                        }
522                }
523                return base;
524        }
525
526        @Override
527        public SubstructureIdentifier toCanonical() throws StructureException {
528                return getBaseIdentifier().toCanonical();
529        }
530
531        @Override
532        public Structure reduce(Structure input) throws StructureException {
533                return getBaseIdentifier().reduce(input);
534        }
535
536        @Override
537        public Structure loadStructure(AtomCache cache) throws StructureException,
538        IOException {
539                return getBaseIdentifier().loadStructure(cache);
540        }
541
542        @Override
543        public int hashCode() {
544                final int prime = 31;
545                int result = 1;
546                result = prime * result + ((name == null) ? 0 : name.hashCode());
547                return result;
548        }
549
550        @Override
551        public boolean equals(Object obj) {
552                if (this == obj)
553                        return true;
554                if (obj == null)
555                        return false;
556                if (getClass() != obj.getClass())
557                        return false;
558                StructureName other = (StructureName) obj;
559                if (name == null) {
560                        if (other.name != null)
561                                return false;
562                } else if (!name.equals(other.name))
563                        return false;
564                return true;
565        }
566
567        /**
568         * Orders identifiers lexicographically by PDB ID and then full Identifier
569         */
570        @Override
571        public int compareTo(StructureName o) {
572                if ( this.equals(o))
573                        return 0;
574
575                String pdb1 = null;
576                String pdb2 = null;
577                try {
578                        pdb1 = this.getPdbId();
579                } catch (StructureException e) {}
580                try {
581                        pdb2 = this.getPdbId();
582                } catch (StructureException e) {}
583
584                int comp = 0;
585
586                // Sort those with PDBIDs before those without
587                if( pdb1 == null ) {
588                        if( pdb2 != null) {
589                                return 1; // this > o
590                        }
591                        // both null
592                } else if( pdb2 == null){
593                        return -1; // this < o
594                } else {
595                        // neither null
596                        comp = pdb1.compareTo(pdb2);
597                }
598                if( comp != 0 ) {
599                        return comp;
600                }
601
602                // break tie with full identifiers
603                pdb1 = this.getIdentifier();
604                pdb2 = o.getIdentifier();
605
606                // Throws NPE for nulls
607                return pdb1.compareTo(pdb2);
608        }
609
610        /**
611         * <p>
612         * Guess a scop domain. If an exact match is found, return that.
613         *
614         * <p>
615         * Otherwise, return the first scop domain found for the specified protein such that
616         * <ul>
617         * <li>The chains match, or one of the chains is '_' or '.'.
618         * <li>The domains match, or one of the domains is '_'.
619         * </ul>
620         *
621         * In some cases there may be several valid matches. In this case a warning
622         * will be logged.
623         *
624         * @param name SCOP domain name, or a guess thereof
625         * @param scopDB SCOP domain provider
626         * @return The best match for name among the domains of scopDB, or null if none match.
627         */
628        public static ScopDomain guessScopDomain(String name, ScopDatabase scopDB) {
629                List<ScopDomain> matches = new LinkedList<ScopDomain>();
630
631                // Try exact match first
632                ScopDomain domain = scopDB.getDomainByScopID(name);
633                if (domain != null) {
634                        return domain;
635                }
636
637                // Didn't work. Guess it!
638                logger.warn("Warning, could not find SCOP domain: " + name);
639
640                Matcher scopMatch = scopPattern.matcher(name);
641                if (scopMatch.matches()) {
642                        String pdbID = scopMatch.group(1);
643                        String chainName = scopMatch.group(2);
644                        String domainID = scopMatch.group(3);
645
646                        for (ScopDomain potentialSCOP : scopDB.getDomainsForPDB(pdbID)) {
647                                Matcher potMatch = scopPattern.matcher(potentialSCOP.getScopId());
648                                if (potMatch.matches()) {
649                                        if (chainName.equals(potMatch.group(2)) || chainName.equals("_") || chainName.equals(".")
650                                                        || potMatch.group(2).equals("_") || potMatch.group(2).equals(".")) {
651                                                if (domainID.equals(potMatch.group(3)) || domainID.equals("_") || potMatch.group(3).equals("_")) {
652                                                        // Match, or near match
653                                                        matches.add(potentialSCOP);
654                                                }
655                                        }
656                                }
657                        }
658                }
659
660                Iterator<ScopDomain> match = matches.iterator();
661                if (match.hasNext()) {
662                        ScopDomain bestMatch = match.next();
663                        if(logger.isWarnEnabled()) {
664                                StringBuilder warnMsg = new StringBuilder();
665                                warnMsg.append("Trying domain " + bestMatch.getScopId() + ".");
666                                if (match.hasNext()) {
667                                        warnMsg.append(" Other possibilities: ");
668                                        while (match.hasNext()) {
669                                                warnMsg.append(match.next().getScopId()).append(" ");
670                                        }
671                                }
672                                warnMsg.append(System.getProperty("line.separator"));
673                                logger.warn(warnMsg.toString());
674                        }
675                        return bestMatch;
676                } else {
677                        return null;
678                }
679        }
680
681
682
683}