001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure.align.client;
022
023
024import java.io.File;
025import java.io.IOException;
026import java.io.Serializable;
027import java.net.MalformedURLException;
028import java.net.URL;
029import java.util.Iterator;
030import java.util.LinkedList;
031import java.util.List;
032import java.util.Set;
033import java.util.TreeSet;
034import java.util.regex.Matcher;
035import java.util.regex.Pattern;
036
037import org.biojava.nbio.structure.BioAssemblyIdentifier;
038import org.biojava.nbio.structure.PdbId;
039import org.biojava.nbio.structure.ResidueRange;
040import org.biojava.nbio.structure.Structure;
041import org.biojava.nbio.structure.StructureException;
042import org.biojava.nbio.structure.StructureIdentifier;
043import org.biojava.nbio.structure.SubstructureIdentifier;
044import org.biojava.nbio.structure.URLIdentifier;
045import org.biojava.nbio.structure.align.util.AtomCache;
046import org.biojava.nbio.structure.cath.CathDomain;
047import org.biojava.nbio.structure.cath.CathFactory;
048import org.biojava.nbio.structure.ecod.EcodFactory;
049import org.biojava.nbio.core.util.FileDownloadUtils;
050import org.biojava.nbio.structure.scop.ScopDatabase;
051import org.biojava.nbio.structure.scop.ScopDomain;
052import org.biojava.nbio.structure.scop.ScopFactory;
053import org.slf4j.Logger;
054import org.slf4j.LoggerFactory;
055
056
057/**
058 * A utility class that makes working with names of structures, domains and ranges easier.
059 *
060 * Accepts a wide range of identifier formats, including {@link ScopDomain},
061 * {@link CathDomain}, PDP domains, and {@link SubstructureIdentifier} residue
062 * ranges.
063 *
064 * Where possible, data is extracted from the input string. Otherwise, range
065 * information may be loaded from one of the factory classes:
066 * {@link CathFactory},{@link ScopFactory}, etc.
067 *
068 * @see #getIdentifier() the name. e.g. 4hhb, 4hhb.A, d4hhba_ etc.
069 */
070
071public class StructureName implements Comparable<StructureName>, Serializable, StructureIdentifier {
072        private static final long serialVersionUID = 4021229518711762957L;
073        private static final Logger logger = LoggerFactory.getLogger(StructureName.class);
074
075        protected String name;
076        protected PdbId pdbId;
077        protected String chainName;
078
079        //TODO Double check all of the modified patterns
080        private static final Pattern cathPattern = Pattern.compile("^(?:CATH:)?([0-9][a-z0-9]{3})(\\w)([0-9]{2})$",Pattern.CASE_INSENSITIVE);
081        // ds046__ is a special case with no PDB entry
082        private static final Pattern scopPattern = Pattern.compile("^(?:SCOP:)?d([0-9][a-z0-9]{3}|s046)(\\w|\\.)(\\w)$",Pattern.CASE_INSENSITIVE);
083        // ECOD chains and domains can't be automatically distinguished. Ex: e3j9zS13 is chain 'S1', e1wz2B14 is chain 'B'
084        private static final Pattern ecodPattern = Pattern.compile("^(?:ECOD:)?e([0-9][a-z0-9]{3})(?:\\w|\\.)\\w+$",Pattern.CASE_INSENSITIVE);
085
086        // Names are automatically used as prefixes
087        public enum Source {
088                PDB,
089                SCOP,
090                CATH,
091                URL,
092                FILE,
093                ECOD,
094                BIO,
095        };
096
097        private Source mySource = null;
098
099        // cache for getBaseIdentifier() method
100        private StructureIdentifier base = null;
101
102        /**
103         * Create a new StructureName from the given identifier, which may be a
104         * domain name, a substructure identifier, etc.
105         * <p>
106         * The source and PDB-Id are extracted at compile time, but fully
107         * interpreting the ID, which may require additional parsing or remote
108         * calls, is done lazily.
109         * <p>
110         * The following sources are supported. Any may be prefixed by the source
111         * name followed by a colon (e.g. PDB:4HHB). In this case, that source will be used
112         * unequivocally. If no source is specified, StructureName will make a
113         * (usually reliable) guess as to which source was intended.
114         * <ul>
115         * <li><b>PDB</b>PDB identifier, optionally followed by chain and/or residue
116         *     ranges. Internally represented by a {@link SubstructureIdentifier};
117         *     see that class for the full format specification.
118         *     Examples: 4hhb, 4hhb.A, 4hhb.A:1-50.
119         * <li><b>SCOP</b> SCOP domain (or SCOPe, depending on the
120         *     {@link ScopFactory#getSCOP()} version). Example: d1h6w.2
121         * <li><b>CATH</b> Cath domains. Example: 1qvrC03
122         * <li><b>URL</b> Arbitrary URLs. Most common protocols are handled,
123         *     including http://, ftp://, and file://. Some parsing information can
124         *     be passed as custom query parameters. Example:
125         *     http://www.rcsb.org/pdb/files/1B8G.pdb.gz
126         * <li><b>FILE</b> A file path. Supports relative paths and expands ~ to
127         *     the user's home directory. Only existing files will be automatically
128         *     detected; to refer to a potentially not-yet existing file, prepend
129         *     the prefix. Internally represented as a {@link URLIdentifier}
130         *     after path expansion. Example: ~/custom_protein.pdb
131         * <li><b>ECOD</b> ECOD domain. Example: e1lyw.1
132         * <li><b>BIO</b> Biological assembly. These are not guessed, making
133         *     the BIO: prefix obligatory. Example: BIO:2ehz:1
134         * </ul>
135         * @param name An identifier string
136         * @throws IllegalArgumentException if the name has a recognizable source but is semantically invalid
137         */
138        public StructureName(String name){
139                this.name = name;
140
141                init();//sets pdbId and mySource
142        }
143
144
145        /**
146         * Tries to determine the source and pdbId without fully realizing the identifier,
147         * which could require I/O depending on the source
148         * @throws IllegalArgumentException if the source is recognizable but invalid
149         */
150        private void init(){
151
152                // First try identifying a prefix
153                String[] prefix = name.split(":", 2);
154                mySource = null;
155                if(prefix.length > 1) {
156                        // Match Source prefixes
157                        String suffix = prefix[1];
158                        try {
159                                mySource = Source.valueOf(prefix[0].toUpperCase());
160                        } catch( IllegalArgumentException e ) {
161                                // unrecognized prefix; fall back on guessing
162                                mySource = null;
163                        }
164                        if(mySource != null) {
165                                switch( mySource) {
166                                case SCOP:
167                                        if( ! initFromScop(suffix) )
168                                                throw new IllegalArgumentException("Malformed SCOP domain name:"+suffix);
169                                        return;
170                                case CATH:
171                                        if( ! initFromCATH(suffix) )
172                                                throw new IllegalArgumentException("Malformed CATH domain name:"+suffix);
173                                        return;
174                                case BIO:
175                                        if( ! initFromBIO(name) )
176                                                throw new IllegalArgumentException("Malformed BIO name:"+suffix);
177                                        return;
178                                case ECOD:
179                                        if( ! initFromECOD(suffix) )
180                                                throw new IllegalArgumentException("Malformed ECOD domain name:"+suffix);
181                                        return;
182                                case PDB:
183                                        if( ! initFromPDB(suffix) )
184                                                throw new IllegalArgumentException("Malformed PDB specification:"+suffix);
185                                        return;
186                                case FILE:
187                                        // Treat file:/ prefixes as URLs
188                                        if( ! suffix.startsWith("/")) {
189                                                // Otherwise, treat as file
190                                                initFromFile();
191                                                return;
192                                        }
193                                        // fall through to URL case
194                                case URL:
195                                        if( ! initFromURL(name))
196                                                throw new IllegalArgumentException("Malformed URL specification:"+suffix);
197                                        return;
198                                default:
199                                        throw new IllegalStateException("Unimplemented Source "+mySource);
200                                }
201                        }
202                }
203
204                // No known prefix, so revert to guessing
205
206                // First guess regex-based identifiers
207                // SCOP domain
208                if( initFromScop(name) )
209                        return;
210                // CATH
211                if( initFromCATH(name) )
212                        return;
213                // ECOD
214                if( initFromECOD(name) )
215                        return;
216                // Never guess BIO or PDP
217
218                // URL
219                if( initFromURL(name) )
220                        return;
221
222                // Guess FILE based on file existence
223                File file = new File(FileDownloadUtils.expandUserHome(name));
224                if( file.canRead() && !file.isDirectory() ) {
225                        // an attempt to mitigate issue #398. It doesn't fix it but it catches the most common case of passing a pdb id and finding a file in working dir matching it
226                        if (name.matches("\\d\\w\\w\\w")) {
227                                // the plain pdb id case, this is unlikely to be what the user wants: let's let it through but warn about it
228                                logger.warn("Provided 4-letter structure name '{}' matches "
229                                                + "file name in directory {}. Will read structure "
230                                                + "data from file {} and not consider the name as a "
231                                                + "structure identifier. If this is not what you "
232                                                + "want, use 'FILE:{}'",
233                                                name, file.getAbsoluteFile().getParent(),
234                                                file.getAbsolutePath(), name);
235                        } else {
236                                logger.info("Provided structure name '{}' matches "
237                                                + "file name in directory {}. Will read structure "
238                                                + "data from file {}.",
239                                                name, file.getAbsoluteFile().getParent(),
240                                                file.getAbsolutePath());
241                        }
242
243                        initFromFile();
244                        return;
245                }
246
247                // Default to PDB
248                initFromPDB(name);
249        }
250
251        private boolean initFromScop(String name) {
252                Matcher matcher = scopPattern.matcher(name);
253                if ( matcher.matches() ) {
254                        mySource = Source.SCOP;
255                        pdbId = new PdbId(matcher.group(1));
256                        chainName = matcher.group(2);
257                        return true;
258                }
259                return false;
260        }
261
262        private boolean initFromCATH(String name) {
263                Matcher matcher = cathPattern.matcher(name);
264                if ( matcher.matches() ){
265                        mySource = Source.CATH;
266                        pdbId = new PdbId(matcher.group(1));
267                        chainName = matcher.group(2);
268                        return true;
269                }
270                return false;
271        }
272        private boolean initFromECOD(String name) {
273                Matcher matcher = ecodPattern.matcher(name);
274                if ( matcher.matches() ){
275                        mySource = Source.ECOD;
276                        pdbId = new PdbId(matcher.group(1));
277                        chainName = null;
278                        return true;
279                }
280                return false;
281        }
282        private boolean initFromBIO(String name) {
283                Matcher matcher = BioAssemblyIdentifier.BIO_NAME_PATTERN.matcher(name);
284                if( matcher.matches() ) {
285                        pdbId = new PdbId(matcher.group(1));
286                        return true;
287                }
288                return false;
289        }
290        private boolean initFromPDB(String suffix) {
291                mySource = Source.PDB;
292                SubstructureIdentifier si = new SubstructureIdentifier(suffix);
293
294                base = si; // Safe to realize immediately
295
296                pdbId = si.getPdbId();
297                // Set chainName if unique
298                Set<String> chains = getChainNames(si);
299                if(chains.size() == 1) {
300                        this.chainName = chains.iterator().next();
301                } else if(chains.size() > 1) {
302                        this.chainName = ".";
303                } else {
304                        this.chainName = null;
305                }
306                return true;
307        }
308        private boolean initFromURL(String suffix) {
309                try {
310                        URL url = new URL(suffix);
311                        String path = url.getPath();
312                        mySource = Source.URL;
313                        try {
314                                pdbId = new PdbId(URLIdentifier.guessPDBID( path.substring(path.lastIndexOf('/')+1) ));
315                        } catch (IllegalArgumentException e) {
316                                pdbId = null;
317                        }
318                        chainName = null; // Don't bother checking query params here
319                        return true;
320                } catch(MalformedURLException e) {
321                        return false;
322                }
323        }
324        private boolean initFromFile() {
325                mySource = Source.FILE;
326                pdbId = null;
327                chainName = null;
328                return true;
329        }
330
331        private static Set<String> getChainNames(SubstructureIdentifier si) {
332                Set<String> chains = new TreeSet<String>();
333                List<ResidueRange> ranges = si.getResidueRanges();
334                for(ResidueRange range : ranges) {
335                        String chainName = range.getChainName();
336                        if(chainName != null) {
337                                chains.add(chainName);
338                        }
339                }
340                return chains;
341        }
342
343        /**
344         * Get the PDB ID for this name, if any.
345         *
346         * Equivalent to {@link SubstructureIdentifier#getPdbId()
347         * toCanonical().getPdbId()}
348         * @return The upper-case PDB Name, or null if not applicable
349         * @throws StructureException Wraps errors which occur when converting to canonical form
350         * @since 6.0.0
351         */
352        public PdbId getPdbId() throws StructureException {
353                if( pdbId == null) {
354                        pdbId = toCanonical().getPdbId();
355                }
356                return pdbId;
357        }
358        
359        /**
360         * Gets the chain ID, for structures where it is unique and well-defined.
361         * May return '.' for multi-chain ranges, '_' for wildcard chains, or
362         * null if the information is unavailable.
363         *
364         * <p>This method should only be used casually. For precise chainIds, it
365         * is better to use {@link #toCanonical()} and iterate through the
366         * residue ranges.
367         * @return
368         */
369        public String getChainId() {
370                return chainName;
371        }
372
373        /**
374         * Get the original form of the identifier
375         */
376        @Override
377        public String getIdentifier() {
378                return name;
379        }
380
381        @Override
382        public String toString(){
383
384                return name;
385        }
386
387
388        public boolean isScopName() {
389                return mySource == Source.SCOP;
390        }
391
392        public boolean isCathID(){
393                return mySource == Source.CATH;
394        }
395
396        public boolean isPdbId(){
397                return mySource == Source.PDB;
398        }
399
400        public boolean isURL() {
401                return mySource == Source.URL;
402        }
403
404        /**
405         * Indicates that the identifier was determined to correspond to a file.
406         * Note that some file identifiers may also be valid URLs; in that case,
407         * the URL source is preferred.
408         * @return
409         */
410        public boolean isFile() {
411                return mySource == Source.FILE;
412        }
413
414        public boolean isEcodDomain() {
415                return mySource == Source.ECOD;
416        }
417
418        public boolean isBioAssembly() {
419                return mySource == Source.BIO;
420        }
421
422        public Source getSource() {
423                return mySource;
424        }
425
426        /**
427         * StructureName wraps another StructureIdentifier. The type of the base
428         * identifier depends on the {@link #getSource() source}. Most StructureName
429         * methods deligate to the base identifier.
430         *
431         * <p>It is possible that future versions of StructureName might change the
432         * return type. Except for some specialized uses, it is probably better
433         * to create the correct type of identifier directly, rather than creating
434         * a StructureName and casting the result of this method.
435         * @return A Str
436         * @throws StructureException Wraps exceptions that may be thrown by
437         *  individual implementations. For example, a SCOP identifier may require
438         *  that the domain definitions be available for download.
439         */
440        public StructureIdentifier getBaseIdentifier() throws StructureException {
441                if( base == null ) {
442
443                        switch(mySource) {
444                        case CATH:
445                                base = CathFactory.getCathDatabase().getDescriptionByCathId(getIdentifier());
446                                break;
447                        case ECOD:
448                                try {
449                                        base = EcodFactory.getEcodDatabase().getDomainsById(name);
450                                } catch (IOException e) {
451                                        throw new StructureException("Unable to get ECOD domain "+name,e);
452                                }
453                                break;
454                        case SCOP:
455                                // Fuzzy matching of the domain name to the current default factory
456                                base = guessScopDomain(getIdentifier(),ScopFactory.getSCOP());
457                                if(base == null) {
458                                        // Guessing didn't work, so just use the PDBID and Chain from name
459                                        // Guess that '_' means 'whole structure'
460                                        if (chainName.equals("_")) {
461                                                base = new SubstructureIdentifier(pdbId.getId());
462                                        } else {
463                                                base = new SubstructureIdentifier(pdbId, ResidueRange.parseMultiple(chainName));
464                                        }
465                                        logger.error("Unable to find {}, so using {}",name,base);
466                                }
467                                break;
468                        case FILE:
469                                try {
470                                        String[] prefix = name.split(":", 2);
471                                        String filename;
472                                        if(prefix.length > 1) {
473                                                filename = prefix[1];
474                                        } else {
475                                                filename = name;
476                                        }
477                                        filename = FileDownloadUtils.expandUserHome(filename);
478                                        base = new URLIdentifier(new File(filename).toURI().toURL());
479                                } catch (MalformedURLException e) {
480                                        // Should never happen
481                                        throw new StructureException("Unable to get URL for file: "+name,e);
482                                }
483                                break;
484                        case URL:
485                                try {
486                                        base = new URLIdentifier(name);
487                                } catch (MalformedURLException e) {
488                                        throw new StructureException("Invalid URL: "+name,e);
489                                }
490                                break;
491                        case BIO:
492                                base = new BioAssemblyIdentifier(name);
493                                break;
494                        case PDB:
495                                base = new SubstructureIdentifier(getIdentifier());
496                                break;
497                        default:
498                                throw new IllegalStateException("Unimplemented source: "+mySource);
499                        }
500                }
501                return base;
502        }
503
504        @Override
505        public SubstructureIdentifier toCanonical() throws StructureException {
506                return getBaseIdentifier().toCanonical();
507        }
508
509        @Override
510        public Structure reduce(Structure input) throws StructureException {
511                return getBaseIdentifier().reduce(input);
512        }
513
514        @Override
515        public Structure loadStructure(AtomCache cache) throws StructureException,
516        IOException {
517                return getBaseIdentifier().loadStructure(cache);
518        }
519
520        @Override
521        public int hashCode() {
522                final int prime = 31;
523                int result = 1;
524                result = prime * result + ((name == null) ? 0 : name.hashCode());
525                return result;
526        }
527
528        @Override
529        public boolean equals(Object obj) {
530                if (this == obj)
531                        return true;
532                if (obj == null)
533                        return false;
534                if (getClass() != obj.getClass())
535                        return false;
536                StructureName other = (StructureName) obj;
537                if (name == null) {
538                        if (other.name != null)
539                                return false;
540                } else if (!name.equals(other.name))
541                        return false;
542                return true;
543        }
544
545        /**
546         * Orders identifiers lexicographically by PDB ID and then full Identifier
547         */
548        @Override
549        public int compareTo(StructureName o) {
550                if ( this.equals(o))
551                        return 0;
552
553                PdbId pdb1 = null;
554                PdbId pdb2 = null;
555                try {
556                        pdb1 = this.getPdbId();
557                } catch (StructureException e) {}
558                try {
559                        pdb2 = this.getPdbId();
560                } catch (StructureException e) {}
561
562                int comp = 0;
563
564                // Sort those with PDBIDs before those without
565                if( pdb1 == null ) {
566                        if( pdb2 != null) {
567                                return 1; // this > o
568                        }
569                        // both null
570                } else if( pdb2 == null){
571                        return -1; // this < o
572                } else {
573                        // neither null
574                        comp = pdb1.compareTo(pdb2);
575                }
576                if( comp != 0 ) {
577                        return comp;
578                }
579
580                // break tie with full identifiers
581                String pdb1Str = this.getIdentifier();
582                String pdb2Str = o.getIdentifier();
583
584                // Throws NPE for nulls
585                return pdb1Str.compareTo(pdb2Str);
586        }
587
588        /**
589         * <p>
590         * Guess a scop domain. If an exact match is found, return that.
591         *
592         * <p>
593         * Otherwise, return the first scop domain found for the specified protein such that
594         * <ul>
595         * <li>The chains match, or one of the chains is '_' or '.'.
596         * <li>The domains match, or one of the domains is '_'.
597         * </ul>
598         *
599         * In some cases there may be several valid matches. In this case a warning
600         * will be logged.
601         *
602         * @param name SCOP domain name, or a guess thereof
603         * @param scopDB SCOP domain provider
604         * @return The best match for name among the domains of scopDB, or null if none match.
605         */
606        public static ScopDomain guessScopDomain(String name, ScopDatabase scopDB) {
607                List<ScopDomain> matches = new LinkedList<ScopDomain>();
608
609                // Try exact match first
610                ScopDomain domain = scopDB.getDomainByScopID(name);
611                if (domain != null) {
612                        return domain;
613                }
614
615                // Didn't work. Guess it!
616                logger.warn("Warning, could not find SCOP domain: " + name);
617
618                Matcher scopMatch = scopPattern.matcher(name);
619                if (scopMatch.matches()) {
620                        String pdbID = scopMatch.group(1);
621                        String chainName = scopMatch.group(2);
622                        String domainID = scopMatch.group(3);
623
624                        for (ScopDomain potentialSCOP : scopDB.getDomainsForPDB(pdbID)) {
625                                Matcher potMatch = scopPattern.matcher(potentialSCOP.getScopId());
626                                if (potMatch.matches()) {
627                                        if (chainName.equals(potMatch.group(2)) || chainName.equals("_") || chainName.equals(".")
628                                                        || potMatch.group(2).equals("_") || potMatch.group(2).equals(".")) {
629                                                if (domainID.equals(potMatch.group(3)) || domainID.equals("_") || potMatch.group(3).equals("_")) {
630                                                        // Match, or near match
631                                                        matches.add(potentialSCOP);
632                                                }
633                                        }
634                                }
635                        }
636                }
637
638                Iterator<ScopDomain> match = matches.iterator();
639                if (match.hasNext()) {
640                        ScopDomain bestMatch = match.next();
641                        if(logger.isWarnEnabled()) {
642                                StringBuilder warnMsg = new StringBuilder();
643                                warnMsg.append("Trying domain " + bestMatch.getScopId() + ".");
644                                if (match.hasNext()) {
645                                        warnMsg.append(" Other possibilities: ");
646                                        while (match.hasNext()) {
647                                                warnMsg.append(match.next().getScopId()).append(" ");
648                                        }
649                                }
650                                warnMsg.append(System.getProperty("line.separator"));
651                                logger.warn(warnMsg.toString());
652                        }
653                        return bestMatch;
654                } else {
655                        return null;
656                }
657        }
658
659
660
661}