001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure;
022
023import org.biojava.nbio.structure.align.util.AtomCache;
024import org.biojava.nbio.structure.io.PDBFileReader;
025import org.biojava.nbio.structure.io.cif.CifStructureConverter;
026import org.biojava.nbio.structure.io.StructureFiletype;
027import org.biojava.nbio.structure.io.mmtf.MmtfActions;
028import org.slf4j.Logger;
029import org.slf4j.LoggerFactory;
030
031import java.io.IOException;
032import java.io.UnsupportedEncodingException;
033import java.net.MalformedURLException;
034import java.net.URL;
035import java.net.URLDecoder;
036import java.util.Collections;
037import java.util.LinkedHashMap;
038import java.util.List;
039import java.util.Map;
040import java.util.regex.Matcher;
041import java.util.regex.Pattern;
042
043/**
044 * Represents a structure loaded from a URL (including a file URL)
045 * <p>
046 * A few custom query parameters are supported:
047 *
048 * <ul>
049 * <li><code>format=[pdb|cif]</code> Specify the file format (will otherwise be
050 *     guessed from the extension)
051 * <li><code>pdbId=[String]</code> Specify the PDB ID (also guessed from the filename)
052 * <li><code>chainID=[String]</code> A single chain from the structure
053 * <li><code>residues=[String]</code> Residue ranges, in a form understood by
054 *     {@link SubstructureIdentifier}
055 * </ul>
056 * @author Spencer Bliven
057 *
058 */
059public class URLIdentifier implements StructureIdentifier {
060        private static final long serialVersionUID = -5161230822868926035L;
061        private static final Logger logger = LoggerFactory.getLogger(URLIdentifier.class);
062
063        // Used for guessing the PDB ID from the filename
064        //UPDATE: It seems that this RegEx rarely succeeded , because the file
065        //name is most of the time in the format pdbxxxx.EXT not xxxx.EXT.
066        private static final Pattern PDBID_REGEX = Pattern.compile("^(?:pdb)?([0-9][a-z0-9]{3})([._-]|\\s).*", Pattern.CASE_INSENSITIVE);
067//      private static final Pattern PDBID_REGEX = Pattern.compile("^(?:pdb)?((PDB_[0-9]{4})?[0-9][a-z0-9]{3})([._-]|\\s).*", Pattern.CASE_INSENSITIVE);
068        
069        /** URL parameter specifying the file format (PDB or CIF) */
070        public static final String FORMAT_PARAM = "format";
071        /** URL parameter specifying the PDB ID */
072        public static final String PDBID_PARAM = "pdbid";
073        /** URL parameter specifying a single chain to include; overridden by residues */
074
075        //TODO: should this get renamed to chainname or asymid?
076        public static final String CHAINID_PARAM = "chainid";
077        /**
078         * URL parameter specifying residue ranges to include, e.g. <code>residues=A:1-70</code>
079         * @see SubstructureIdentifier
080         */
081        public static final String RESIDUES_PARAM = "residues";
082
083        final private URL url;
084        public URLIdentifier(URL url) {
085                this.url = url;
086        }
087
088        public URLIdentifier(String url) throws MalformedURLException {
089                this(new URL(url));
090        }
091
092        public URL getURL() {
093                return url;
094        }
095
096        @Override
097        public String getIdentifier() {
098                return url.toString();
099        }
100
101        /**
102         * @return A SubstructureIdentifier without ranges (e.g. including all residues)
103         */
104        @Override
105        public SubstructureIdentifier toCanonical() throws StructureException{
106                String pdbId = null;
107                List<ResidueRange> ranges = Collections.emptyList();
108                try {
109                        Map<String, String> params = parseQuery(url);
110                        if (params.containsKey(PDBID_PARAM)) {
111                                pdbId = params.get(PDBID_PARAM);
112                        }
113                        if (params.containsKey(RESIDUES_PARAM)) {
114                                ranges = ResidueRange.parseMultiple(params.get(RESIDUES_PARAM));
115                        } else if (params.containsKey(CHAINID_PARAM)) {
116                                ranges = Collections.singletonList(new ResidueRange(params.get(CHAINID_PARAM), (ResidueNumber) null, (ResidueNumber) null));
117                        }
118                } catch (UnsupportedEncodingException e) {
119                        logger.error("Unable to decode URL {}", url, e);
120                }
121                if (pdbId == null) {
122                        String path = url.getPath();
123                        pdbId = guessPDBID(path.substring(path.lastIndexOf("/") + 1));
124                }
125                return new SubstructureIdentifier((pdbId==null?(PdbId)null:new PdbId(pdbId)), ranges);
126        }
127
128        @Override
129        public Structure reduce(Structure input) throws StructureException {
130                return toCanonical().reduce(input);
131        }
132
133        /**
134         * Load the structure from the URL
135         * @return null
136         */
137        @Override
138        public Structure loadStructure(AtomCache cache) throws StructureException, IOException {
139                StructureFiletype format = StructureFiletype.UNKNOWN;
140
141                // Use user-specified format
142                try {
143                        Map<String, String> params = parseQuery(url);
144                        if (params.containsKey(FORMAT_PARAM)) {
145                                String formatStr = params.get(FORMAT_PARAM);
146                                format = StructureIO.guessFiletype("." + formatStr);
147                        }
148                } catch (UnsupportedEncodingException e) {
149                        logger.error("Unable to decode URL {}", url, e);
150                }
151
152                // Guess format from extension
153                if (format == StructureFiletype.UNKNOWN) {
154                        format = StructureIO.guessFiletype(url.getPath());
155                }
156
157                switch(format) {
158                        case CIF: case BCIF:
159                                return CifStructureConverter.fromURL(url, cache.getFileParsingParams());
160                        case MMTF:
161                                return MmtfActions.readFromInputStream(url.openStream());
162                        default: case PDB:
163                                // pdb file based parsing
164                                PDBFileReader reader = new PDBFileReader(cache.getPath());
165                                reader.setFetchBehavior(cache.getFetchBehavior());
166                                reader.setObsoleteBehavior(cache.getObsoleteBehavior());
167                                reader.setFileParsingParameters(cache.getFileParsingParams());
168                                return reader.getStructure(url);
169                }
170        }
171
172        /**
173         * Recognizes PDB IDs that occur at the beginning of name followed by some
174         * delimiter.
175         * @param name Input filename
176         * @return A 4-character id-like string, or null if none is found
177         */
178        public static String guessPDBID(String name) {
179                Matcher match = PDBID_REGEX.matcher(name);
180                if (match.matches()) {
181                        return match.group(1).toUpperCase();
182                }
183                // Give up if doesn't match
184                return null;
185        }
186
187        /**
188         * Parses URL parameters into a map. Keys are stored lower-case.
189         *
190         * @param url
191         * @return
192         * @throws UnsupportedEncodingException
193         */
194        private static Map<String,String> parseQuery(URL url) throws UnsupportedEncodingException {
195                Map<String,String> params = new LinkedHashMap<>();
196                String query = url.getQuery();
197                if (query == null || query.isEmpty()) {
198                        // empty query
199                        return params;
200                }
201                String[] pairs = url.getQuery().split("&");
202                for (String pair : pairs) {
203                        int i = pair.indexOf("=");
204                        String key = pair;
205                        if (i > 0) {
206                                key = URLDecoder.decode(pair.substring(0, i), "UTF-8");
207                        }
208                        String value = null;
209                        if(i > 0 && pair.length() > i + 1) {
210                                value = URLDecoder.decode(pair.substring(i + 1), "UTF-8");
211                        }
212                        // note that this uses the last instance if a parameter is specified multiple times
213                        params.put(key.toLowerCase(), value);
214                }
215                return params;
216        }
217}