001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure;
022
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.biojava.nbio.core.util.InputStreamProvider;
import org.biojava.nbio.structure.StructureIO.StructureFiletype;
import org.biojava.nbio.structure.align.util.AtomCache;
import org.biojava.nbio.structure.io.PDBFileReader;
import org.biojava.nbio.structure.io.mmcif.MMcifParser;
import org.biojava.nbio.structure.io.mmcif.SimpleMMcifConsumer;
import org.biojava.nbio.structure.io.mmcif.SimpleMMcifParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
048
049/**
050 * Represents a structure loaded from a URL (including a file URL)
051 *
052 * A few custom query parameters are supported:
053 *
054 * <ul>
055 * <li><tt>format=[pdb|cif]</tt> Specify the file format (will otherwise be
056 *     guessed from the extension)
057 * <li><tt>pdbId=[String]</tt> Specify the PDB ID (also guessed from the filename)
058 * <li><tt>chainID=[String]</tt> A single chain from the structure
059 * <li><tt>residues=[String]</tt> Residue ranges, in a form understood by
060 *     {@link SubstructureIdentifier}
061 * </ul>
062 * @author Spencer Bliven
063 *
064 */
065public class URLIdentifier implements StructureIdentifier {
066        private static final Logger logger = LoggerFactory.getLogger(URLIdentifier.class);
067
068        // Used for guessing the PDB ID from the filename
069        private static final Pattern PDBID_REGEX = Pattern.compile("^([0-9][a-z0-9]{3})([._-]|\\s).*",Pattern.CASE_INSENSITIVE);
070
071        /** URL parameter specifying the file format (PDB or CIF) */
072        public static final String FORMAT_PARAM = "format";
073        /** URL parameter specifying the PDB ID */
074        public static final String PDBID_PARAM = "pdbid";
075        /** URL parameter specifying a single chain to include; overridden by residues */
076        public static final String CHAINID_PARAM = "chainid";
077        /** URL parameter specifying residue ranges to include, e.g. <tt>residues=A:1-70</tt>
078         * @see SubstructureIdentifier
079         */
080        public static final String RESIDUES_PARAM = "residues";
081
082        final private URL url;
083        public URLIdentifier(URL url) {
084                this.url = url;
085        }
086
087        public URLIdentifier(String url) throws MalformedURLException {
088                this(new URL(url));
089        }
090
091        public URL getURL() {
092                return url;
093        }
094        @Override
095        public String getIdentifier() {
096                return url.toString();
097        }
098
099        /**
100         * @return A SubstructureIdentifier without ranges (e.g. including all residues)
101         */
102        @Override
103        public SubstructureIdentifier toCanonical() {
104                String pdbId = null;
105                List<ResidueRange> ranges = Collections.emptyList();
106                try {
107                        Map<String, String> params = parseQuery(url);
108                        if(params.containsKey(PDBID_PARAM)) {
109                                pdbId = params.get(PDBID_PARAM);
110                        }
111                        if(params.containsKey(RESIDUES_PARAM)) {
112                                ranges = ResidueRange.parseMultiple(params.get(RESIDUES_PARAM));
113                        } else if(params.containsKey(CHAINID_PARAM)) {
114                                ranges = Arrays.asList(new ResidueRange(params.get(CHAINID_PARAM),(ResidueNumber)null,(ResidueNumber)null));
115                        }
116                } catch (UnsupportedEncodingException e) {
117                        logger.error("Unable to decode URL "+url,e);
118                }
119                if(pdbId == null) {
120                        String path = url.getPath();
121                        pdbId = guessPDBID(path.substring(path.lastIndexOf("/")+1));
122                }
123                return new SubstructureIdentifier(pdbId, ranges);
124        }
125
126        @Override
127        public Structure reduce(Structure input) throws StructureException {
128                return toCanonical().reduce(input);
129        }
130        /**
131         * Load the structure from the URL
132         * @return null
133         */
134        @Override
135        public Structure loadStructure(AtomCache cache) throws StructureException,
136                        IOException {
137                StructureFiletype format = StructureFiletype.UNKNOWN;
138
139                // Use user-specified format
140                try {
141                        Map<String, String> params = parseQuery(url);
142                        if(params.containsKey(FORMAT_PARAM)) {
143                                String formatStr = params.get(FORMAT_PARAM);
144                                format = StructureIO.guessFiletype("."+formatStr);
145                        }
146                } catch (UnsupportedEncodingException e) {
147                        logger.error("Unable to decode URL "+url,e);
148                }
149
150                // Guess format from extension
151                if(format == StructureFiletype.UNKNOWN) {
152                        format = StructureIO.guessFiletype(url.getPath());
153                }
154
155                switch(format) {
156                case CIF:
157                        // need to do mmcif parsing!
158
159                        InputStreamProvider prov = new InputStreamProvider();
160                        InputStream inStream =  prov.getInputStream(url);
161
162                        MMcifParser parser = new SimpleMMcifParser();
163
164                        SimpleMMcifConsumer consumer = new SimpleMMcifConsumer();
165                        consumer.setFileParsingParameters(cache.getFileParsingParams());
166
167
168                        parser.addMMcifConsumer(consumer);
169
170                        try {
171                                parser.parse(new BufferedReader(new InputStreamReader(inStream)));
172                        } catch (IOException e){
173                                e.printStackTrace();
174                        }
175
176                        // now get the protein structure.
177                        return consumer.getStructure();
178                default:
179                case PDB:
180                        // pdb file based parsing
181
182                        PDBFileReader reader = new PDBFileReader(cache.getPath());
183                        reader.setFetchBehavior(cache.getFetchBehavior());
184                        reader.setObsoleteBehavior(cache.getObsoleteBehavior());
185                        reader.setFileParsingParameters(cache.getFileParsingParams());
186                        return reader.getStructure(url);
187                }
188        }
189
190
191        /**
192         * Recognizes PDB IDs that occur at the beginning of name followed by some
193         * delimiter.
194         * @param name Input filename
195         * @return A 4-character id-like string, or null if none is found
196         */
197        public static String guessPDBID(String name) {
198                Matcher match = PDBID_REGEX.matcher(name);
199                if(match.matches()) {
200                        return match.group(1).toUpperCase();
201                } else {
202                        // Give up if doesn't match
203                        return null;
204                }
205        }
206
207        /**
208         * Parses URL parameters into a map. Keys are stored lower-case.
209         *
210         * @param url
211         * @return
212         * @throws UnsupportedEncodingException
213         */
214        private static Map<String,String> parseQuery(URL url) throws UnsupportedEncodingException {
215                Map<String,String> params = new LinkedHashMap<String, String>();
216                String query = url.getQuery();
217                if( query == null || query.isEmpty()) {
218                        // empty query
219                        return params;
220                }
221                String[] pairs = url.getQuery().split("&");
222                for(String pair: pairs) {
223                        int i = pair.indexOf("=");
224                        String key = pair;
225                        if(i > 0) {
226                                key = URLDecoder.decode(pair.substring(0, i), "UTF-8");
227                        }
228                        String value = null;
229                        if(i > 0 && pair.length() > i+1) {
230                                value = URLDecoder.decode(pair.substring(i+1), "UTF-8");
231                        }
232                        // note that this uses the last instance if a parameter is specified multiple times
233                        params.put(key.toLowerCase(), value);
234                }
235                return params;
236        }
237}