001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure;
022
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.biojava.nbio.core.util.InputStreamProvider;
import org.biojava.nbio.structure.StructureIO.StructureFiletype;
import org.biojava.nbio.structure.align.util.AtomCache;
import org.biojava.nbio.structure.io.PDBFileReader;
import org.biojava.nbio.structure.io.mmcif.MMcifParser;
import org.biojava.nbio.structure.io.mmcif.SimpleMMcifConsumer;
import org.biojava.nbio.structure.io.mmcif.SimpleMMcifParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
048
049/**
050 * Represents a structure loaded from a URL (including a file URL)
051 *
052 * A few custom query parameters are supported:
053 *
054 * <ul>
055 * <li><tt>format=[pdb|cif]</tt> Specify the file format (will otherwise be
056 *     guessed from the extension)
057 * <li><tt>pdbId=[String]</tt> Specify the PDB ID (also guessed from the filename)
058 * <li><tt>chainID=[String]</tt> A single chain from the structure
059 * <li><tt>residues=[String]</tt> Residue ranges, in a form understood by
060 *     {@link SubstructureIdentifier}
061 * </ul>
062 * @author Spencer Bliven
063 *
064 */
065public class URLIdentifier implements StructureIdentifier {
066
067        private static final long serialVersionUID = -5161230822868926035L;
068
069        private static final Logger logger = LoggerFactory.getLogger(URLIdentifier.class);
070
071        // Used for guessing the PDB ID from the filename
072        private static final Pattern PDBID_REGEX = Pattern.compile("^([0-9][a-z0-9]{3})([._-]|\\s).*",Pattern.CASE_INSENSITIVE);
073
074        /** URL parameter specifying the file format (PDB or CIF) */
075        public static final String FORMAT_PARAM = "format";
076        /** URL parameter specifying the PDB ID */
077        public static final String PDBID_PARAM = "pdbid";
078        /** URL parameter specifying a single chain to include; overridden by residues */
079
080        //TODO: should this get renamed to chainname or asymid?
081        public static final String CHAINID_PARAM = "chainid";
082        /** URL parameter specifying residue ranges to include, e.g. <tt>residues=A:1-70</tt>
083         * @see SubstructureIdentifier
084         */
085        public static final String RESIDUES_PARAM = "residues";
086
087        final private URL url;
088        public URLIdentifier(URL url) {
089                this.url = url;
090        }
091
092        public URLIdentifier(String url) throws MalformedURLException {
093                this(new URL(url));
094        }
095
096        public URL getURL() {
097                return url;
098        }
099        @Override
100        public String getIdentifier() {
101                return url.toString();
102        }
103
104        /**
105         * @return A SubstructureIdentifier without ranges (e.g. including all residues)
106         */
107        @Override
108        public SubstructureIdentifier toCanonical() {
109                String pdbId = null;
110                List<ResidueRange> ranges = Collections.emptyList();
111                try {
112                        Map<String, String> params = parseQuery(url);
113                        if(params.containsKey(PDBID_PARAM)) {
114                                pdbId = params.get(PDBID_PARAM);
115                        }
116                        if(params.containsKey(RESIDUES_PARAM)) {
117                                ranges = ResidueRange.parseMultiple(params.get(RESIDUES_PARAM));
118                        } else if(params.containsKey(CHAINID_PARAM)) {
119                                ranges = Arrays.asList(new ResidueRange(params.get(CHAINID_PARAM),(ResidueNumber)null,(ResidueNumber)null));
120                        }
121                } catch (UnsupportedEncodingException e) {
122                        logger.error("Unable to decode URL "+url,e);
123                }
124                if(pdbId == null) {
125                        String path = url.getPath();
126                        pdbId = guessPDBID(path.substring(path.lastIndexOf("/")+1));
127                }
128                return new SubstructureIdentifier(pdbId, ranges);
129        }
130
131        @Override
132        public Structure reduce(Structure input) throws StructureException {
133                return toCanonical().reduce(input);
134        }
135        /**
136         * Load the structure from the URL
137         * @return null
138         */
139        @Override
140        public Structure loadStructure(AtomCache cache) throws StructureException,
141                        IOException {
142                StructureFiletype format = StructureFiletype.UNKNOWN;
143
144                // Use user-specified format
145                try {
146                        Map<String, String> params = parseQuery(url);
147                        if(params.containsKey(FORMAT_PARAM)) {
148                                String formatStr = params.get(FORMAT_PARAM);
149                                format = StructureIO.guessFiletype("."+formatStr);
150                        }
151                } catch (UnsupportedEncodingException e) {
152                        logger.error("Unable to decode URL "+url,e);
153                }
154
155                // Guess format from extension
156                if(format == StructureFiletype.UNKNOWN) {
157                        format = StructureIO.guessFiletype(url.getPath());
158                }
159
160                switch(format) {
161                case CIF:
162                        // need to do mmcif parsing!
163
164                        InputStreamProvider prov = new InputStreamProvider();
165                        InputStream inStream =  prov.getInputStream(url);
166
167                        MMcifParser parser = new SimpleMMcifParser();
168
169                        SimpleMMcifConsumer consumer = new SimpleMMcifConsumer();
170                        consumer.setFileParsingParameters(cache.getFileParsingParams());
171
172
173                        parser.addMMcifConsumer(consumer);
174
175                        try {
176                                parser.parse(new BufferedReader(new InputStreamReader(inStream)));
177                        } catch (IOException e){
178                                e.printStackTrace();
179                        }
180
181                        // now get the protein structure.
182                        return consumer.getStructure();
183                default:
184                case PDB:
185                        // pdb file based parsing
186
187                        PDBFileReader reader = new PDBFileReader(cache.getPath());
188                        reader.setFetchBehavior(cache.getFetchBehavior());
189                        reader.setObsoleteBehavior(cache.getObsoleteBehavior());
190                        reader.setFileParsingParameters(cache.getFileParsingParams());
191                        return reader.getStructure(url);
192                }
193        }
194
195
196        /**
197         * Recognizes PDB IDs that occur at the beginning of name followed by some
198         * delimiter.
199         * @param name Input filename
200         * @return A 4-character id-like string, or null if none is found
201         */
202        public static String guessPDBID(String name) {
203                Matcher match = PDBID_REGEX.matcher(name);
204                if(match.matches()) {
205                        return match.group(1).toUpperCase();
206                } else {
207                        // Give up if doesn't match
208                        return null;
209                }
210        }
211
212        /**
213         * Parses URL parameters into a map. Keys are stored lower-case.
214         *
215         * @param url
216         * @return
217         * @throws UnsupportedEncodingException
218         */
219        private static Map<String,String> parseQuery(URL url) throws UnsupportedEncodingException {
220                Map<String,String> params = new LinkedHashMap<String, String>();
221                String query = url.getQuery();
222                if( query == null || query.isEmpty()) {
223                        // empty query
224                        return params;
225                }
226                String[] pairs = url.getQuery().split("&");
227                for(String pair: pairs) {
228                        int i = pair.indexOf("=");
229                        String key = pair;
230                        if(i > 0) {
231                                key = URLDecoder.decode(pair.substring(0, i), "UTF-8");
232                        }
233                        String value = null;
234                        if(i > 0 && pair.length() > i+1) {
235                                value = URLDecoder.decode(pair.substring(i+1), "UTF-8");
236                        }
237                        // note that this uses the last instance if a parameter is specified multiple times
238                        params.put(key.toLowerCase(), value);
239                }
240                return params;
241        }
242}