001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.structure; 022 023import java.io.BufferedReader; 024import java.io.IOException; 025import java.io.InputStream; 026import java.io.InputStreamReader; 027import java.io.UnsupportedEncodingException; 028import java.net.MalformedURLException; 029import java.net.URL; 030import java.net.URLDecoder; 031import java.util.Arrays; 032import java.util.Collections; 033import java.util.LinkedHashMap; 034import java.util.List; 035import java.util.Map; 036import java.util.regex.Matcher; 037import java.util.regex.Pattern; 038 039import org.biojava.nbio.core.util.InputStreamProvider; 040import org.biojava.nbio.structure.StructureIO.StructureFiletype; 041import org.biojava.nbio.structure.align.util.AtomCache; 042import org.biojava.nbio.structure.io.PDBFileReader; 043import org.biojava.nbio.structure.io.mmcif.MMcifParser; 044import org.biojava.nbio.structure.io.mmcif.SimpleMMcifConsumer; 045import org.biojava.nbio.structure.io.mmcif.SimpleMMcifParser; 046import org.slf4j.Logger; 047import org.slf4j.LoggerFactory; 048 049/** 050 * Represents a structure loaded from a URL (including a file URL) 051 * 052 * A few custom query parameters are supported: 053 * 054 * <ul> 055 * <li><tt>format=[pdb|cif]</tt> Specify the file format (will otherwise be 056 * guessed from the extension) 057 * <li><tt>pdbId=[String]</tt> Specify the PDB ID (also guessed from the filename) 058 * <li><tt>chainID=[String]</tt> A single chain from the structure 059 * <li><tt>residues=[String]</tt> Residue ranges, in a form understood by 060 * {@link SubstructureIdentifier} 061 * </ul> 062 * @author Spencer Bliven 063 * 064 */ 065public class URLIdentifier implements StructureIdentifier { 066 private static final Logger logger = LoggerFactory.getLogger(URLIdentifier.class); 067 068 // Used for guessing the PDB ID from the filename 069 private static final Pattern PDBID_REGEX = Pattern.compile("^([0-9][a-z0-9]{3})([._-]|\\s).*",Pattern.CASE_INSENSITIVE); 070 071 /** URL parameter specifying the file format (PDB or CIF) */ 072 public static final String FORMAT_PARAM = "format"; 073 /** URL parameter specifying the PDB ID */ 074 public static final String PDBID_PARAM = "pdbid"; 075 /** URL parameter specifying a single chain to include; overridden by residues */ 076 public static final String CHAINID_PARAM = "chainid"; 077 /** URL parameter specifying residue ranges to include, e.g. <tt>residues=A:1-70</tt> 078 * @see SubstructureIdentifier 079 */ 080 public static final String RESIDUES_PARAM = "residues"; 081 082 final private URL url; 083 public URLIdentifier(URL url) { 084 this.url = url; 085 } 086 087 public URLIdentifier(String url) throws MalformedURLException { 088 this(new URL(url)); 089 } 090 091 public URL getURL() { 092 return url; 093 } 094 @Override 095 public String getIdentifier() { 096 return url.toString(); 097 } 098 099 /** 100 * @return A SubstructureIdentifier without ranges (e.g. including all residues) 101 */ 102 @Override 103 public SubstructureIdentifier toCanonical() { 104 String pdbId = null; 105 List<ResidueRange> ranges = Collections.emptyList(); 106 try { 107 Map<String, String> params = parseQuery(url); 108 if(params.containsKey(PDBID_PARAM)) { 109 pdbId = params.get(PDBID_PARAM); 110 } 111 if(params.containsKey(RESIDUES_PARAM)) { 112 ranges = ResidueRange.parseMultiple(params.get(RESIDUES_PARAM)); 113 } else if(params.containsKey(CHAINID_PARAM)) { 114 ranges = Arrays.asList(new ResidueRange(params.get(CHAINID_PARAM),(ResidueNumber)null,(ResidueNumber)null)); 115 } 116 } catch (UnsupportedEncodingException e) { 117 logger.error("Unable to decode URL "+url,e); 118 } 119 if(pdbId == null) { 120 String path = url.getPath(); 121 pdbId = guessPDBID(path.substring(path.lastIndexOf("/")+1)); 122 } 123 return new SubstructureIdentifier(pdbId, ranges); 124 } 125 126 @Override 127 public Structure reduce(Structure input) throws StructureException { 128 return toCanonical().reduce(input); 129 } 130 /** 131 * Load the structure from the URL 132 * @return null 133 */ 134 @Override 135 public Structure loadStructure(AtomCache cache) throws StructureException, 136 IOException { 137 StructureFiletype format = StructureFiletype.UNKNOWN; 138 139 // Use user-specified format 140 try { 141 Map<String, String> params = parseQuery(url); 142 if(params.containsKey(FORMAT_PARAM)) { 143 String formatStr = params.get(FORMAT_PARAM); 144 format = StructureIO.guessFiletype("."+formatStr); 145 } 146 } catch (UnsupportedEncodingException e) { 147 logger.error("Unable to decode URL "+url,e); 148 } 149 150 // Guess format from extension 151 if(format == StructureFiletype.UNKNOWN) { 152 format = StructureIO.guessFiletype(url.getPath()); 153 } 154 155 switch(format) { 156 case CIF: 157 // need to do mmcif parsing! 158 159 InputStreamProvider prov = new InputStreamProvider(); 160 InputStream inStream = prov.getInputStream(url); 161 162 MMcifParser parser = new SimpleMMcifParser(); 163 164 SimpleMMcifConsumer consumer = new SimpleMMcifConsumer(); 165 consumer.setFileParsingParameters(cache.getFileParsingParams()); 166 167 168 parser.addMMcifConsumer(consumer); 169 170 try { 171 parser.parse(new BufferedReader(new InputStreamReader(inStream))); 172 } catch (IOException e){ 173 e.printStackTrace(); 174 } 175 176 // now get the protein structure. 177 return consumer.getStructure(); 178 default: 179 case PDB: 180 // pdb file based parsing 181 182 PDBFileReader reader = new PDBFileReader(cache.getPath()); 183 reader.setFetchBehavior(cache.getFetchBehavior()); 184 reader.setObsoleteBehavior(cache.getObsoleteBehavior()); 185 reader.setFileParsingParameters(cache.getFileParsingParams()); 186 return reader.getStructure(url); 187 } 188 } 189 190 191 /** 192 * Recognizes PDB IDs that occur at the beginning of name followed by some 193 * delimiter. 194 * @param name Input filename 195 * @return A 4-character id-like string, or null if none is found 196 */ 197 public static String guessPDBID(String name) { 198 Matcher match = PDBID_REGEX.matcher(name); 199 if(match.matches()) { 200 return match.group(1).toUpperCase(); 201 } else { 202 // Give up if doesn't match 203 return null; 204 } 205 } 206 207 /** 208 * Parses URL parameters into a map. Keys are stored lower-case. 209 * 210 * @param url 211 * @return 212 * @throws UnsupportedEncodingException 213 */ 214 private static Map<String,String> parseQuery(URL url) throws UnsupportedEncodingException { 215 Map<String,String> params = new LinkedHashMap<String, String>(); 216 String query = url.getQuery(); 217 if( query == null || query.isEmpty()) { 218 // empty query 219 return params; 220 } 221 String[] pairs = url.getQuery().split("&"); 222 for(String pair: pairs) { 223 int i = pair.indexOf("="); 224 String key = pair; 225 if(i > 0) { 226 key = URLDecoder.decode(pair.substring(0, i), "UTF-8"); 227 } 228 String value = null; 229 if(i > 0 && pair.length() > i+1) { 230 value = URLDecoder.decode(pair.substring(i+1), "UTF-8"); 231 } 232 // note that this uses the last instance if a parameter is specified multiple times 233 params.put(key.toLowerCase(), value); 234 } 235 return params; 236 } 237}