/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.structure;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.biojava.nbio.core.util.InputStreamProvider;
import org.biojava.nbio.structure.StructureIO.StructureFiletype;
import org.biojava.nbio.structure.align.util.AtomCache;
import org.biojava.nbio.structure.io.PDBFileReader;
import org.biojava.nbio.structure.io.mmcif.MMcifParser;
import org.biojava.nbio.structure.io.mmcif.SimpleMMcifConsumer;
import org.biojava.nbio.structure.io.mmcif.SimpleMMcifParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Represents a structure loaded from a URL (including a file URL).
 *
 * <p>A few custom query parameters are supported. Parameter names are matched
 * case-insensitively (they are lower-cased while the query is parsed):
 *
 * <ul>
 * <li>{@code format=[pdb|cif]} Specify the file format (will otherwise be
 *     guessed from the extension)
 * <li>{@code pdbId=[String]} Specify the PDB ID (otherwise guessed from the
 *     filename)
 * <li>{@code chainID=[String]} A single chain from the structure
 * <li>{@code residues=[String]} Residue ranges, in a form understood by
 *     {@link SubstructureIdentifier}
 * </ul>
 *
 * @author Spencer Bliven
 *
 */
public class URLIdentifier implements StructureIdentifier {

	private static final long serialVersionUID = -5161230822868926035L;

	private static final Logger logger = LoggerFactory.getLogger(URLIdentifier.class);

	// Used for guessing the PDB ID from the filename
	private static final Pattern PDBID_REGEX = Pattern.compile("^([0-9][a-z0-9]{3})([._-]|\\s).*",Pattern.CASE_INSENSITIVE);

	/** URL parameter specifying the file format (PDB or CIF) */
	public static final String FORMAT_PARAM = "format";
	/** URL parameter specifying the PDB ID */
	public static final String PDBID_PARAM = "pdbid";
	//TODO: should this get renamed to chainname or asymid?
	/** URL parameter specifying a single chain to include; overridden by residues */
	public static final String CHAINID_PARAM = "chainid";
	/** URL parameter specifying residue ranges to include, e.g. {@code residues=A:1-70}
	 * @see SubstructureIdentifier
	 */
	public static final String RESIDUES_PARAM = "residues";

	private final URL url;

	/**
	 * Creates an identifier for the structure located at the given URL.
	 * @param url Location of the structure file, optionally carrying the
	 *  query parameters described in the class documentation
	 */
	public URLIdentifier(URL url) {
		this.url = url;
	}

	/**
	 * Creates an identifier from the string form of a URL.
	 * @param url String to be converted with {@link URL#URL(String)}
	 * @throws MalformedURLException if the string cannot be converted to a URL
	 */
	public URLIdentifier(String url) throws MalformedURLException {
		this(new URL(url));
	}

	/**
	 * @return The URL this identifier was created with
	 */
	public URL getURL() {
		return url;
	}

	/**
	 * @return The string form of the URL, including any query parameters
	 */
	@Override
	public String getIdentifier() {
		return url.toString();
	}

	/**
	 * Builds a SubstructureIdentifier from the URL.
	 *
	 * <p>The PDB ID is taken from the {@code pdbid} parameter or, if that is
	 * absent, guessed from the last path segment via
	 * {@link #guessPDBID(String)} (so it may be null). Ranges are taken from
	 * the {@code residues} parameter or, failing that, from {@code chainid}
	 * (the whole chain). Parameters given without a value are ignored.
	 *
	 * @return A SubstructureIdentifier with the ranges requested in the query,
	 *  or without ranges (i.e. including all residues) if none were requested
	 */
	@Override
	public SubstructureIdentifier toCanonical() {
		String pdbId = null;
		List<ResidueRange> ranges = Collections.emptyList();
		try {
			Map<String, String> params = parseQuery(url);
			pdbId = params.get(PDBID_PARAM);

			// Test the values rather than key presence: a bare "residues" or
			// "chainid=" is stored with a null value and must not reach the parsers.
			String residues = params.get(RESIDUES_PARAM);
			String chain = params.get(CHAINID_PARAM);
			if(residues != null) {
				ranges = ResidueRange.parseMultiple(residues);
			} else if(chain != null) {
				ranges = Arrays.asList(new ResidueRange(chain,(ResidueNumber)null,(ResidueNumber)null));
			}
		} catch (UnsupportedEncodingException e) {
			logger.error("Unable to decode URL "+url,e);
		}
		if(pdbId == null) {
			// Fall back to the file name, i.e. whatever follows the last '/'
			String path = url.getPath();
			pdbId = guessPDBID(path.substring(path.lastIndexOf("/")+1));
		}
		return new SubstructureIdentifier(pdbId, ranges);
	}

	/**
	 * Reduces the input by delegating to the canonical identifier.
	 * @see #toCanonical()
	 */
	@Override
	public Structure reduce(Structure input) throws StructureException {
		return toCanonical().reduce(input);
	}

	/**
	 * Load the structure from the URL.
	 *
	 * <p>The file format is taken from the {@code format} query parameter if
	 * given, and otherwise guessed from the URL path. Anything that is not
	 * recognized as mmCIF is read as a PDB file.
	 *
	 * @param cache Source of the file parsing parameters and, for PDB files,
	 *  of the path, fetch behavior and obsolete behavior given to the reader
	 * @return The structure read from the URL
	 * @throws IOException if the URL cannot be opened or read
	 */
	@Override
	public Structure loadStructure(AtomCache cache) throws StructureException,
			IOException {
		StructureFiletype format = StructureFiletype.UNKNOWN;

		// Use user-specified format
		try {
			Map<String, String> params = parseQuery(url);
			String formatStr = params.get(FORMAT_PARAM);
			if(formatStr != null) {
				// guessFiletype works on file names, so fake an extension
				format = StructureIO.guessFiletype("."+formatStr);
			}
		} catch (UnsupportedEncodingException e) {
			logger.error("Unable to decode URL "+url,e);
		}

		// Guess format from extension
		if(format == StructureFiletype.UNKNOWN) {
			format = StructureIO.guessFiletype(url.getPath());
		}

		switch(format) {
		case CIF:
			// need to do mmcif parsing!

			InputStreamProvider prov = new InputStreamProvider();

			MMcifParser parser = new SimpleMMcifParser();

			SimpleMMcifConsumer consumer = new SimpleMMcifConsumer();
			consumer.setFileParsingParameters(cache.getFileParsingParams());

			parser.addMMcifConsumer(consumer);

			// Read failures propagate to the caller (the method declares
			// IOException) instead of being printed and swallowed, and the
			// stream is always released.
			InputStream inStream = prov.getInputStream(url);
			try {
				parser.parse(new BufferedReader(new InputStreamReader(inStream)));
			} finally {
				inStream.close();
			}

			// now get the protein structure.
			return consumer.getStructure();
		default:
		case PDB:
			// pdb file based parsing

			PDBFileReader reader = new PDBFileReader(cache.getPath());
			reader.setFetchBehavior(cache.getFetchBehavior());
			reader.setObsoleteBehavior(cache.getObsoleteBehavior());
			reader.setFileParsingParameters(cache.getFileParsingParams());
			return reader.getStructure(url);
		}
	}


	/**
	 * Recognizes PDB IDs that occur at the beginning of name followed by some
	 * delimiter ({@code .}, {@code _}, {@code -} or whitespace).
	 * @param name Input filename; may be null
	 * @return A 4-character id-like string in upper case, or null if none is
	 *  found (including when name is null)
	 */
	public static String guessPDBID(String name) {
		if(name == null) {
			return null;
		}
		Matcher match = PDBID_REGEX.matcher(name);
		if(!match.matches()) {
			// Give up if doesn't match
			return null;
		}
		// Locale.ROOT keeps the result independent of the default locale
		// (a Turkish locale would map 'i' to a dotted capital I).
		return match.group(1).toUpperCase(Locale.ROOT);
	}

	/**
	 * Parses URL parameters into a map. Keys are stored lower-case.
	 *
	 * <p>Keys and values of {@code key=value} pairs are URL-decoded as UTF-8.
	 * A pair without a value ({@code key=}) maps the key to null. A pair that
	 * has no {@code =}, or that starts with one, is stored as-is (not
	 * decoded) with a null value.
	 *
	 * @param url URL whose query string should be parsed
	 * @return Parameters in order of first appearance; empty if the URL has no query
	 * @throws UnsupportedEncodingException declared by
	 *  {@link URLDecoder#decode(String, String)}
	 */
	private static Map<String,String> parseQuery(URL url) throws UnsupportedEncodingException {
		Map<String,String> params = new LinkedHashMap<String, String>();
		String query = url.getQuery();
		if( query == null || query.isEmpty()) {
			// empty query
			return params;
		}
		for(String pair: query.split("&")) {
			int i = pair.indexOf("=");
			String key = pair;
			String value = null;
			if(i > 0) {
				key = URLDecoder.decode(pair.substring(0, i), "UTF-8");
				if(pair.length() > i+1) {
					value = URLDecoder.decode(pair.substring(i+1), "UTF-8");
				}
			}
			// note that this uses the last instance if a parameter is specified multiple times.
			// Locale.ROOT so that lookups by the *_PARAM constants work under any default locale.
			params.put(key.toLowerCase(Locale.ROOT), value);
		}
		return params;
	}
}