001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.structure; 022 023import org.biojava.nbio.structure.align.util.AtomCache; 024import org.biojava.nbio.structure.io.PDBFileReader; 025import org.biojava.nbio.structure.io.cif.CifStructureConverter; 026import org.biojava.nbio.structure.io.StructureFiletype; 027import org.biojava.nbio.structure.io.mmtf.MmtfActions; 028import org.slf4j.Logger; 029import org.slf4j.LoggerFactory; 030 031import java.io.IOException; 032import java.io.UnsupportedEncodingException; 033import java.net.MalformedURLException; 034import java.net.URL; 035import java.net.URLDecoder; 036import java.util.Collections; 037import java.util.LinkedHashMap; 038import java.util.List; 039import java.util.Map; 040import java.util.regex.Matcher; 041import java.util.regex.Pattern; 042 043/** 044 * Represents a structure loaded from a URL (including a file URL) 045 * <p> 046 * A few custom query parameters are supported: 047 * 048 * <ul> 049 * <li><code>format=[pdb|cif]</code> Specify the file format (will otherwise be 050 * guessed from the extension) 051 * <li><code>pdbId=[String]</code> Specify the PDB ID (also guessed from the filename) 052 * <li><code>chainID=[String]</code> A single chain from the structure 053 * <li><code>residues=[String]</code> Residue ranges, in a form understood by 054 * {@link SubstructureIdentifier} 055 * </ul> 056 * @author Spencer Bliven 057 * 058 */ 059public class URLIdentifier implements StructureIdentifier { 060 private static final long serialVersionUID = -5161230822868926035L; 061 private static final Logger logger = LoggerFactory.getLogger(URLIdentifier.class); 062 063 // Used for guessing the PDB ID from the filename 064 //UPDATE: It seems that this RegEx rarely succeeded , because the file 065 //name is most of the time in the format pdbxxxx.EXT not xxxx.EXT. 066 private static final Pattern PDBID_REGEX = Pattern.compile("^(?:pdb)?([0-9][a-z0-9]{3})([._-]|\\s).*", Pattern.CASE_INSENSITIVE); 067// private static final Pattern PDBID_REGEX = Pattern.compile("^(?:pdb)?((PDB_[0-9]{4})?[0-9][a-z0-9]{3})([._-]|\\s).*", Pattern.CASE_INSENSITIVE); 068 069 /** URL parameter specifying the file format (PDB or CIF) */ 070 public static final String FORMAT_PARAM = "format"; 071 /** URL parameter specifying the PDB ID */ 072 public static final String PDBID_PARAM = "pdbid"; 073 /** URL parameter specifying a single chain to include; overridden by residues */ 074 075 //TODO: should this get renamed to chainname or asymid? 076 public static final String CHAINID_PARAM = "chainid"; 077 /** 078 * URL parameter specifying residue ranges to include, e.g. <code>residues=A:1-70</code> 079 * @see SubstructureIdentifier 080 */ 081 public static final String RESIDUES_PARAM = "residues"; 082 083 final private URL url; 084 public URLIdentifier(URL url) { 085 this.url = url; 086 } 087 088 public URLIdentifier(String url) throws MalformedURLException { 089 this(new URL(url)); 090 } 091 092 public URL getURL() { 093 return url; 094 } 095 096 @Override 097 public String getIdentifier() { 098 return url.toString(); 099 } 100 101 /** 102 * @return A SubstructureIdentifier without ranges (e.g. including all residues) 103 */ 104 @Override 105 public SubstructureIdentifier toCanonical() throws StructureException{ 106 String pdbId = null; 107 List<ResidueRange> ranges = Collections.emptyList(); 108 try { 109 Map<String, String> params = parseQuery(url); 110 if (params.containsKey(PDBID_PARAM)) { 111 pdbId = params.get(PDBID_PARAM); 112 } 113 if (params.containsKey(RESIDUES_PARAM)) { 114 ranges = ResidueRange.parseMultiple(params.get(RESIDUES_PARAM)); 115 } else if (params.containsKey(CHAINID_PARAM)) { 116 ranges = Collections.singletonList(new ResidueRange(params.get(CHAINID_PARAM), (ResidueNumber) null, (ResidueNumber) null)); 117 } 118 } catch (UnsupportedEncodingException e) { 119 logger.error("Unable to decode URL {}", url, e); 120 } 121 if (pdbId == null) { 122 String path = url.getPath(); 123 pdbId = guessPDBID(path.substring(path.lastIndexOf("/") + 1)); 124 } 125 return new SubstructureIdentifier((pdbId==null?(PdbId)null:new PdbId(pdbId)), ranges); 126 } 127 128 @Override 129 public Structure reduce(Structure input) throws StructureException { 130 return toCanonical().reduce(input); 131 } 132 133 /** 134 * Load the structure from the URL 135 * @return null 136 */ 137 @Override 138 public Structure loadStructure(AtomCache cache) throws StructureException, IOException { 139 StructureFiletype format = StructureFiletype.UNKNOWN; 140 141 // Use user-specified format 142 try { 143 Map<String, String> params = parseQuery(url); 144 if (params.containsKey(FORMAT_PARAM)) { 145 String formatStr = params.get(FORMAT_PARAM); 146 format = StructureIO.guessFiletype("." + formatStr); 147 } 148 } catch (UnsupportedEncodingException e) { 149 logger.error("Unable to decode URL {}", url, e); 150 } 151 152 // Guess format from extension 153 if (format == StructureFiletype.UNKNOWN) { 154 format = StructureIO.guessFiletype(url.getPath()); 155 } 156 157 switch(format) { 158 case CIF: case BCIF: 159 return CifStructureConverter.fromURL(url, cache.getFileParsingParams()); 160 case MMTF: 161 return MmtfActions.readFromInputStream(url.openStream()); 162 default: case PDB: 163 // pdb file based parsing 164 PDBFileReader reader = new PDBFileReader(cache.getPath()); 165 reader.setFetchBehavior(cache.getFetchBehavior()); 166 reader.setObsoleteBehavior(cache.getObsoleteBehavior()); 167 reader.setFileParsingParameters(cache.getFileParsingParams()); 168 return reader.getStructure(url); 169 } 170 } 171 172 /** 173 * Recognizes PDB IDs that occur at the beginning of name followed by some 174 * delimiter. 175 * @param name Input filename 176 * @return A 4-character id-like string, or null if none is found 177 */ 178 public static String guessPDBID(String name) { 179 Matcher match = PDBID_REGEX.matcher(name); 180 if (match.matches()) { 181 return match.group(1).toUpperCase(); 182 } 183 // Give up if doesn't match 184 return null; 185 } 186 187 /** 188 * Parses URL parameters into a map. Keys are stored lower-case. 189 * 190 * @param url 191 * @return 192 * @throws UnsupportedEncodingException 193 */ 194 private static Map<String,String> parseQuery(URL url) throws UnsupportedEncodingException { 195 Map<String,String> params = new LinkedHashMap<>(); 196 String query = url.getQuery(); 197 if (query == null || query.isEmpty()) { 198 // empty query 199 return params; 200 } 201 String[] pairs = url.getQuery().split("&"); 202 for (String pair : pairs) { 203 int i = pair.indexOf("="); 204 String key = pair; 205 if (i > 0) { 206 key = URLDecoder.decode(pair.substring(0, i), "UTF-8"); 207 } 208 String value = null; 209 if(i > 0 && pair.length() > i + 1) { 210 value = URLDecoder.decode(pair.substring(i + 1), "UTF-8"); 211 } 212 // note that this uses the last instance if a parameter is specified multiple times 213 params.put(key.toLowerCase(), value); 214 } 215 return params; 216 } 217}