package org.biojava.nbio.structure.chem;

import org.biojava.nbio.core.util.InputStreamProvider;
import org.biojava.nbio.structure.align.util.URLConnectionTools;
import org.biojava.nbio.structure.align.util.UserConfiguration;
import org.biojava.nbio.structure.io.LocalPDBDirectory;
import org.biojava.nbio.structure.io.cif.ChemCompConverter;
import org.rcsb.cif.ParsingException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPOutputStream;

/**
 * This provider of chemical components can download and cache chemical component definition files from the RCSB PDB
 * web site. It is the default way to access these definitions.
 * <p>
 * By default a definition is downloaded individually the first time it is requested and is then cached in a local
 * directory. If {@link #setDownloadAll(boolean)} is set to {@code true}, the first request for a component that is
 * not cached yet downloads and installs all chemical component definitions at once.
 * <p>
 * An alternative provider, that keeps all definitions in memory is the {@link AllChemCompProvider}. Another provider,
 * that does not require any network access, but only can support a limited set of chemical component definitions, is
 * the {@link ReducedChemCompProvider}.
 *
 * @author Andreas Prlic
 */
public class DownloadChemCompProvider implements ChemCompProvider {
    private static final Logger logger = LoggerFactory.getLogger(DownloadChemCompProvider.class);

    private static final String NEWLINE = System.getProperty("line.separator");

    /** Name of the file holding all definitions, stored inside the chem comp cache directory. */
    private static final String ALL_COMPONENTS_FILE_NAME = "components.cif.gz";

    /** Fewer cached ".cif.gz" files than this is taken as a sign that the bundle was never fully split. */
    private static final int MIN_INSTALLED_FILES = 500;

    public static final String CHEM_COMP_CACHE_DIRECTORY = "chemcomp";
    public static final String DEFAULT_SERVER_URL = "https://files.rcsb.org/ligands/download/";
    public static final String DEFAULT_CHEMCOMP_PATHURL_TEMPLATE = "{ccd_id}.cif";

    /**
     * The base URL to which the full path specified via {@link #setChemCompPathUrlTemplate(String)} is appended.
     * It is assumed that it has a trailing slash.
     */
    public static String serverBaseUrl = DEFAULT_SERVER_URL;

    private static File path;

    private static String chemCompPathUrlTemplate = DEFAULT_CHEMCOMP_PATHURL_TEMPLATE;

    // Matches "{ccd_id}", "{ccd_id:N}", "{ccd_id:-N}", "{ccd_id:+N}" and "{ccd_id:B_E}"; group 1 holds the indices.
    static final Pattern CCD_ID_TEMPLATE_REGEX = Pattern.compile("\\{ccd_id(?::(\\d+_\\d+|[-+]?\\d+))?}");

    // flag to make sure there is only one thread running that is loading the dictionary
    static AtomicBoolean loading = new AtomicBoolean(false);

    // identifiers that get a "_" prefix when they are turned into a local file name
    static final List<String> protectedIDs = new ArrayList<>();
    static {
        protectedIDs.add("CON");
        protectedIDs.add("PRN");
        protectedIDs.add("AUX");
        protectedIDs.add("NUL");
    }

    private static ChemCompProvider fallback = null; // Fallback provider if the download fails

    /**
     * by default we will download only some of the files. User has to request that all files should be downloaded...
     */
    boolean downloadAll = false;

    /**
     * Creates a provider that uses the cache path that is already configured, or the one from
     * {@link UserConfiguration} if none has been set yet.
     */
    public DownloadChemCompProvider() {
        this(null);
    }

    /**
     * Creates a provider and, if a path is given, makes it the cache path.
     *
     * @param cacheFilePath the cache directory, or {@code null} to leave the current setting untouched. Note that
     *                      the path is static, so a non-null value affects every instance of this class.
     */
    public DownloadChemCompProvider(String cacheFilePath) {
        logger.debug("Initialising DownloadChemCompProvider");

        // note that path is static, so this is just to make sure that all non-static methods will have path initialised
        if (cacheFilePath != null) {
            path = new File(cacheFilePath);
        }
    }

    /**
     * Set the base URL for the location of all chemical component CIF files, to which the chemCompPathUrlTemplate
     * is appended, settable in {@link #setChemCompPathUrlTemplate(String)}. A trailing slash is appended
     * if not present.
     *
     * @param serverBaseUrl the new base URL, must not be {@code null}
     */
    public static void setServerBaseUrl(String serverBaseUrl) {
        if (!serverBaseUrl.endsWith("/")) {
            serverBaseUrl = serverBaseUrl + "/";
        }
        DownloadChemCompProvider.serverBaseUrl = serverBaseUrl;
    }

    /**
     * Set the path to append to the serverBaseUrl (settable in {@link #setServerBaseUrl(String)}).
     * The string can contain placeholders that will be expanded at runtime:
     * <ul>
     * <li>"{ccd_id}" to be replaced by the chemical component identifier, in capitals</li>
     * <li>"{ccd_id:beginIndex_endIndex}" to be replaced by a substring of the chemical component identifier in
     * capitals, with indices following the same convention as {@link String#substring(int, int)}. Note that the
     * two indices are separated by an underscore.</li>
     * <li>"{ccd_id:index}" to be replaced by a substring of the chemical component identifier in capitals,
     * with index either a positive or negative integer to substring from left or right of the string respectively.</li>
     * </ul>
     * If any of the indices are off-bounds, then the full chemical component identifier is replaced
     *
     * @param chemCompPathUrlTemplate the template to expand for every download
     */
    public static void setChemCompPathUrlTemplate(String chemCompPathUrlTemplate) {
        DownloadChemCompProvider.chemCompPathUrlTemplate = chemCompPathUrlTemplate;
    }

    /**
     * Get this provider's cache path. If no path has been set yet, it is taken from a new
     * {@link UserConfiguration} and remembered.
     *
     * @return the directory below which the {@value #CHEM_COMP_CACHE_DIRECTORY} directory is located
     */
    public static File getPath() {
        if (path == null) {
            UserConfiguration config = new UserConfiguration();
            path = new File(config.getCacheFilePath());
        }
        return path;
    }

    /**
     * Checks if the chemical components already have been installed into the PDB directory.
     * If not, will download the chemical components definitions file and split it up into small
     * subfiles. Does nothing unless {@link #setDownloadAll(boolean)} has been set to {@code true}.
     */
    public void checkDoFirstInstall() {
        if (!downloadAll) {
            return;
        }

        // this makes sure there is a file separator between every component,
        // if path has a trailing file separator or not, it will work for both cases
        File dir = new File(getPath(), CHEM_COMP_CACHE_DIRECTORY);
        File f = new File(dir, ALL_COMPONENTS_FILE_NAME);

        if (!f.exists()) {
            downloadAllDefinitions();
            return;
        }

        // file exists.. did it get extracted?
        FilenameFilter filter = (dir1, file) -> file.endsWith(".cif.gz");
        String[] files = dir.list(filter);
        // File.list returns null if the directory cannot be read; treat that like "nothing unpacked"
        if (files == null || files.length < MIN_INSTALLED_FILES) {
            // not all did get unpacked
            try {
                split();
            } catch (IOException e) {
                logger.error("Could not split file {} into individual chemical component files. Error: {}",
                        f, e.getMessage());
            }
        }
    }

    /**
     * Splits the file with all definitions into one gzipped file per chemical component. A new record starts at
     * every line beginning with "data_"; the rest of that line is used as the component identifier.
     *
     * @throws IOException if the file cannot be read or one of the component files cannot be written
     */
    private void split() throws IOException {
        logger.info("Installing individual chem comp files ...");

        File dir = new File(getPath(), CHEM_COMP_CACHE_DIRECTORY);
        File f = new File(dir, ALL_COMPONENTS_FILE_NAME);

        int counter = 0;
        InputStreamProvider prov = new InputStreamProvider();

        try (BufferedReader buf = new BufferedReader(
                new InputStreamReader(prov.getInputStream(f), StandardCharsets.UTF_8))) {
            StringWriter writer = new StringWriter();
            String currentID = null;

            String line;
            while ((line = buf.readLine()) != null) {
                if (line.startsWith("data_")) {
                    // a new record found!
                    if (currentID != null) {
                        writeID(writer.toString(), currentID);
                        counter++;
                    }

                    currentID = line.substring(5);
                    writer = new StringWriter();
                }

                writer.append(line);
                writer.append(NEWLINE);
            }

            // write the last record, unless the file did not contain any record at all
            if (currentID != null) {
                writeID(writer.toString(), currentID);
                counter++;
            }
        }

        logger.info("Created {} chemical component files.", counter);
    }

    /**
     * Output chemical contents to a file
     *
     * @param contents File contents
     * @param currentID Chemical ID, used to determine the filename
     * @throws IOException if the file cannot be created or written completely
     */
    private void writeID(String contents, String currentID) throws IOException {
        String localName = getLocalFileName(currentID);
        try (Writer out = openGzipWriter(new File(localName))) {
            out.write(contents);
        }
    }

    /**
     * Opens a gzip-compressing UTF-8 writer on the given file. Unlike a {@code PrintWriter}, the returned writer
     * reports write and close failures as exceptions, so a truncated file cannot go unnoticed.
     *
     * @param target the file to (over)write
     * @return a buffered writer; closing it finishes the gzip stream and closes the file
     * @throws IOException if the file cannot be opened
     */
    private static Writer openGzipWriter(File target) throws IOException {
        FileOutputStream fileOut = new FileOutputStream(target);
        try {
            return new BufferedWriter(
                    new OutputStreamWriter(new GZIPOutputStream(fileOut), StandardCharsets.UTF_8));
        } catch (IOException | RuntimeException e) {
            // creating the gzip stream failed, so nobody else is going to close the file stream
            try {
                fileOut.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Loads the definitions for this {@link ChemComp} from a local file and instantiates a new object. If the local
     * file is missing it is downloaded first. If no usable definition can be obtained that way, the request is
     * passed on to a {@link ReducedChemCompProvider}.
     *
     * @param recordName the ID of the {@link ChemComp}, must not be {@code null}
     * @return a new {@link ChemComp} definition, or {@code null} if the ID is "?"
     */
    @Override
    public ChemComp getChemComp(String recordName) {
        // make sure we work with upper case records; Locale.ROOT keeps the result independent of the default locale
        recordName = recordName.toUpperCase(Locale.ROOT).trim();

        if ("?".equals(recordName)) {
            return null;
        }

        boolean haveFile = true;
        if (fileIsAbsent(recordName)) {
            // check if we should install all components
            checkDoFirstInstall();
        }
        if (fileIsAbsent(recordName)) {
            // either the full installation is not wanted or it did not provide this record,
            // so just do an incremental update
            haveFile = downloadChemCompRecord(recordName);
        }

        // Added check that download was successful and chemical component is available.
        if (haveFile) {
            String filename = getLocalFileName(recordName);
            try {
                ChemComp chemComp;
                try {
                    ChemicalComponentDictionary dict = ChemCompConverter.fromPath(Paths.get(filename));
                    chemComp = dict.getChemComp(recordName);
                } catch (ParsingException e) {
                    // happens for corrupt files
                    chemComp = null;
                }

                // May be null if the file was corrupt. Fall back on ReducedChemCompProvider in that case
                if (chemComp != null) {
                    return chemComp;
                }
            } catch (IOException e) {
                logger.warn("Could not download chemical component file {} for {}. Error: {}. Now trying to use the " +
                        "local chemical component definitions.", filename, recordName, e.getMessage());
            }
        }

        // see https://github.com/biojava/biojava/issues/315
        // probably a network error happened. Try to use the ReducedChemCompProvider
        if (fallback == null) {
            fallback = new ReducedChemCompProvider();
        }

        logger.warn("Falling back to ReducedChemCompProvider for {}. This could indicate a network error.", recordName);
        return fallback.getChemComp(recordName);
    }

    /**
     * Returns the file name that contains the definition for this {@link ChemComp}. The cache directory is created
     * if it does not exist yet. Identifiers listed in {@code protectedIDs} get a "_" prefix in the file name.
     *
     * @param recordName the ID of the {@link ChemComp}
     * @return full path to the file
     */
    public static String getLocalFileName(String recordName) {
        if (protectedIDs.contains(recordName)) {
            recordName = "_" + recordName;
        }

        File dir = new File(getPath(), CHEM_COMP_CACHE_DIRECTORY);
        if (!dir.exists()) {
            logger.info("Creating directory {}", dir);

            // mkdirs also creates missing parents. It returns false if another thread created the directory
            // in the meantime, which is not an error, hence the additional isDirectory check.
            if (!dir.mkdirs() && !dir.isDirectory()) {
                logger.error("Directory {} could not be created", dir);
            }
        }

        return new File(dir, recordName + ".cif.gz").toString();
    }

    /**
     * Tells whether there is no usable local file for the given record. Files shorter than
     * {@link LocalPDBDirectory#MIN_PDB_FILE_SIZE} are deleted and reported as absent.
     *
     * @param recordName the ID of the {@link ChemComp}
     * @return true if the file has to be (re-)downloaded
     */
    private static boolean fileIsAbsent(String recordName) {
        File f = new File(getLocalFileName(recordName));

        // delete files that are too short to have contents
        if (f.length() < LocalPDBDirectory.MIN_PDB_FILE_SIZE) {
            // Delete defensively.
            // Note that if delete is unsuccessful, we re-download the file anyways
            f.delete();
            return true;
        }

        return !f.exists();
    }

    /**
     * Expands the given path URL template, replacing the placeholders as specified in
     * {@link #setChemCompPathUrlTemplate(String)} by the ccdId given (or its substrings, if indices are present in
     * the template)
     *
     * @param templateStr the template string with placeholders for ccd ids
     * @param ccdId the ccd id to replace (in full or a substring)
     * @return the input templateStr with placeholders replaced
     */
    static String expandPathUrlTemplate(String templateStr, String ccdId) {
        Matcher m = CCD_ID_TEMPLATE_REGEX.matcher(templateStr);
        StringBuilder output = new StringBuilder();
        int lastIndex = 0;
        while (m.find()) {
            output.append(templateStr, lastIndex, m.start())
                    .append(substituteCcdId(m.group(1), templateStr, ccdId));
            lastIndex = m.end();
            // TODO when we upgrade to java 11, use the new methods introduced in java 9, see
            //  https://stackoverflow.com/questions/9605716/java-regular-expression-find-and-replace
        }
        // copy whatever follows the last placeholder (the whole template if there was none)
        output.append(templateStr, lastIndex, templateStr.length());
        return output.toString();
    }

    /**
     * Computes the replacement for a single placeholder.
     *
     * @param indicesStr the indices part of the placeholder ("N", "-N", "+N" or "B_E"), null for plain "{ccd_id}"
     * @param templateStr the complete template, only used for logging
     * @param ccdId the ccd id
     * @return the requested substring of ccdId, or ccdId itself if there are no indices or they are not usable
     */
    private static String substituteCcdId(String indicesStr, String templateStr, String ccdId) {
        if (indicesStr == null) {
            // no substringing
            return ccdId;
        }
        try {
            int separator = indicesStr.indexOf('_');
            if (separator >= 0) {
                // start and end index
                int begIdx = Integer.parseInt(indicesStr.substring(0, separator));
                int endIdx = Integer.parseInt(indicesStr.substring(separator + 1));
                return ccdId.substring(begIdx, endIdx);
            }
            // left/right substring
            int idx = Integer.parseInt(indicesStr);
            if (idx < 0) { // right substring
                return ccdId.substring(ccdId.length() + idx);
            }
            return ccdId.substring(0, idx); // left substring
        } catch (IndexOutOfBoundsException | NumberFormatException e) {
            // NumberFormatException: the regex accepts digit runs that do not fit into an int.
            // Such an index is necessarily out of bounds, so it is handled like any other bad index.
            logger.debug("Indices included in path URL template {} are out of bounds for string {}", templateStr, ccdId);
            return ccdId;
        }
    }

    /**
     * Downloads the definition of a single chemical component into the local cache. The content is first written to
     * a temporary file and only moved to its final location once the download has completed.
     *
     * @param recordName : three-letter name
     * @return true if successful download
     */
    private static boolean downloadChemCompRecord(String recordName) {
        String localName = getLocalFileName(recordName);
        File newFile;
        try {
            newFile = Files.createTempFile("chemcomp" + recordName, "cif").toFile();
            logger.debug("Will write chem comp file to temp file {}", newFile);
        } catch (IOException e) {
            logger.error("Could not write to temp directory {} to create the chemical component download temp file", System.getProperty("java.io.tmpdir"));
            return false;
        }

        String u = serverBaseUrl + expandPathUrlTemplate(chemCompPathUrlTemplate, recordName);

        logger.debug("Downloading chem comp definition from {}", u);

        try {
            URL url = new URL(u);
            URLConnection uconn = URLConnectionTools.openURLConnection(url);

            try (Writer out = openGzipWriter(newFile);
                 BufferedReader fileBuffer = new BufferedReader(
                         new InputStreamReader(uconn.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                boolean success = false;
                while ((line = fileBuffer.readLine()) != null) {
                    out.write(line);
                    out.write(NEWLINE);
                    success = true;
                }
                if (!success) {
                    throw new IOException("Malformed URL or no content found in " + url);
                }
            }
            // Now we move this across to where it actually wants to be
            Files.move(newFile.toPath(), Paths.get(localName), StandardCopyOption.REPLACE_EXISTING);

            return true;
        } catch (IOException e) {
            // log the URL string rather than the URL object, which does not exist if the URL was malformed
            logger.error("Could not download {} OR store locally to {} Error ={}",
                    u,
                    localName,
                    e.getMessage());
            // best effort clean-up of the partial temp file
            newFile.delete();
        }
        return false;
    }

    /**
     * Downloads the file with all definitions and splits it into individual files. Only one thread performs the
     * installation at a time: a thread that finds an installation in progress waits until it has ended and then
     * returns without installing anything itself.
     */
    private void downloadAllDefinitions() {
        // atomically claim the installation, so that two threads can never both start it
        if (!loading.compareAndSet(false, true)) {
            logger.info("Waiting for other thread to install chemical components...");

            // another thread is already downloading the components definitions
            // wait for the other thread to finish...
            while (loading.get()) {
                try {
                    // wait half a second
                    Thread.sleep(500);
                } catch (InterruptedException e) {
                    // restore the interrupt flag for the caller and stop waiting
                    Thread.currentThread().interrupt();
                    logger.error("Thread interrupted {}", e.getMessage());
                    return;
                }
            }

            logger.info("Another thread installed the chemical components.");
            return;
        }

        try {
            long timeS = System.currentTimeMillis();

            logger.info("Performing first installation of chemical components.");
            logger.info("Downloading components.cif.gz ...");

            try {
                AllChemCompProvider.downloadFile();
            } catch (IOException e) {
                logger.error("Could not download the all chemical components file. Error: {}. "
                        + "Chemical components information won't be available", e.getMessage());
                // no point in trying to split if the file could not be downloaded
                return;
            }
            try {
                split();
            } catch (IOException e) {
                logger.error("Could not split all chem comp file into individual chemical component files. Error: {}",
                        e.getMessage());
                // no point in reporting time
                return;
            }
            long timeE = System.currentTimeMillis();
            logger.info("time to install chem comp dictionary: {} sec.", (timeE - timeS) / 1000);
        } finally {
            // release the flag on every path, including unexpected runtime exceptions
            loading.set(false);
        }
    }

    /**
     * By default this provider will download only some of the {@link ChemComp} files.
     * The user has to request that all files should be downloaded by setting this parameter to true.
     *
     * @return flag if the all components should be downloaded and installed at startup. (default: false)
     */
    public boolean isDownloadAll() {
        return downloadAll;
    }

    /**
     * By default this provider will download only some of the {@link ChemComp} files.
     * The user has to request that all files should be downloaded by setting this parameter to true.
     *
     * @param downloadAll if the all components should be downloaded and installed at startup. (default: false)
     */
    public void setDownloadAll(boolean downloadAll) {
        this.downloadAll = downloadAll;
    }
}