001package org.biojava.nbio.structure.chem; 002 003import org.biojava.nbio.core.util.InputStreamProvider; 004import org.biojava.nbio.structure.align.util.URLConnectionTools; 005import org.biojava.nbio.structure.align.util.UserConfiguration; 006import org.biojava.nbio.structure.io.LocalPDBDirectory; 007import org.biojava.nbio.structure.io.cif.ChemCompConverter; 008import org.rcsb.cif.ParsingException; 009import org.slf4j.Logger; 010import org.slf4j.LoggerFactory; 011 012import java.io.BufferedReader; 013import java.io.File; 014import java.io.FileOutputStream; 015import java.io.FilenameFilter; 016import java.io.IOException; 017import java.io.InputStreamReader; 018import java.io.PrintWriter; 019import java.io.StringWriter; 020import java.net.URL; 021import java.net.URLConnection; 022import java.nio.file.Files; 023import java.nio.file.Paths; 024import java.nio.file.StandardCopyOption; 025import java.util.ArrayList; 026import java.util.List; 027import java.util.concurrent.atomic.AtomicBoolean; 028import java.util.regex.Matcher; 029import java.util.regex.Pattern; 030import java.util.zip.GZIPOutputStream; 031 032/** 033 * This provider of chemical components can download and cache chemical component definition files from the RCSB PDB web 034 * site. It is the default way to access these definitions. If this provider is called he first time, it will download 035 * and install all chemical component definitions in a local directory. Once the definition files have been installed, 036 * it has quick startup time and low memory requirements. 037 * 038 * An alternative provider, that keeps all definitions in memory is the {@link AllChemCompProvider}. Another provider, 039 * that does not require any network access, but only can support a limited set of chemical component definitions, is 040 * the {@link ReducedChemCompProvider}. 041 * 042 * @author Andreas Prlic 043 */ 044public class DownloadChemCompProvider implements ChemCompProvider { 045 private static final Logger logger = LoggerFactory.getLogger(DownloadChemCompProvider.class); 046 047 private static final String NEWLINE = System.getProperty("line.separator"); 048 049 public static final String CHEM_COMP_CACHE_DIRECTORY = "chemcomp"; 050 public static final String DEFAULT_SERVER_URL = "https://files.rcsb.org/ligands/download/"; 051 public static final String DEFAULT_CHEMCOMP_PATHURL_TEMPLATE = "{ccd_id}.cif"; 052 053 /** 054 * The base URL to which the full path specified via {@link #setChemCompPathUrlTemplate(String)} is appended. 055 * It is assumed that it has a trailing slash. 056 */ 057 public static String serverBaseUrl = DEFAULT_SERVER_URL; 058 059 private static File path; 060 061 private static String chemCompPathUrlTemplate = DEFAULT_CHEMCOMP_PATHURL_TEMPLATE; 062 063 static final Pattern CCD_ID_TEMPLATE_REGEX = Pattern.compile("\\{ccd_id(?::(\\d+_\\d+|[-+]?\\d+))?}"); 064 065 066 // flags to make sure there is only one thread running that is loading the dictionary 067 static AtomicBoolean loading = new AtomicBoolean(false); 068 069 static final List<String> protectedIDs = new ArrayList<>(); 070 static { 071 protectedIDs.add("CON"); 072 protectedIDs.add("PRN"); 073 protectedIDs.add("AUX"); 074 protectedIDs.add("NUL"); 075 } 076 077 private static ChemCompProvider fallback = null; // Fallback provider if the download fails 078 079 /** 080 * by default we will download only some of the files. User has to request that all files should be downloaded... 081 */ 082 boolean downloadAll = false; 083 084 public DownloadChemCompProvider() { 085 this(null); 086 } 087 088 public DownloadChemCompProvider(String cacheFilePath) { 089 logger.debug("Initialising DownloadChemCompProvider"); 090 091 // note that path is static, so this is just to make sure that all non-static methods will have path initialised 092 if (cacheFilePath != null) { 093 path = new File(cacheFilePath); 094 } 095 } 096 097 /** 098 * Set the base URL for the location of all chemical component CIF files, to which the chemCompPathUrlTemplate 099 * is appended, settable in {@link #setChemCompPathUrlTemplate(String)}. A trailing slash is appended 100 * if not present. 101 */ 102 public static void setServerBaseUrl(String serverBaseUrl) { 103 if (!serverBaseUrl.endsWith("/")) { 104 serverBaseUrl = serverBaseUrl + "/"; 105 } 106 DownloadChemCompProvider.serverBaseUrl = serverBaseUrl; 107 } 108 109 /** 110 * Set the path to append to the serverBaseUrl (settable in {@link #setServerBaseUrl(String)}). 111 * The string can contain placeholders that will be expanded at runtime: 112 * <li>"{ccd_id}" to be replaced by the chemical component identifier, in capitals</li> 113 * <li>"{ccd_id:beginIndex-endIndex}" to be replaced by a substring of the chemical component identifier in capitals, 114 * with indices following the same convention as {@link String#substring(int, int)} </li> 115 * <li>"{ccd_id:index}" to be replaced by a substring of the chemical component identifier in capitals, 116 * with index either a positive or negative integer to substring from left or right of the string respectively.</li> 117 * If any of the indices are off-bounds, then the full chemical component identifier is replaced 118 */ 119 public static void setChemCompPathUrlTemplate(String chemCompPathUrlTemplate) { 120 DownloadChemCompProvider.chemCompPathUrlTemplate = chemCompPathUrlTemplate; 121 } 122 123 /** 124 * Get this provider's cache path 125 * @return 126 */ 127 public static File getPath() { 128 if (path == null) { 129 UserConfiguration config = new UserConfiguration(); 130 path = new File(config.getCacheFilePath()); 131 } 132 return path; 133 } 134 135 /** 136 * Checks if the chemical components already have been installed into the PDB directory. 137 * If not, will download the chemical components definitions file and split it up into small 138 * subfiles. 139 */ 140 public void checkDoFirstInstall() { 141 if (!downloadAll) { 142 return; 143 } 144 145 // this makes sure there is a file separator between every component, 146 // if path has a trailing file separator or not, it will work for both cases 147 File dir = new File(getPath(), CHEM_COMP_CACHE_DIRECTORY); 148 File f = new File(dir, "components.cif.gz"); 149 150 if (!f.exists()) { 151 downloadAllDefinitions(); 152 } else { 153 // file exists.. did it get extracted? 154 FilenameFilter filter = (dir1, file) -> file.endsWith(".cif.gz"); 155 String[] files = dir.list(filter); 156 if (files.length < 500) { 157 // not all did get unpacked 158 try { 159 split(); 160 } catch (IOException e) { 161 logger.error("Could not split file {} into individual chemical component files. Error: {}", 162 f.toString(), e.getMessage()); 163 } 164 } 165 } 166 } 167 168 private void split() throws IOException { 169 logger.info("Installing individual chem comp files ..."); 170 171 File dir = new File(getPath(), CHEM_COMP_CACHE_DIRECTORY); 172 File f = new File(dir, "components.cif.gz"); 173 174 int counter = 0; 175 InputStreamProvider prov = new InputStreamProvider(); 176 177 try (BufferedReader buf = new BufferedReader (new InputStreamReader(prov.getInputStream(f)))) { 178 String line; 179 line = buf.readLine (); 180 StringWriter writer = new StringWriter(); 181 182 String currentID = null; 183 while (line != null) { 184 if (line.startsWith("data_")) { 185 // a new record found! 186 187 if (currentID != null) { 188 writeID(writer.toString(), currentID); 189 counter++; 190 } 191 192 currentID = line.substring(5); 193 writer = new StringWriter(); 194 } 195 196 writer.append(line); 197 writer.append(NEWLINE); 198 199 line = buf.readLine(); 200 } 201 202 // write the last record... 203 writeID(writer.toString(), currentID); 204 counter++; 205 } 206 207 logger.info("Created {} chemical component files.", counter); 208 } 209 210 /** 211 * Output chemical contents to a file 212 * @param contents File contents 213 * @param currentID Chemical ID, used to determine the filename 214 * @throws IOException 215 */ 216 private void writeID(String contents, String currentID) throws IOException { 217 String localName = getLocalFileName(currentID); 218 try (PrintWriter pw = new PrintWriter(new GZIPOutputStream(new FileOutputStream(localName)))) { 219 pw.print(contents); 220 pw.flush(); 221 } 222 } 223 224 /** 225 * Loads the definitions for this {@link ChemComp} from a local file and instantiates a new object. 226 * 227 * @param recordName the ID of the {@link ChemComp} 228 * @return a new {@link ChemComp} definition. 229 */ 230 @Override 231 public ChemComp getChemComp(String recordName) { 232 // make sure we work with upper case records 233 recordName = recordName.toUpperCase().trim(); 234 235 boolean haveFile = true; 236 if (recordName.equals("?")) { 237 return null; 238 } 239 240 if (fileIsAbsent(recordName)) { 241 // check if we should install all components 242 checkDoFirstInstall(); 243 } 244 if (fileIsAbsent(recordName)) { 245 // we previously have installed already the definitions, 246 // just do an incrememntal update 247 haveFile = downloadChemCompRecord(recordName); 248 } 249 250 // Added check that download was successful and chemical component is available. 251 if (haveFile) { 252 String filename = getLocalFileName(recordName); 253 try { 254 ChemComp chemComp; 255 try { 256 ChemicalComponentDictionary dict = ChemCompConverter.fromPath(Paths.get(filename)); 257 chemComp = dict.getChemComp(recordName); 258 } catch (ParsingException e) { 259 // happens for corrupt files 260 chemComp = null; 261 } 262 263 // May be null if the file was corrupt. Fall back on ReducedChemCompProvider in that case 264 if (chemComp != null) { 265 return chemComp; 266 } 267 } catch (IOException e) { 268 logger.warn("Could not download chemical component file {} for {}. Error: {}. Now trying to use the " + 269 "local chemical component definitions.", filename, recordName, e.getMessage()); 270 } 271 } 272 273 // see https://github.com/biojava/biojava/issues/315 274 // probably a network error happened. Try to use the ReducedChemCOmpProvider 275 if (fallback == null) { 276 fallback = new ReducedChemCompProvider(); 277 } 278 279 logger.warn("Falling back to ReducedChemCompProvider for {}. This could indicate a network error.", recordName); 280 return fallback.getChemComp(recordName); 281 } 282 283 /** 284 * Returns the file name that contains the definition for this {@link ChemComp} 285 * 286 * @param recordName the ID of the {@link ChemComp} 287 * @return full path to the file 288 */ 289 public static String getLocalFileName(String recordName) { 290 if (protectedIDs.contains(recordName)) { 291 recordName = "_" + recordName; 292 } 293 294 File f = new File(getPath(), CHEM_COMP_CACHE_DIRECTORY); 295 if (!f.exists()) { 296 logger.info("Creating directory {}", f); 297 298 boolean success = f.mkdir(); 299 // we've checked in initPath that path is writable, so there's no need to check if it succeeds 300 // in the unlikely case that in the meantime it isn't writable at least we log an error 301 if (!success) { 302 logger.error("Directory {} could not be created", f); 303 } 304 } 305 306 File theFile = new File(f, recordName + ".cif.gz"); 307 return theFile.toString(); 308 } 309 310 private static boolean fileIsAbsent(String recordName) { 311 String fileName = getLocalFileName(recordName); 312 File f = new File(fileName); 313 314 // delete files that are too short to have contents 315 if (f.length() < LocalPDBDirectory.MIN_PDB_FILE_SIZE) { 316 // Delete defensively. 317 // Note that if delete is unsuccessful, we re-download the file anyways 318 f.delete(); 319 return true; 320 } 321 322 return !f.exists(); 323 } 324 325 /** 326 * Expands the given path URL template, replacing the placeholders as specified in {@link #setChemCompPathUrlTemplate(String)} 327 * by the ccdId given (or its substrings, if indices are present in the template) 328 * @param templateStr the template string with placeholders for ccd ids 329 * @param ccdId the ccd id to replace (in full or a substring) 330 * @return the input templateStr with placeholders replaced 331 */ 332 static String expandPathUrlTemplate(String templateStr, String ccdId) { 333 Matcher m = CCD_ID_TEMPLATE_REGEX.matcher(templateStr); 334 StringBuilder output = new StringBuilder(); 335 int lastIndex = 0; 336 while (m.find()) { 337 String repString = ccdId; 338 String indicesStr = m.group(1); 339 try { 340 if (indicesStr == null) { 341 // no substringing 342 repString = ccdId; 343 } else if (!indicesStr.contains("_")) { 344 // left/right substring 345 int idx = Integer.parseInt(indicesStr); 346 if (idx < 0) { // right substring 347 repString = ccdId.substring(ccdId.length() + idx); 348 } else { // left substring 349 repString = ccdId.substring(0, idx); 350 } 351 } else if (indicesStr.contains("_")) { 352 // start and end index 353 String[] tokens = indicesStr.split("_"); 354 int begIdx = Integer.parseInt(tokens[0]); 355 int endIdx = Integer.parseInt(tokens[1]); 356 repString = ccdId.substring(begIdx, endIdx); 357 } 358 } catch (IndexOutOfBoundsException e) { 359 // we don't set repString, it keeps original value ccdId 360 logger.debug("Indices included in path URL template {} are out of bounds for string {}", templateStr, ccdId); 361 } 362 output.append(templateStr, lastIndex, m.start()).append(repString); 363 364 lastIndex = m.end(); 365 // TODO when we upgrade to java 11, use the new methods introduced in java 9, see https://stackoverflow.com/questions/9605716/java-regular-expression-find-and-replace 366 } 367 if (lastIndex < templateStr.length()) { 368 output.append(templateStr, lastIndex, templateStr.length()); 369 } 370 return output.toString(); 371 } 372 373 /** 374 * @param recordName : three-letter name 375 * @return true if successful download 376 */ 377 private static boolean downloadChemCompRecord(String recordName) { 378 String localName = getLocalFileName(recordName); 379 File newFile; 380 try { 381 newFile = File.createTempFile("chemcomp" + recordName, "cif"); 382 logger.debug("Will write chem comp file to temp file {}", newFile.toString()); 383 } catch(IOException e) { 384 logger.error("Could not write to temp directory {} to create the chemical component download temp file", System.getProperty("java.io.tmpdir")); 385 return false; 386 } 387 388 String u = serverBaseUrl + expandPathUrlTemplate(chemCompPathUrlTemplate, recordName); 389 390 logger.debug("Downloading chem comp definition from {}", u); 391 392 URL url = null; 393 try { 394 url = new URL(u); 395 URLConnection uconn = URLConnectionTools.openURLConnection(url); 396 397 try (PrintWriter pw = new PrintWriter(new GZIPOutputStream(new FileOutputStream(newFile))); 398 BufferedReader fileBuffer = new BufferedReader(new InputStreamReader(uconn.getInputStream()))) { 399 String line; 400 boolean success = false; 401 while ((line = fileBuffer.readLine()) != null) { 402 pw.println(line); 403 success = true; 404 } 405 if(!success) { 406 throw new IOException("Malformed URL or no content found in "+url.toString()); 407 } 408 409 pw.flush(); 410 } 411 // Now we move this across to where it actually wants to be 412 Files.move(newFile.toPath(), Paths.get(localName), StandardCopyOption.REPLACE_EXISTING); 413 414 return true; 415 } catch (IOException e) { 416 logger.error("Could not download {} OR store locally to {} Error ={}", 417 url, 418 localName, 419 e.getMessage()); 420 newFile.delete(); 421 } 422 return false; 423 } 424 425 private void downloadAllDefinitions() { 426 if (loading.get()) { 427 logger.info("Waiting for other thread to install chemical components..."); 428 } 429 430 while (loading.get()) { 431 // another thread is already downloading the components definitions 432 // wait for the other thread to finish... 433 try { 434 // wait half a second 435 Thread.sleep(500); 436 } catch (InterruptedException e) { 437 //e.printStackTrace(); 438 logger.error("Thread interrupted "+e.getMessage()); 439 } 440 441 logger.info("Another thread installed the chemical components."); 442 return; 443 } 444 445 loading.set(true); 446 long timeS = System.currentTimeMillis(); 447 448 logger.info("Performing first installation of chemical components."); 449 logger.info("Downloading components.cif.gz ..."); 450 451 try { 452 AllChemCompProvider.downloadFile(); 453 } catch (IOException e) { 454 logger.error("Could not download the all chemical components file. Error: {}. " 455 + "Chemical components information won't be available", e.getMessage()); 456 // no point in trying to split if the file could not be downloaded 457 loading.set(false); 458 return; 459 } 460 try { 461 split(); 462 } catch (IOException e) { 463 logger.error("Could not split all chem comp file into individual chemical component files. Error: {}", 464 e.getMessage()); 465 // no point in reporting time 466 loading.set(false); 467 return; 468 } 469 long timeE = System.currentTimeMillis(); 470 logger.info("time to install chem comp dictionary: " + (timeE - timeS) / 1000 + " sec."); 471 loading.set(false); 472 } 473 474 /** 475 * By default this provider will download only some of the {@link ChemComp} files. 476 * The user has to request that all files should be downloaded by setting this parameter to true. 477 * 478 * @return flag if the all components should be downloaded and installed at startup. (default: false) 479 */ 480 public boolean isDownloadAll() { 481 return downloadAll; 482 } 483 484 /** By default this provider will download only some of the {@link ChemComp} files. 485 * The user has to request that all files should be downloaded by setting this parameter to true. 486 * 487 * @param downloadAll if the all components should be downloaded and installed at startup. (default: false) 488 */ 489 public void setDownloadAll(boolean downloadAll) { 490 this.downloadAll = downloadAll; 491 } 492}