001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.structure.io.mmcif; 022 023import java.io.BufferedReader; 024import java.io.File; 025import java.io.FileOutputStream; 026import java.io.FilenameFilter; 027import java.io.IOException; 028import java.io.InputStream; 029import java.io.InputStreamReader; 030import java.io.PrintWriter; 031import java.io.StringWriter; 032import java.net.URL; 033import java.net.URLConnection; 034import java.nio.file.Files; 035import java.nio.file.Paths; 036import java.nio.file.StandardCopyOption; 037import java.util.ArrayList; 038import java.util.List; 039import java.util.concurrent.atomic.AtomicBoolean; 040import java.util.zip.GZIPOutputStream; 041 042import org.biojava.nbio.core.util.InputStreamProvider; 043import org.biojava.nbio.structure.align.util.URLConnectionTools; 044import org.biojava.nbio.structure.align.util.UserConfiguration; 045import org.biojava.nbio.structure.io.LocalPDBDirectory; 046import org.biojava.nbio.structure.io.mmcif.model.ChemComp; 047import org.slf4j.Logger; 048import org.slf4j.LoggerFactory; 049 050 051 052/** 053 * This provider of chemical components can download and cache chemical component definition files from the RCSB PDB web site. 054 * It is the default way to access these definitions. 055 * If this provider is called he first time, it will download and install all chemical 056 * component definitions in a local directory. 057 * Once the definition files have been installed, it has quick startup time and low memory requirements. 058 * 059 * An alternative provider, that keeps all definitions in memory is the {@link AllChemCompProvider}. Another provider, that 060 * does not require any network access, but only can support a limited set of chemical component definitions, is the {@link ReducedChemCompProvider}. 061 * 062 * 063 * @author Andreas Prlic 064 * 065 */ 066public class DownloadChemCompProvider implements ChemCompProvider { 067 068 private static final Logger logger = LoggerFactory.getLogger(DownloadChemCompProvider.class); 069 070 public static final String CHEM_COMP_CACHE_DIRECTORY = "chemcomp"; 071 072 public static final String DEFAULT_SERVER_URL = "http://files.rcsb.org/ligands/download/"; 073 074 public static String serverBaseUrl = DEFAULT_SERVER_URL; 075 076 /** 077 * Use default RCSB server layout (true) or internal RCSB server layout (false) 078 */ 079 public static boolean useDefaultUrlLayout = true; 080 081 082 private static File path; 083 //private static final String FILE_SEPARATOR = System.getProperty("file.separator"); 084 private static final String NEWLINE = System.getProperty("line.separator"); 085 086 087 // flags to make sure there is only one thread running that is loading the dictionary 088 static AtomicBoolean loading = new AtomicBoolean(false); 089 090 static final List<String> protectedIDs = new ArrayList<String> (); 091 static { 092 protectedIDs.add("CON"); 093 protectedIDs.add("PRN"); 094 protectedIDs.add("AUX"); 095 protectedIDs.add("NUL"); 096 } 097 098 private static ChemCompProvider fallback = null; // Fallback provider if the download fails 099 100 /** by default we will download only some of the files. User has to request that all files should be downloaded... 101 * 102 */ 103 boolean downloadAll = false; 104 105 public DownloadChemCompProvider(){ 106 this(null); 107 } 108 109 public DownloadChemCompProvider(String cacheFilePath){ 110 logger.debug("Initialising DownloadChemCompProvider"); 111 112 // note that path is static, so this is just to make sure that all non-static methods will have path initialised 113 if(cacheFilePath != null) { 114 path = new File(cacheFilePath); 115 } 116 } 117 118 /** 119 * Get this provider's cache path 120 * @return 121 */ 122 public static File getPath(){ 123 if (path==null) { 124 UserConfiguration config = new UserConfiguration(); 125 path = new File(config.getCacheFilePath()); 126 } 127 return path; 128 } 129 130 /** 131 * Checks if the chemical components already have been installed into the PDB directory. 132 * If not, will download the chemical components definitions file and split it up into small 133 * subfiles. 134 */ 135 public void checkDoFirstInstall(){ 136 137 if ( ! downloadAll ) { 138 return; 139 } 140 141 142 // this makes sure there is a file separator between every component, 143 // if path has a trailing file separator or not, it will work for both cases 144 File dir = new File(getPath(), CHEM_COMP_CACHE_DIRECTORY); 145 File f = new File(dir, "components.cif.gz"); 146 147 if ( ! f.exists()) { 148 149 downloadAllDefinitions(); 150 151 } else { 152 // file exists.. did it get extracted? 153 154 FilenameFilter filter =new FilenameFilter() { 155 156 @Override 157 public boolean accept(File dir, String file) { 158 return file.endsWith(".cif.gz"); 159 } 160 }; 161 String[] files = dir.list(filter); 162 if ( files.length < 500) { 163 // not all did get unpacked 164 try { 165 split(); 166 } catch (IOException e) { 167 logger.error("Could not split file {} into individual chemical component files. Error: {}", 168 f.toString(), e.getMessage()); 169 } 170 } 171 } 172 } 173 174 private void split() throws IOException { 175 176 logger.info("Installing individual chem comp files ..."); 177 178 File dir = new File(getPath(), CHEM_COMP_CACHE_DIRECTORY); 179 File f = new File(dir, "components.cif.gz"); 180 181 182 int counter = 0; 183 InputStreamProvider prov = new InputStreamProvider(); 184 185 try( BufferedReader buf = new BufferedReader (new InputStreamReader (prov.getInputStream(f))); 186 ) { 187 String line = null; 188 line = buf.readLine (); 189 StringWriter writer = new StringWriter(); 190 191 String currentID = null; 192 while (line != null){ 193 194 if ( line.startsWith("data_")) { 195 // a new record found! 196 197 if ( currentID != null) { 198 writeID(writer.toString(), currentID); 199 counter++; 200 } 201 202 currentID = line.substring(5); 203 writer = new StringWriter(); 204 } 205 206 writer.append(line); 207 writer.append(NEWLINE); 208 209 line = buf.readLine (); 210 } 211 212 // write the last record... 213 writeID(writer.toString(),currentID); 214 counter++; 215 216 } 217 218 logger.info("Created " + counter + " chemical component files."); 219 } 220 221 /** 222 * Output chemical contents to a file 223 * @param contents File contents 224 * @param currentID Chemical ID, used to determine the filename 225 * @throws IOException 226 */ 227 private void writeID(String contents, String currentID) throws IOException{ 228 229 String localName = getLocalFileName(currentID); 230 231 try ( PrintWriter pw = new PrintWriter(new GZIPOutputStream(new FileOutputStream(localName))) ) { 232 233 pw.print(contents); 234 pw.flush(); 235 } 236 } 237 238 /** 239 * Loads the definitions for this {@link ChemComp} from a local file and instantiates a new object. 240 * 241 * @param recordName the ID of the {@link ChemComp} 242 * @return a new {@link ChemComp} definition. 243 */ 244 @Override 245 public ChemComp getChemComp(String recordName) { 246 247 // make sure we work with upper case records 248 recordName = recordName.toUpperCase().trim(); 249 250 boolean haveFile = true; 251 if ( recordName.equals("?")){ 252 return null; 253 } 254 255 if ( ! fileExists(recordName)) { 256 // check if we should install all components 257 checkDoFirstInstall(); 258 } 259 if ( ! fileExists(recordName)) { 260 // we previously have installed already the definitions, 261 // just do an incrememntal update 262 haveFile = downloadChemCompRecord(recordName); 263 } 264 265 // Added check that download was successful and chemical component is available. 266 if (haveFile) { 267 String filename = getLocalFileName(recordName); 268 InputStream inStream = null; 269 try { 270 271 InputStreamProvider isp = new InputStreamProvider(); 272 273 inStream = isp.getInputStream(filename); 274 275 MMcifParser parser = new SimpleMMcifParser(); 276 277 ChemCompConsumer consumer = new ChemCompConsumer(); 278 279 // The Consumer builds up the BioJava - structure object. 280 // you could also hook in your own and build up you own data model. 281 parser.addMMcifConsumer(consumer); 282 283 parser.parse(new BufferedReader(new InputStreamReader(inStream))); 284 285 ChemicalComponentDictionary dict = consumer.getDictionary(); 286 287 ChemComp chemComp = dict.getChemComp(recordName); 288 289 // May be null if the file was corrupt. Fall back on ReducedChemCompProvider in that case 290 if(chemComp != null) { 291 return chemComp; 292 } 293 294 } catch (IOException e) { 295 296 logger.warn( 297 "Could not download chemical component file {} for {}. Error: {}. Now trying to use the local chemical component definitions.", 298 filename, recordName, e.getMessage()); 299 300 } 301 finally{ 302 // Now close it 303 if(inStream!=null){ 304 try { 305 inStream.close(); 306 } catch (IOException e) { 307 // This would be weird... 308 logger.error("Could not close chemical component file {}. A resource leak could occur!!", filename); 309 } 310 } 311 312 } 313 } 314 315 // see https://github.com/biojava/biojava/issues/315 316 // probably a network error happened. Try to use the ReducedChemCOmpProvider 317 if( fallback == null) { 318 fallback = new ReducedChemCompProvider(); 319 } 320 321 logger.warn("Falling back to ReducedChemCompProvider for {}. This could indicate a network error.", recordName); 322 return fallback.getChemComp(recordName); 323 324 } 325 326 /** 327 * Returns the file name that contains the definition for this {@link ChemComp} 328 * 329 * @param recordName the ID of the {@link ChemComp} 330 * @return full path to the file 331 */ 332 public static String getLocalFileName(String recordName){ 333 334 if ( protectedIDs.contains(recordName)){ 335 recordName = "_" + recordName; 336 } 337 338 File f = new File(getPath(), CHEM_COMP_CACHE_DIRECTORY); 339 if (! f.exists()){ 340 logger.info("Creating directory " + f); 341 342 boolean success = f.mkdir(); 343 // we've checked in initPath that path is writable, so there's no need to check if it succeeds 344 // in the unlikely case that in the meantime it isn't writable at least we log an error 345 if (!success) 346 logger.error("Directory {} could not be created",f); 347 348 } 349 350 File theFile = new File(f,recordName + ".cif.gz"); 351 352 return theFile.toString(); 353 } 354 355 private static boolean fileExists(String recordName){ 356 357 String fileName = getLocalFileName(recordName); 358 359 File f = new File(fileName); 360 361 // delete files that are too short to have contents 362 if( f.length() < LocalPDBDirectory.MIN_PDB_FILE_SIZE ) { 363 // Delete defensively. 364 // Note that if delete is unsuccessful, we re-download the file anyways 365 f.delete(); 366 return false; 367 } 368 369 return f.exists(); 370 371 } 372 373 /** 374 * @param recordName : three-letter name 375 * @return true if successful download 376 */ 377 private static boolean downloadChemCompRecord(String recordName) { 378 379 String localName = getLocalFileName(recordName); 380 File newFile; 381 try{ 382 newFile = File.createTempFile("chemcomp"+recordName, "cif"); 383 logger.debug("Will write chem comp file to temp file {}", newFile.toString()); 384 } 385 catch(IOException e){ 386 logger.error("Could not write to temp directory {} to create the chemical component download temp file", System.getProperty("java.io.tmpdir")); 387 return false; 388 } 389 String u; 390 if(useDefaultUrlLayout){ 391 u = serverBaseUrl + recordName + ".cif"; 392 } 393 else{ 394 u = serverBaseUrl + recordName.charAt(0) + "/" + recordName +"/" + recordName + ".cif"; 395 } 396 397 logger.debug("downloading " + u); 398 399 URL url = null; 400 401 402 try { 403 url = new URL(u); 404 URLConnection uconn = URLConnectionTools.openURLConnection(url); 405 406 try( PrintWriter pw = new PrintWriter(new GZIPOutputStream(new FileOutputStream(newFile))); 407 BufferedReader fileBuffer = new BufferedReader(new InputStreamReader(uconn.getInputStream())); 408 ) { 409 410 String line; 411 412 while ((line = fileBuffer.readLine()) != null) { 413 pw.println(line); 414 } 415 416 pw.flush(); 417 } 418 // Now we move this across to where it actually wants to be 419 Files.move(newFile.toPath(), Paths.get(localName), StandardCopyOption.REPLACE_EXISTING); 420 421 return true; 422 } catch (IOException e){ 423 logger.error("Could not download "+url.toString()+" OR store locally to "+localName+" Error ="+e.getMessage()); 424 newFile.delete(); 425 } 426 return false; 427 } 428 429 private void downloadAllDefinitions() { 430 431 if ( loading.get()){ 432 logger.info("Waiting for other thread to install chemical components..."); 433 } 434 435 while ( loading.get() ) { 436 437 // another thread is already downloading the components definitions 438 // wait for the other thread to finish... 439 440 try { 441 // wait half a second 442 443 Thread.sleep(500); 444 } catch (InterruptedException e) { 445 //e.printStackTrace(); 446 logger.error("Thread interrupted "+e.getMessage()); 447 } 448 449 logger.info("Another thread installed the chemical components."); 450 return; 451 452 } 453 454 loading.set(true); 455 long timeS = System.currentTimeMillis(); 456 457 logger.info("Performing first installation of chemical components."); 458 logger.info("Downloading components.cif.gz ..."); 459 460 461 try { 462 AllChemCompProvider.downloadFile(); 463 } catch (IOException e){ 464 logger.error("Could not download the all chemical components file. Error: {}. " 465 + "Chemical components information won't be available", e.getMessage()); 466 // no point in trying to split if the file could not be downloaded 467 loading.set(false); 468 return; 469 } 470 try { 471 split(); 472 } catch (IOException e) { 473 logger.error("Could not split all chem comp file into individual chemical component files. Error: {}", 474 e.getMessage()); 475 // no point in reporting time 476 loading.set(false); 477 return; 478 } 479 long timeE = System.currentTimeMillis(); 480 logger.info("time to install chem comp dictionary: " + (timeE - timeS) / 1000 + " sec."); 481 loading.set(false); 482 483 } 484 485 /** By default this provider will download only some of the {@link ChemComp} files. 486 * The user has to request that all files should be downloaded by setting this parameter to true. 487 * 488 * @return flag if the all components should be downloaded and installed at startup. (default: false) 489 */ 490 public boolean isDownloadAll() { 491 return downloadAll; 492 } 493 494 /** By default this provider will download only some of the {@link ChemComp} files. 495 * The user has to request that all files should be downloaded by setting this parameter to true. 496 * 497 * @param flag if the all components should be downloaded and installed at startup. (default: false) 498 */ 499 public void setDownloadAll(boolean downloadAll) { 500 this.downloadAll = downloadAll; 501 } 502 503 504 505 506 507}