001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.structure.io.mmcif; 022 023import java.io.BufferedReader; 024import java.io.File; 025import java.io.FileOutputStream; 026import java.io.FilenameFilter; 027import java.io.IOException; 028import java.io.InputStream; 029import java.io.InputStreamReader; 030import java.io.PrintWriter; 031import java.io.StringWriter; 032import java.net.URL; 033import java.net.URLConnection; 034import java.nio.file.Files; 035import java.nio.file.Paths; 036import java.nio.file.StandardCopyOption; 037import java.util.ArrayList; 038import java.util.List; 039import java.util.concurrent.atomic.AtomicBoolean; 040import java.util.zip.GZIPOutputStream; 041 042import org.biojava.nbio.core.util.InputStreamProvider; 043import org.biojava.nbio.structure.align.util.URLConnectionTools; 044import org.biojava.nbio.structure.align.util.UserConfiguration; 045import org.biojava.nbio.structure.io.mmcif.model.ChemComp; 046import org.slf4j.Logger; 047import org.slf4j.LoggerFactory; 048 049 050 051/** 052 * This provider of chemical components can download and cache chemical component definition files from the RCSB PDB web site. 053 * It is the default way to access these definitions. 054 * If this provider is called he first time, it will download and install all chemical 055 * component definitions in a local directory. 056 * Once the definition files have been installed, it has quick startup time and low memory requirements. 057 * 058 * An alternative provider, that keeps all definitions in memory is the {@link AllChemCompProvider}. Another provider, that 059 * does not require any network access, but only can support a limited set of chemical component definitions, is the {@link ReducedChemCompProvider}. 060 * 061 * 062 * @author Andreas Prlic 063 * 064 */ 065public class DownloadChemCompProvider implements ChemCompProvider { 066 067 private static final Logger logger = LoggerFactory.getLogger(DownloadChemCompProvider.class); 068 069 public static final String CHEM_COMP_CACHE_DIRECTORY = "chemcomp"; 070 071 public static final String DEFAULT_SERVER_URL = "http://files.rcsb.org/ligands/download/"; 072 073 public static String serverBaseUrl = DEFAULT_SERVER_URL; 074 075 /** 076 * Use default RCSB server layout (true) or internal RCSB server layout (false) 077 */ 078 public static boolean useDefaultUrlLayout = true; 079 080 081 private static File path; 082 //private static final String FILE_SEPARATOR = System.getProperty("file.separator"); 083 private static final String NEWLINE = System.getProperty("line.separator"); 084 085 086 // flags to make sure there is only one thread running that is loading the dictionary 087 static AtomicBoolean loading = new AtomicBoolean(false); 088 089 static final List<String> protectedIDs = new ArrayList<String> (); 090 static { 091 protectedIDs.add("CON"); 092 protectedIDs.add("PRN"); 093 protectedIDs.add("AUX"); 094 protectedIDs.add("NUL"); 095 } 096 097 /** by default we will download only some of the files. User has to request that all files should be downloaded... 098 * 099 */ 100 boolean downloadAll = false; 101 102 public DownloadChemCompProvider(){ 103 logger.debug("Initialising DownloadChemCompProvider"); 104 105 // note that path is static, so this is just to make sure that all non-static methods will have path initialised 106 initPath(); 107 } 108 109 public DownloadChemCompProvider(String cacheFilePath){ 110 logger.debug("Initialising DownloadChemCompProvider"); 111 112 // note that path is static, so this is just to make sure that all non-static methods will have path initialised 113 path = new File(cacheFilePath); 114 } 115 116 private static void initPath(){ 117 118 if (path==null) { 119 UserConfiguration config = new UserConfiguration(); 120 path = new File(config.getCacheFilePath()); 121 } 122 } 123 124 /** 125 * Checks if the chemical components already have been installed into the PDB directory. 126 * If not, will download the chemical components definitions file and split it up into small 127 * subfiles. 128 */ 129 public void checkDoFirstInstall(){ 130 131 if ( ! downloadAll ) { 132 return; 133 } 134 135 136 // this makes sure there is a file separator between every component, 137 // if path has a trailing file separator or not, it will work for both cases 138 File dir = new File(path, CHEM_COMP_CACHE_DIRECTORY); 139 File f = new File(dir, "components.cif.gz"); 140 141 if ( ! f.exists()) { 142 143 downloadAllDefinitions(); 144 145 } else { 146 // file exists.. did it get extracted? 147 148 FilenameFilter filter =new FilenameFilter() { 149 150 @Override 151 public boolean accept(File dir, String file) { 152 return file.endsWith(".cif.gz"); 153 } 154 }; 155 String[] files = dir.list(filter); 156 if ( files.length < 500) { 157 // not all did get unpacked 158 try { 159 split(); 160 } catch (IOException e) { 161 logger.error("Could not split file {} into individual chemical component files. Error: {}", 162 f.toString(), e.getMessage()); 163 } 164 } 165 } 166 } 167 168 private void split() throws IOException { 169 170 logger.info("Installing individual chem comp files ..."); 171 172 File dir = new File(path, CHEM_COMP_CACHE_DIRECTORY); 173 File f = new File(dir, "components.cif.gz"); 174 175 176 int counter = 0; 177 InputStreamProvider prov = new InputStreamProvider(); 178 179 try( BufferedReader buf = new BufferedReader (new InputStreamReader (prov.getInputStream(f))); 180 ) { 181 String line = null; 182 line = buf.readLine (); 183 StringWriter writer = new StringWriter(); 184 185 String currentID = null; 186 while (line != null){ 187 188 if ( line.startsWith("data_")) { 189 // a new record found! 190 191 if ( currentID != null) { 192 writeID(writer.toString(), currentID); 193 counter++; 194 } 195 196 currentID = line.substring(5); 197 writer = new StringWriter(); 198 } 199 200 writer.append(line); 201 writer.append(NEWLINE); 202 203 line = buf.readLine (); 204 } 205 206 // write the last record... 207 writeID(writer.toString(),currentID); 208 counter++; 209 210 } 211 212 logger.info("Created " + counter + " chemical component files."); 213 } 214 215 /** 216 * Output chemical contents to a file 217 * @param contents File contents 218 * @param currentID Chemical ID, used to determine the filename 219 * @throws IOException 220 */ 221 private void writeID(String contents, String currentID) throws IOException{ 222 223 String localName = DownloadChemCompProvider.getLocalFileName(currentID); 224 225 try ( PrintWriter pw = new PrintWriter(new GZIPOutputStream(new FileOutputStream(localName))) ) { 226 227 pw.print(contents); 228 pw.flush(); 229 } 230 } 231 232 /** 233 * Loads the definitions for this {@link ChemComp} from a local file and instantiates a new object. 234 * 235 * @param recordName the ID of the {@link ChemComp} 236 * @return a new {@link ChemComp} definition. 237 */ 238 @Override 239 public ChemComp getChemComp(String recordName) { 240 241 // make sure we work with upper case records 242 recordName = recordName.toUpperCase().trim(); 243 244 boolean haveFile = true; 245 if ( recordName.equals("?")){ 246 return null; 247 } 248 249 if ( ! fileExists(recordName)) { 250 // check if we should install all components 251 checkDoFirstInstall(); 252 } 253 if ( ! fileExists(recordName)) { 254 // we previously have installed already the definitions, 255 // just do an incrememntal update 256 haveFile = downloadChemCompRecord(recordName); 257 } 258 259 // Added check that download was successful and chemical component is available. 260 if (haveFile) { 261 String filename = getLocalFileName(recordName); 262 InputStream inStream = null; 263 try { 264 265 InputStreamProvider isp = new InputStreamProvider(); 266 267 inStream = isp.getInputStream(filename); 268 269 MMcifParser parser = new SimpleMMcifParser(); 270 271 ChemCompConsumer consumer = new ChemCompConsumer(); 272 273 // The Consumer builds up the BioJava - structure object. 274 // you could also hook in your own and build up you own data model. 275 parser.addMMcifConsumer(consumer); 276 277 parser.parse(new BufferedReader(new InputStreamReader(inStream))); 278 279 ChemicalComponentDictionary dict = consumer.getDictionary(); 280 281 ChemComp chemComp = dict.getChemComp(recordName); 282 283 return chemComp; 284 285 } catch (IOException e) { 286 287 logger.error("Could not parse chemical component file {}. Error: {}. " 288 + "There will be no chemical component info available for {}", filename, e.getMessage(), recordName); 289 290 } 291 finally{ 292 // Now close it 293 if(inStream!=null){ 294 try { 295 inStream.close(); 296 } catch (IOException e) { 297 // This would be weird... 298 logger.error("Could not close chemical component file {}. A resource leak could occur!!", filename); 299 } 300 } 301 302 } 303 } 304 305 // see https://github.com/biojava/biojava/issues/315 306 // probably a network error happened. Try to use the ReducedChemCOmpProvider 307 ReducedChemCompProvider reduced = new ReducedChemCompProvider(); 308 309 return reduced.getChemComp(recordName); 310 311 } 312 313 /** 314 * Returns the file name that contains the definition for this {@link ChemComp} 315 * 316 * @param recordName the ID of the {@link ChemComp} 317 * @return full path to the file 318 */ 319 public static String getLocalFileName(String recordName){ 320 321 if ( protectedIDs.contains(recordName)){ 322 recordName = "_" + recordName; 323 } 324 325 initPath(); 326 327 File f = new File(path, CHEM_COMP_CACHE_DIRECTORY); 328 if (! f.exists()){ 329 logger.info("Creating directory " + f); 330 331 boolean success = f.mkdir(); 332 // we've checked in initPath that path is writable, so there's no need to check if it succeeds 333 // in the unlikely case that in the meantime it isn't writable at least we log an error 334 if (!success) logger.error("Directory {} could not be created",f); 335 336 } 337 338 File theFile = new File(f,recordName + ".cif.gz"); 339 340 return theFile.toString(); 341 } 342 343 private static boolean fileExists(String recordName){ 344 345 String fileName = getLocalFileName(recordName); 346 347 File f = new File(fileName); 348 349 return f.exists(); 350 351 } 352 353 /** 354 * @param recordName : three-letter name 355 * @return true if successful download 356 */ 357 private static boolean downloadChemCompRecord(String recordName) { 358 359 String localName = getLocalFileName(recordName); 360 File newFile; 361 try{ 362 newFile = File.createTempFile("chemcomp"+recordName, "cif"); 363 logger.debug("Will write chem comp file to temp file {}", newFile.toString()); 364 } 365 catch(IOException e){ 366 logger.error("Could not write to temp directory {} to create the chemical component download temp file", System.getProperty("java.io.tmpdir")); 367 return false; 368 } 369 String u; 370 if(useDefaultUrlLayout){ 371 u = serverBaseUrl + recordName + ".cif"; 372 } 373 else{ 374 u = serverBaseUrl + recordName.charAt(0) + "/" + recordName +"/" + recordName + ".cif"; 375 } 376 377 logger.debug("downloading " + u); 378 379 URL url = null; 380 381 382 try { 383 url = new URL(u); 384 URLConnection uconn = URLConnectionTools.openURLConnection(url); 385 386 try( PrintWriter pw = new PrintWriter(new GZIPOutputStream(new FileOutputStream(newFile))); 387 BufferedReader fileBuffer = new BufferedReader(new InputStreamReader(uconn.getInputStream())); 388 ) { 389 390 String line; 391 392 while ((line = fileBuffer.readLine()) != null) { 393 pw.println(line); 394 } 395 396 pw.flush(); 397 } 398 // Now we move this across to where it actually wants to be 399 Files.move(newFile.toPath(), Paths.get(localName), StandardCopyOption.REPLACE_EXISTING); 400 401 return true; 402 } catch (IOException e){ 403 logger.error("Could not download "+url.toString()+" OR store locally to "+localName+" Error ="+e.getMessage()); 404 newFile.delete(); 405 } 406 return false; 407 } 408 409 private void downloadAllDefinitions() { 410 411 if ( loading.get()){ 412 logger.info("Waiting for other thread to install chemical components..."); 413 } 414 415 while ( loading.get() ) { 416 417 // another thread is already downloading the components definitions 418 // wait for the other thread to finish... 419 420 try { 421 // wait half a second 422 423 Thread.sleep(500); 424 } catch (InterruptedException e) { 425 //e.printStackTrace(); 426 logger.error("Thread interrupted "+e.getMessage()); 427 } 428 429 logger.info("Another thread installed the chemical components."); 430 return; 431 432 } 433 434 loading.set(true); 435 long timeS = System.currentTimeMillis(); 436 437 logger.info("Performing first installation of chemical components."); 438 logger.info("Downloading components.cif.gz ..."); 439 440 441 try { 442 AllChemCompProvider.downloadFile(); 443 } catch (IOException e){ 444 logger.error("Could not download the all chemical components file. Error: {}. " 445 + "Chemical components information won't be available", e.getMessage()); 446 // no point in trying to split if the file could not be downloaded 447 loading.set(false); 448 return; 449 } 450 try { 451 split(); 452 } catch (IOException e) { 453 logger.error("Could not split all chem comp file into individual chemical component files. Error: {}", 454 e.getMessage()); 455 // no point in reporting time 456 loading.set(false); 457 return; 458 } 459 long timeE = System.currentTimeMillis(); 460 logger.info("time to install chem comp dictionary: " + (timeE - timeS) / 1000 + " sec."); 461 loading.set(false); 462 463 } 464 465 /** By default this provider will download only some of the {@link ChemComp} files. 466 * The user has to request that all files should be downloaded by setting this parameter to true. 467 * 468 * @return flag if the all components should be downloaded and installed at startup. (default: false) 469 */ 470 public boolean isDownloadAll() { 471 return downloadAll; 472 } 473 474 /** By default this provider will download only some of the {@link ChemComp} files. 475 * The user has to request that all files should be downloaded by setting this parameter to true. 476 * 477 * @param flag if the all components should be downloaded and installed at startup. (default: false) 478 */ 479 public void setDownloadAll(boolean downloadAll) { 480 this.downloadAll = downloadAll; 481 } 482 483 484 485 486 487}