001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.structure.io.mmcif; 022 023import java.io.BufferedReader; 024import java.io.File; 025import java.io.FileOutputStream; 026import java.io.FilenameFilter; 027import java.io.IOException; 028import java.io.InputStream; 029import java.io.InputStreamReader; 030import java.io.PrintWriter; 031import java.io.StringWriter; 032import java.net.HttpURLConnection; 033import java.net.URL; 034import java.nio.file.Files; 035import java.nio.file.Paths; 036import java.nio.file.StandardCopyOption; 037import java.util.ArrayList; 038import java.util.List; 039import java.util.concurrent.atomic.AtomicBoolean; 040import java.util.zip.GZIPOutputStream; 041 042import org.biojava.nbio.core.util.InputStreamProvider; 043import org.biojava.nbio.structure.align.util.HTTPConnectionTools; 044import org.biojava.nbio.structure.align.util.UserConfiguration; 045import org.biojava.nbio.structure.io.mmcif.model.ChemComp; 046import org.slf4j.Logger; 047import org.slf4j.LoggerFactory; 048 049 050 051/** This provider of chemical components can download and cache chemical component definition files from the RCSB PDB web site. 052 * It is the default way to access these definitions. 053 * If this provider is called he first time, it will download and install all chemical 054 * component definitions in a local directory. 055 * Once the definition files have been installed, it has quick startup time and low memory requirements. 056 * 057 * An alternative provider, that keeps all definitions in memory is the {@link AllChemCompProvider}. Another provider, that 058 * does not require any network access, but only can support a limited set of chemical component definitions, is the {@link ReducedChemCompProvider}. 059 * 060 * 061 * @author Andreas Prlic 062 * 063 */ 064public class DownloadChemCompProvider implements ChemCompProvider { 065 066 private static final Logger logger = LoggerFactory.getLogger(DownloadChemCompProvider.class); 067 068 public static final String CHEM_COMP_CACHE_DIRECTORY = "chemcomp"; 069 070 public static final String SERVER_LOCATION = "http://files.rcsb.org/ligands/download/"; 071 072 073 private static File path; 074 //private static final String FILE_SEPARATOR = System.getProperty("file.separator"); 075 private static final String NEWLINE = System.getProperty("line.separator"); 076 077 078 // flags to make sure there is only one thread running that is loading the dictionary 079 static AtomicBoolean loading = new AtomicBoolean(false); 080 081 static final List<String> protectedIDs = new ArrayList<String> (); 082 static { 083 protectedIDs.add("CON"); 084 protectedIDs.add("PRN"); 085 protectedIDs.add("AUX"); 086 protectedIDs.add("NUL"); 087 } 088 089 /** by default we will download only some of the files. User has to request that all files should be downloaded... 090 * 091 */ 092 boolean downloadAll = false; 093 094 public DownloadChemCompProvider(){ 095 logger.debug("Initialising DownloadChemCompProvider"); 096 097 // note that path is static, so this is just to make sure that all non-static methods will have path initialised 098 initPath(); 099 } 100 101 public DownloadChemCompProvider(String cacheFilePath){ 102 logger.debug("Initialising DownloadChemCompProvider"); 103 104 // note that path is static, so this is just to make sure that all non-static methods will have path initialised 105 path = new File(cacheFilePath); 106 } 107 108 private static void initPath(){ 109 110 if (path==null) { 111 UserConfiguration config = new UserConfiguration(); 112 path = new File(config.getCacheFilePath()); 113 } 114 } 115 116 /** 117 * Checks if the chemical components already have been installed into the PDB directory. 118 * If not, will download the chemical components definitions file and split it up into small 119 * subfiles. 120 */ 121 public void checkDoFirstInstall(){ 122 123 if ( ! downloadAll ) { 124 return; 125 } 126 127 128 // this makes sure there is a file separator between every component, 129 // if path has a trailing file separator or not, it will work for both cases 130 File dir = new File(path, CHEM_COMP_CACHE_DIRECTORY); 131 File f = new File(dir, "components.cif.gz"); 132 133 if ( ! f.exists()) { 134 135 downloadAllDefinitions(); 136 137 } else { 138 // file exists.. did it get extracted? 139 140 FilenameFilter filter =new FilenameFilter() { 141 142 @Override 143 public boolean accept(File dir, String file) { 144 return file.endsWith(".cif.gz"); 145 } 146 }; 147 String[] files = dir.list(filter); 148 if ( files.length < 500) { 149 // not all did get unpacked 150 try { 151 split(); 152 } catch (IOException e) { 153 logger.error("Could not split file {} into individual chemical component files. Error: {}", 154 f.toString(), e.getMessage()); 155 } 156 } 157 } 158 } 159 160 private void split() throws IOException { 161 162 logger.info("Installing individual chem comp files ..."); 163 164 File dir = new File(path, CHEM_COMP_CACHE_DIRECTORY); 165 File f = new File(dir, "components.cif.gz"); 166 167 168 int counter = 0; 169 InputStreamProvider prov = new InputStreamProvider(); 170 171 try( BufferedReader buf = new BufferedReader (new InputStreamReader (prov.getInputStream(f))); 172 ) { 173 String line = null; 174 line = buf.readLine (); 175 StringWriter writer = new StringWriter(); 176 177 String currentID = null; 178 while (line != null){ 179 180 if ( line.startsWith("data_")) { 181 // a new record found! 182 183 if ( currentID != null) { 184 writeID(writer.toString(), currentID); 185 counter++; 186 } 187 188 currentID = line.substring(5); 189 writer = new StringWriter(); 190 } 191 192 writer.append(line); 193 writer.append(NEWLINE); 194 195 line = buf.readLine (); 196 } 197 198 // write the last record... 199 writeID(writer.toString(),currentID); 200 counter++; 201 202 } 203 204 logger.info("Created " + counter + " chemical component files."); 205 } 206 207 /** 208 * Output chemical contents to a file 209 * @param contents File contents 210 * @param currentID Chemical ID, used to determine the filename 211 * @throws IOException 212 */ 213 private void writeID(String contents, String currentID) throws IOException{ 214 215 String localName = DownloadChemCompProvider.getLocalFileName(currentID); 216 217 try ( PrintWriter pw = new PrintWriter(new GZIPOutputStream(new FileOutputStream(localName))) ) { 218 219 pw.print(contents.toString()); 220 pw.flush(); 221 } 222 } 223 224 /** 225 * Loads the definitions for this {@link ChemComp} from a local file and instantiates a new object. 226 * 227 * @param recordName the ID of the {@link ChemComp} 228 * @return a new {@link ChemComp} definition. 229 */ 230 @Override 231 public ChemComp getChemComp(String recordName) { 232 233 // make sure we work with upper case records 234 recordName = recordName.toUpperCase().trim(); 235 236 boolean haveFile = true; 237 if ( recordName.equals("?")){ 238 return null; 239 } 240 241 if ( ! fileExists(recordName)) { 242 // check if we should install all components 243 checkDoFirstInstall(); 244 } 245 if ( ! fileExists(recordName)) { 246 // we previously have installed already the definitions, 247 // just do an incrememntal update 248 haveFile = downloadChemCompRecord(recordName); 249 } 250 251 // Added check that download was successful and chemical component is available. 252 if (haveFile) { 253 String filename = getLocalFileName(recordName); 254 InputStream inStream = null; 255 try { 256 257 InputStreamProvider isp = new InputStreamProvider(); 258 259 inStream = isp.getInputStream(filename); 260 261 MMcifParser parser = new SimpleMMcifParser(); 262 263 ChemCompConsumer consumer = new ChemCompConsumer(); 264 265 // The Consumer builds up the BioJava - structure object. 266 // you could also hook in your own and build up you own data model. 267 parser.addMMcifConsumer(consumer); 268 269 parser.parse(new BufferedReader(new InputStreamReader(inStream))); 270 271 ChemicalComponentDictionary dict = consumer.getDictionary(); 272 273 ChemComp chemComp = dict.getChemComp(recordName); 274 275 return chemComp; 276 277 } catch (IOException e) { 278 279 logger.error("Could not parse chemical component file {}. Error: {}. " 280 + "There will be no chemical component info available for {}", filename, e.getMessage(), recordName); 281 282 } 283 finally{ 284 // Now close it 285 if(inStream!=null){ 286 try { 287 inStream.close(); 288 } catch (IOException e) { 289 // This would be weird... 290 logger.error("Could not close chemical component file {}. A resource leak could occur!!", filename); 291 } 292 } 293 294 } 295 } 296 297 // see https://github.com/biojava/biojava/issues/315 298 // probably a network error happened. Try to use the ReducedChemCOmpProvider 299 ReducedChemCompProvider reduced = new ReducedChemCompProvider(); 300 301 return reduced.getChemComp(recordName); 302 303 } 304 305 /** Returns the file name that contains the definition for this {@link ChemComp} 306 * 307 * @param recordName the ID of the {@link ChemComp} 308 * @return full path to the file 309 */ 310 public static String getLocalFileName(String recordName){ 311 312 if ( protectedIDs.contains(recordName)){ 313 recordName = "_" + recordName; 314 } 315 316 initPath(); 317 318 File f = new File(path, CHEM_COMP_CACHE_DIRECTORY); 319 if (! f.exists()){ 320 logger.info("Creating directory " + f); 321 322 boolean success = f.mkdir(); 323 // we've checked in initPath that path is writable, so there's no need to check if it succeeds 324 // in the unlikely case that in the meantime it isn't writable at least we log an error 325 if (!success) logger.error("Directory {} could not be created",f); 326 327 } 328 329 File theFile = new File(f,recordName + ".cif.gz"); 330 331 return theFile.toString(); 332 } 333 334 private static boolean fileExists(String recordName){ 335 336 String fileName = getLocalFileName(recordName); 337 338 File f = new File(fileName); 339 340 return f.exists(); 341 342 } 343 344 /** 345 * @param recordName : three-letter name 346 * @return true if successful download 347 */ 348 private static boolean downloadChemCompRecord(String recordName) { 349 350 String localName = getLocalFileName(recordName); 351 File newFile; 352 try{ 353 newFile = File.createTempFile("chemcomp"+recordName, "cif"); 354 } 355 catch(IOException e){ 356 logger.error("Could not write to temp directory {} to create the chemical component download temp file", System.getProperty("java.io.tmpdir")); 357 return false; 358 } 359 String u = SERVER_LOCATION + recordName + ".cif"; 360 361 logger.debug("downloading " + u); 362 363 URL url = null; 364 365 366 try { 367 url = new URL(u); 368 369 HttpURLConnection uconn = HTTPConnectionTools.openHttpURLConnection(url); 370 371 try( PrintWriter pw = new PrintWriter(new GZIPOutputStream(new FileOutputStream(newFile))); 372 BufferedReader fileBuffer = new BufferedReader(new InputStreamReader(uconn.getInputStream())); 373 ) { 374 375 String line; 376 377 while ((line = fileBuffer.readLine()) != null) { 378 pw.println(line); 379 } 380 381 pw.flush(); 382 } 383 // Now we move this across to where it actually wants to be 384 Files.move(newFile.toPath(), Paths.get(localName), StandardCopyOption.REPLACE_EXISTING); 385 386 return true; 387 } catch (IOException e){ 388 logger.error("Could not download "+url.toString()+" OR store locally to "+localName+" Error ="+e.getMessage()); 389 newFile.delete(); 390 } 391 return false; 392 } 393 394 private void downloadAllDefinitions() { 395 396 if ( loading.get()){ 397 logger.info("Waiting for other thread to install chemical components..."); 398 } 399 400 while ( loading.get() ) { 401 402 // another thread is already downloading the components definitions 403 // wait for the other thread to finish... 404 405 try { 406 // wait half a second 407 408 Thread.sleep(500); 409 } catch (InterruptedException e) { 410 //e.printStackTrace(); 411 logger.error("Thread interrupted "+e.getMessage()); 412 } 413 414 logger.info("Another thread installed the chemical components."); 415 return; 416 417 } 418 419 loading.set(true); 420 long timeS = System.currentTimeMillis(); 421 422 logger.info("Performing first installation of chemical components."); 423 logger.info("Downloading components.cif.gz ..."); 424 425 426 try { 427 AllChemCompProvider.downloadFile(); 428 } catch (IOException e){ 429 logger.error("Could not download the all chemical components file. Error: {}. " 430 + "Chemical components information won't be available", e.getMessage()); 431 // no point in trying to split if the file could not be downloaded 432 loading.set(false); 433 return; 434 } 435 try { 436 split(); 437 } catch (IOException e) { 438 logger.error("Could not split all chem comp file into individual chemical component files. Error: {}", 439 e.getMessage()); 440 // no point in reporting time 441 loading.set(false); 442 return; 443 } 444 long timeE = System.currentTimeMillis(); 445 logger.info("time to install chem comp dictionary: " + (timeE - timeS) / 1000 + " sec."); 446 loading.set(false); 447 448 } 449 450 /** By default this provider will download only some of the {@link ChemComp} files. 451 * The user has to request that all files should be downloaded by setting this parameter to true. 452 * 453 * @return flag if the all components should be downloaded and installed at startup. (default: false) 454 */ 455 public boolean isDownloadAll() { 456 return downloadAll; 457 } 458 459 /** By default this provider will download only some of the {@link ChemComp} files. 460 * The user has to request that all files should be downloaded by setting this parameter to true. 461 * 462 * @param flag if the all components should be downloaded and installed at startup. (default: false) 463 */ 464 public void setDownloadAll(boolean downloadAll) { 465 this.downloadAll = downloadAll; 466 } 467 468 469 470 471 472}