001package org.biojava.nbio.structure.chem; 002 003import org.biojava.nbio.structure.io.cif.ChemCompConverter; 004import org.slf4j.Logger; 005import org.slf4j.LoggerFactory; 006 007import java.io.BufferedOutputStream; 008import java.io.File; 009import java.io.FileOutputStream; 010import java.io.IOException; 011import java.nio.file.FileSystem; 012import java.nio.file.FileSystems; 013import java.nio.file.Files; 014import java.nio.file.Path; 015import java.nio.file.Paths; 016import java.nio.file.StandardCopyOption; 017import java.util.HashSet; 018import java.util.Set; 019import java.util.zip.ZipEntry; 020import java.util.zip.ZipOutputStream; 021 022/** 023 * This chemical component provider retrieves and caches chemical component definition files from a 024 * zip archive specified in its construction. If the archive does not contain the record, an attempt is 025 * made to download it using DownloadChemCompProvider. The downloaded file is then added to the archive. 026 * 027 * The class is thread-safe and the same ZipChemCompProvider should be used by all threads to prevent 028 * simultaneous read or write to the zip archive. A zip archive will be created if missing. 029 * 030 * @author edlunde 031 * @author larsonm 032 * @since 12/05/12 033 * updated 3/5/2016 for Java 7 ZipFileSystem 034 */ 035public class ZipChemCompProvider implements ChemCompProvider{ 036 private static final Logger s_logger = LoggerFactory.getLogger(ZipChemCompProvider.class); 037 038 private final Path m_tempDir; // Base path where $m_zipRootDir/ will be downloaded to. 039 private final Path m_zipRootDir; 040 private final Path m_zipFile; 041 private final DownloadChemCompProvider m_dlProvider; 042 043 private boolean m_removeCif; 044 045 // Missing IDs from library that cannot be download added here to prevent delays. 046 private Set<String> unavailable = new HashSet<>(); 047 048 /** 049 * ZipChemCompProvider is a Chemical Component provider that stores chemical components 050 * in a zip archive. Missing chemical components are downloaded and appended to the 051 * archive. If non-existent a new zip archive will be created. 052 * 053 * @param chemicalComponentDictionaryFile : path to zip archive for chemical components. 054 * @param tempDir : path for temporary directory, (null) defaults to path in property "java.io.tmpdir". 055 * @throws IOException 056 */ 057 public ZipChemCompProvider(String chemicalComponentDictionaryFile, String tempDir) throws IOException { 058 this.m_zipFile = Paths.get(chemicalComponentDictionaryFile); 059 060 // Use a default temporary directory if not passed a value. 061 if (tempDir == null || tempDir.equals("")) { 062 this.m_tempDir = Paths.get(System.getProperty("java.io.tmpdir")); 063 } else { 064 this.m_tempDir = Paths.get(tempDir); 065 } 066 067 this.m_zipRootDir = Paths.get("chemcomp"); 068 069 // Setup an instance of the download chemcomp provider. 070 this.m_dlProvider = new DownloadChemCompProvider(m_tempDir.toString()); 071 this.m_removeCif = true; 072 initializeZip(); 073 } 074 075 // See comments in addToZipFileSystem for why initialization is required with 076 // ZipFileSystems - due to URI issues in Java7. 077 private void initializeZip() throws IOException { 078 s_logger.info("Using chemical component dictionary: {}", m_zipFile.toString()); 079 final File f = m_zipFile.toFile(); 080 if (!f.exists()) { 081 s_logger.info("Creating missing zip archive: {}", m_zipFile.toString()); 082 FileOutputStream fo = new FileOutputStream(f); 083 try (ZipOutputStream zip = new ZipOutputStream(new BufferedOutputStream(fo))) { 084 zip.putNextEntry(new ZipEntry("chemcomp/")); 085 zip.closeEntry(); 086 } 087 } 088 } 089 090 /** 091 * Remove downloaded .cif.gz after adding to zip archive? 092 * Default is true. 093 * @param doRemove 094 */ 095 public void setRemoveCif(boolean doRemove) { 096 m_removeCif = doRemove; 097 } 098 099 /** 100 * (non-Javadoc) 101 * @see ChemCompProvider#getChemComp(java.lang.String) 102 * 103 * @param recordName : three letter PDB name for a residue 104 * @return ChemComp from .zip or ChemComp from repository. Will return empty ChemComp when unable to find a residue and will return null if not provided a valid recordName. 105 */ 106 @Override 107 public ChemComp getChemComp(String recordName) { 108 if (null == recordName) return null; 109 110 // handle non-existent ChemComp codes and do not repeatedly attempt to add these. 111 for (String str : unavailable) { 112 if (recordName.equals(str)) return getEmptyChemComp(recordName); 113 } 114 115 // Try to pull from zip, if fail then download. 116 ChemComp cc = getFromZip(recordName); 117 if (cc == null) { 118 s_logger.info("File {} not found in archive. Attempting download from PDB.", recordName); 119 cc = downloadAndAdd(recordName); 120 } 121 122 // If a null record or an empty chemcomp, return a default ChemComp and blacklist. 123 if (cc == null || (null == cc.getName() && cc.getAtoms().size() == 0)) { 124 s_logger.info("Unable to find or download {} - excluding from future searches.", recordName); 125 unavailable.add(recordName); 126 return getEmptyChemComp(recordName); 127 } 128 return cc; 129 } 130 131 /** Use DownloadChemCompProvider to grab a gzipped cif record from the PDB. 132 * Zip all downloaded cif.gz files into the dictionary. 133 * 134 * @param recordName is the three-letter chemical component code (i.e. residue name). 135 * @return ChemComp matching recordName 136 */ 137 private ChemComp downloadAndAdd(String recordName){ 138 final ChemComp cc = m_dlProvider.getChemComp(recordName); 139 140 // final File [] files = finder(m_tempDir.resolve("chemcomp").toString(), "cif.gz"); 141 final File [] files = new File[1]; 142 Path cif = m_tempDir.resolve("chemcomp").resolve(recordName + ".cif.gz"); 143 files[0] = cif.toFile(); 144 if (files[0] != null) { 145 addToZipFileSystem(m_zipFile, files, m_zipRootDir); 146 if (m_removeCif) for (File f : files) f.delete(); 147 } 148 return cc; 149 } 150 151 /** 152 * Cleanup chemical component (.cif.gz) files downloaded to tmpdir. 153 * @param tempdir : path to temporary directory for chemical components 154 */ 155 public static void purgeTempFiles(String tempdir) { 156 if (tempdir == null) return; 157 158 s_logger.info("Removing: "+tempdir); 159 Path dlPath = Paths.get(tempdir).resolve("chemcomp"); 160 File[] chemCompOutFiles = finder(dlPath.toString(), "cif.gz"); 161 if (null != chemCompOutFiles) for (File f : chemCompOutFiles) f.delete(); 162 dlPath.toFile().delete(); 163 } 164 165 /** 166 * Return an empty ChemComp group for a three-letter resName. 167 * @param resName 168 * @return 169 */ 170 private ChemComp getEmptyChemComp(String resName){ 171 String pdbName = ""; // Empty string is default 172 if (null != resName && resName.length() >= 3) { 173 pdbName = resName.substring(0,3); 174 } 175 final ChemComp comp = new ChemComp(); 176 comp.setOneLetterCode("?"); 177 comp.setThreeLetterCode(pdbName); 178 comp.setPolymerType(PolymerType.unknown); 179 comp.setResidueType(ResidueType.atomn); 180 return comp; 181 } 182 183 /** 184 * Return File(s) in dirName that match suffix. 185 * @param dirName 186 * @param suffix 187 * @return 188 */ 189 static private File[] finder(String dirName, final String suffix) { 190 if (null == dirName || null == suffix) { 191 return null; 192 } 193 194 final File dir = new File(dirName); 195 return dir.listFiles((dir1, filename) -> filename.endsWith(suffix)); 196 } 197 198 /** 199 * This is synchronized, along with addToFileSystem to prevent simulatenous reading/writing. 200 * @param recordName to find in zipfile. 201 * @return ChemComp if found or null if missing. 202 */ 203 private synchronized ChemComp getFromZip(String recordName) { 204 ChemComp cc = null; 205 if (!m_zipFile.toFile().exists()) return cc; 206 final String filename = "chemcomp/" + recordName + ".cif.gz"; 207 208 // try with resources block to read from the filesystem. 209 // Don't remove the (ClassLoader) cast! It is required for openjdk 11. 210 try (FileSystem fs = FileSystems.newFileSystem(m_zipFile, (ClassLoader)null)) { 211 Path cif = fs.getPath(filename); 212 213 if (Files.exists(cif)) { 214 s_logger.debug("reading {} from {}", recordName, m_zipFile); 215 final ChemicalComponentDictionary dict = ChemCompConverter.fromPath(cif); 216 cc = dict.getChemComp(recordName); 217 } 218 } catch (IOException e) { 219 s_logger.error("Unable to read from zip file : {}", e.getMessage()); 220 } 221 222 return cc; 223 } 224 225 /** 226 * Add an array of files to a zip archive. 227 * Synchronized to prevent simultaneous reading/writing. 228 * 229 * @param zipFile is a destination zip archive 230 * @param files is an array of files to be added 231 * @param pathWithinArchive is the path within the archive to add files to 232 * @return true if successfully appended these files. 233 */ 234 private synchronized boolean addToZipFileSystem(Path zipFile, File[] files, Path pathWithinArchive) { 235 boolean ret = false; 236 237 /* URIs in Java 7 cannot have spaces, must use Path instead 238 * and so, cannot use the properties map to describe need to create 239 * a new zip archive. ZipChemCompProvider.initilizeZip to creates the 240 * missing zip file */ 241 242 /* 243 // convert the filename to a URI 244 String uriString = "jar:file:" + zipFile.toUri().getPath(); 245 final URI uri = URI.create(uriString); 246 247 // if filesystem doesn't exist, create one. 248 final Map<String, String> env = new HashMap<>(); 249 // Create a new zip if one isn't present. 250 if (!zipFile.toFile().exists()) { 251 System.out.println("Need to create " + zipFile.toString()); 252 } 253 env.put("create", String.valueOf(!zipFile.toFile().exists())); 254 // Specify the encoding as UTF -8 255 env.put("encoding", "UTF-8"); 256 */ 257 258 // Copy in each file. 259 // Don't remove the (ClassLoader) cast! It is required for openjdk 11. 260 try (FileSystem zipfs = FileSystems.newFileSystem(zipFile, (ClassLoader)null)) { 261 Files.createDirectories(pathWithinArchive); 262 for (File f : files) { 263 if (!f.isDirectory() && f.exists()) { 264 Path externalFile = f.toPath(); 265 Path pathInZipFile = zipfs.getPath(pathWithinArchive.resolve(f.getName()).toString()); 266 Files.copy(externalFile, pathInZipFile, 267 StandardCopyOption.REPLACE_EXISTING); 268 } 269 } 270 ret = true; 271 } catch (IOException ex) { 272 s_logger.error("Unable to add entries to Chemical Component zip archive : {}", ex.getMessage()); 273 ret = false; 274 } 275 return ret; 276 } 277}