001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.structure.io.mmcif; 022 023import java.io.BufferedOutputStream; 024import java.io.File; 025import java.io.FileOutputStream; 026import java.io.FilenameFilter; 027import java.io.IOException; 028import java.io.InputStream; 029import java.nio.file.FileSystem; 030import java.nio.file.FileSystems; 031import java.nio.file.Files; 032import java.nio.file.Path; 033import java.nio.file.Paths; 034import java.nio.file.StandardCopyOption; 035import java.util.HashSet; 036import java.util.Set; 037import java.util.zip.GZIPInputStream; 038import java.util.zip.ZipEntry; 039import java.util.zip.ZipOutputStream; 040 041import org.biojava.nbio.structure.io.mmcif.chem.PolymerType; 042import org.biojava.nbio.structure.io.mmcif.chem.ResidueType; 043import org.biojava.nbio.structure.io.mmcif.model.ChemComp; 044import org.slf4j.Logger; 045import org.slf4j.LoggerFactory; 046 047/** This chemical component provider retrieves and caches chemical component definition files from a 048 * zip archive specified in its construction. If the archive does not contain the record, an attempt is 049 * made to download it using DownloadChemCompProvider. The downloaded file is then added to the archive. 050 * 051 * The class is thread-safe and the same ZipChemCompProvider should be used by all threads to prevent 052 * simultaneous read or write to the zip archive. A zip archive will be created if missing. 053 * 054 * @author edlunde 055 * @author larsonm 056 * @since 12/05/12 057 * updated 3/5/2016 for Java 7 ZipFileSystem 058 */ 059public class ZipChemCompProvider implements ChemCompProvider{ 060 private static final Logger s_logger = LoggerFactory.getLogger(ZipChemCompProvider.class); 061 062 private final Path m_tempDir; // Base path where $m_zipRootDir/ will be downloaded to. 063 private final Path m_zipRootDir; 064 private final Path m_zipFile; 065 private final DownloadChemCompProvider m_dlProvider; 066 067 private boolean m_removeCif; 068 069 // Missing IDs from library that cannot be download added here to prevent delays. 070 private Set<String> unavailable = new HashSet<String>(); 071 072 /** 073 * ZipChemCompProvider is a Chemical Component provider that stores chemical components 074 * in a zip archive. Missing chemical components are downloaded and appended to the 075 * archive. If non-existent a new zip archive will be created. 076 * 077 * @param chemicalComponentDictionaryFile : path to zip archive for chemical components. 078 * @param tempDir : path for temporary directory, (null) defaults to path in property "java.io.tmpdir". 079 * @throws IOException 080 */ 081 public ZipChemCompProvider(String chemicalComponentDictionaryFile, String tempDir) throws IOException { 082 this.m_zipFile = Paths.get(chemicalComponentDictionaryFile); 083 084 // Use a default temporary directory if not passed a value. 085 if (tempDir == null || tempDir.equals("")) { 086 this.m_tempDir = Paths.get(System.getProperty("java.io.tmpdir")); 087 } else { 088 this.m_tempDir = Paths.get(tempDir); 089 } 090 091 this.m_zipRootDir = Paths.get("chemcomp"); 092 093 // Setup an instance of the download chemcomp provider. 094 this.m_dlProvider = new DownloadChemCompProvider(m_tempDir.toString()); 095 this.m_removeCif = true; 096 initializeZip(); 097 } 098 099 // See comments in addToZipFileSystem for why initialization is required with 100 // ZipFileSystems - due to URI issues in Java7. 101 private void initializeZip() throws IOException { 102 s_logger.info("Using chemical component dictionary: " + m_zipFile.toString()); 103 final File f = m_zipFile.toFile(); 104 if (!f.exists()) { 105 s_logger.info("Creating missing zip archive: " + m_zipFile.toString()); 106 FileOutputStream fo = new FileOutputStream(f); 107 ZipOutputStream zip = new ZipOutputStream(new BufferedOutputStream(fo)); 108 try { 109 zip.putNextEntry(new ZipEntry("chemcomp/")); 110 zip.closeEntry(); 111 } finally { 112 zip.close(); 113 } 114 } 115 } 116 117 /** 118 * Remove downloaded .cif.gz after adding to zip archive? 119 * Default is true. 120 * @param doRemove 121 */ 122 public void setRemoveCif(boolean doRemove) { 123 m_removeCif = doRemove; 124 } 125 126 /* (non-Javadoc) 127 * @see org.biojava.nbio.structure.io.mmcif.ChemCompProvider#getChemComp(java.lang.String) 128 * 129 * @param recordName : three letter PDB name for a residue 130 * @return ChemComp from .zip or ChemComp from repository. Will return empty ChemComp when unable to find a residue and will return null if not provided a valid recordName. 131 */ 132 @Override 133 public ChemComp getChemComp(String recordName) { 134 if (null == recordName) return null; 135 136 // handle non-existent ChemComp codes and do not repeatedly attempt to add these. 137 for (String str : unavailable) { 138 if (recordName.equals(str)) return getEmptyChemComp(recordName); 139 } 140 141 // Try to pull from zip, if fail then download. 142 ChemComp cc = getFromZip(recordName); 143 if (cc == null) { 144 s_logger.info("File "+recordName+" not found in archive. Attempting download from PDB."); 145 cc = downloadAndAdd(recordName); 146 } 147 148 // If a null record or an empty chemcomp, return a default ChemComp and blacklist. 149 if (cc == null || (null == cc.getName() && cc.getAtoms().size() == 0)) { 150 s_logger.info("Unable to find or download " + recordName + " - excluding from future searches."); 151 unavailable.add(recordName); 152 return getEmptyChemComp(recordName); 153 } 154 return cc; 155 } 156 157 /** Use DownloadChemCompProvider to grab a gzipped cif record from the PDB. 158 * Zip all downloaded cif.gz files into the dictionary. 159 * 160 * @param recordName is the three-letter chemical component code (i.e. residue name). 161 * @return ChemComp matching recordName 162 */ 163 private ChemComp downloadAndAdd(String recordName){ 164 final ChemComp cc = m_dlProvider.getChemComp(recordName); 165 166 // final File [] files = finder(m_tempDir.resolve("chemcomp").toString(), "cif.gz"); 167 final File [] files = new File[1]; 168 Path cif = m_tempDir.resolve("chemcomp").resolve(recordName + ".cif.gz"); 169 files[0] = cif.toFile(); 170 if (files[0] != null) { 171 addToZipFileSystem(m_zipFile, files, m_zipRootDir); 172 if (m_removeCif) for (File f : files) f.delete(); 173 } 174 return cc; 175 } 176 177 /** 178 * Cleanup chemical component (.cif.gz) files downloaded to tmpdir. 179 * @param tempdir : path to temporary directory for chemical components 180 */ 181 public static void purgeTempFiles(String tempdir) { 182 if (tempdir == null) return; 183 184 s_logger.info("Removing: "+tempdir); 185 Path dlPath = Paths.get(tempdir).resolve("chemcomp"); 186 File[] chemCompOutFiles = finder(dlPath.toString(), "cif.gz"); 187 if (null != chemCompOutFiles) for (File f : chemCompOutFiles) f.delete(); 188 dlPath.toFile().delete(); 189 } 190 191 /** 192 * Return an empty ChemComp group for a three-letter resName. 193 * @param resName 194 * @return 195 */ 196 private ChemComp getEmptyChemComp(String resName){ 197 String pdbName = ""; // Empty string is default 198 if (null != resName && resName.length() >= 3) { 199 pdbName = resName.substring(0,3); 200 } 201 final ChemComp comp = new ChemComp(); 202 comp.setOne_letter_code("?"); 203 comp.setThree_letter_code(pdbName); 204 comp.setPolymerType(PolymerType.unknown); 205 comp.setResidueType(ResidueType.atomn); 206 return comp; 207 } 208 209 /** 210 * Return File(s) in dirName that match suffix. 211 * @param dirName 212 * @param suffix 213 * @return 214 */ 215 static private File[] finder( String dirName, final String suffix){ 216 if (null == dirName || null == suffix) { 217 return null; 218 } 219 220 final File dir = new File(dirName); 221 return dir.listFiles(new FilenameFilter() { 222 @Override 223 public boolean accept(File dir, String filename) 224 { return filename.endsWith(suffix); } 225 } ); 226 } 227 228 /** 229 * This is synchronized, along with addToFileSystem to prevent simulatenous reading/writing. 230 * @param recordName to find in zipfile. 231 * @return ChemComp if found or null if missing. 232 */ 233 private synchronized ChemComp getFromZip(String recordName) { 234 ChemComp cc = null; 235 if (!m_zipFile.toFile().exists()) return cc; 236 final String filename = "chemcomp/" + recordName+".cif.gz"; 237 238 // try with resources block to read from the filesystem. 239 try (FileSystem fs = FileSystems.newFileSystem(m_zipFile, null)) { 240 Path cif = fs.getPath(filename); 241 242 if (Files.exists(cif)) { 243 final InputStream zipStream = Files.newInputStream(cif); 244 final InputStream inputStream = new GZIPInputStream(zipStream); 245 s_logger.debug("reading " + recordName + " from " + m_zipFile); 246 final MMcifParser parser = new SimpleMMcifParser(); 247 final ChemCompConsumer consumer = new ChemCompConsumer(); 248 parser.addMMcifConsumer(consumer); 249 parser.parse(inputStream); 250 inputStream.close(); 251 252 final ChemicalComponentDictionary dict = consumer.getDictionary(); 253 cc = dict.getChemComp(recordName); 254 } 255 } catch (IOException e) { 256 s_logger.error("Unable to read from zip file : " + e.getMessage()); 257 } 258 259 return cc; 260 } 261 262 /** 263 * Add an array of files to a zip archive. 264 * Synchronized to prevent simultaneous reading/writing. 265 * 266 * @param zipFile is a destination zip archive 267 * @param files is an array of files to be added 268 * @param pathWithinArchive is the path within the archive to add files to 269 * @return true if successfully appended these files. 270 */ 271 private synchronized boolean addToZipFileSystem(Path zipFile, File[] files, Path pathWithinArchive) { 272 boolean ret = false; 273 274 /* URIs in Java 7 cannot have spaces, must use Path instead 275 * and so, cannot use the properties map to describe need to create 276 * a new zip archive. ZipChemCompProvider.initilizeZip to creates the 277 * missing zip file */ 278 279 /* 280 // convert the filename to a URI 281 String uriString = "jar:file:" + zipFile.toUri().getPath(); 282 final URI uri = URI.create(uriString); 283 284 // if filesystem doesn't exist, create one. 285 final Map<String, String> env = new HashMap<>(); 286 // Create a new zip if one isn't present. 287 if (!zipFile.toFile().exists()) { 288 System.out.println("Need to create " + zipFile.toString()); 289 } 290 env.put("create", String.valueOf(!zipFile.toFile().exists())); 291 // Specify the encoding as UTF -8 292 env.put("encoding", "UTF-8"); 293 */ 294 295 // Copy in each file. 296 try (FileSystem zipfs = FileSystems.newFileSystem(zipFile, null)) { 297 Files.createDirectories(pathWithinArchive); 298 for (File f : files) { 299 if (!f.isDirectory() && f.exists()) { 300 Path externalFile = f.toPath(); 301 Path pathInZipFile = zipfs.getPath(pathWithinArchive.resolve(f.getName()).toString()); 302 Files.copy(externalFile, pathInZipFile, 303 StandardCopyOption.REPLACE_EXISTING); 304 } 305 } 306 ret = true; 307 } catch (IOException ex) { 308 s_logger.error("Unable to add entries to Chemical Component zip archive : " + ex.getMessage()); 309 ret = false; 310 } 311 return ret; 312 } 313}