001package org.biojava.nbio.structure.chem;
002
import org.biojava.nbio.structure.io.cif.ChemCompConverter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.FileSystem;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
021
022/**
023 * This chemical component provider retrieves and caches chemical component definition files from a
024 * zip archive specified in its construction.  If the archive does not contain the record, an attempt is
025 * made to download it using DownloadChemCompProvider. The downloaded file is then added to the archive.
026 *
027 * The class is thread-safe and the same ZipChemCompProvider should be used by all threads to prevent
028 * simultaneous read or write to the zip archive.  A zip archive will be created if missing.
029 *
030 * @author edlunde
031 * @author larsonm
032 * @since 12/05/12
033 * updated 3/5/2016 for Java 7 ZipFileSystem
034 */
035public class ZipChemCompProvider implements ChemCompProvider{
036    private static final Logger s_logger = LoggerFactory.getLogger(ZipChemCompProvider.class);
037
038    private final Path m_tempDir;  // Base path where $m_zipRootDir/ will be downloaded to.
039    private final Path m_zipRootDir;
040    private final Path m_zipFile;
041    private final DownloadChemCompProvider m_dlProvider;
042
043    private boolean m_removeCif;
044
045    // Missing IDs from library that cannot be download added here to prevent delays.
046    private Set<String> unavailable = new HashSet<>();
047
048    /**
049     * ZipChemCompProvider is a Chemical Component provider that stores chemical components
050     * in a zip archive.  Missing chemical components are downloaded and appended to the
051     * archive.  If non-existent a new zip archive will be created.
052     *
053     * @param chemicalComponentDictionaryFile : path to zip archive for chemical components.
054     * @param tempDir : path for temporary directory, (null) defaults to path in property "java.io.tmpdir".
055     * @throws IOException
056     */
057    public ZipChemCompProvider(String chemicalComponentDictionaryFile, String tempDir) throws IOException {
058        this.m_zipFile = Paths.get(chemicalComponentDictionaryFile);
059
060        // Use a default temporary directory if not passed a value.
061        if (tempDir == null || tempDir.equals("")) {
062            this.m_tempDir = Paths.get(System.getProperty("java.io.tmpdir"));
063        } else {
064            this.m_tempDir = Paths.get(tempDir);
065        }
066
067        this.m_zipRootDir = Paths.get("chemcomp");
068
069        // Setup an instance of the download chemcomp provider.
070        this.m_dlProvider = new DownloadChemCompProvider(m_tempDir.toString());
071        this.m_removeCif = true;
072        initializeZip();
073    }
074
075    // See comments in addToZipFileSystem for why initialization is required with
076    // ZipFileSystems - due to URI issues in Java7.
077    private void initializeZip() throws IOException {
078        s_logger.info("Using chemical component dictionary: {}", m_zipFile.toString());
079        final File f = m_zipFile.toFile();
080        if (!f.exists()) {
081            s_logger.info("Creating missing zip archive: {}", m_zipFile.toString());
082            FileOutputStream fo = new FileOutputStream(f);
083            try (ZipOutputStream zip = new ZipOutputStream(new BufferedOutputStream(fo))) {
084                zip.putNextEntry(new ZipEntry("chemcomp/"));
085                zip.closeEntry();
086            }
087        }
088    }
089
090    /**
091     * Remove downloaded .cif.gz after adding to zip archive?
092     * Default is true.
093     * @param doRemove
094     */
095    public void setRemoveCif(boolean doRemove) {
096        m_removeCif = doRemove;
097    }
098
099    /**
100     * (non-Javadoc)
101     * @see ChemCompProvider#getChemComp(java.lang.String)
102     *
103     * @param recordName : three letter PDB name for a residue
104     * @return ChemComp from .zip or ChemComp from repository.  Will return empty ChemComp when unable to find a residue and will return null if not provided a valid recordName.
105     */
106    @Override
107    public ChemComp getChemComp(String recordName) {
108        if (null == recordName) return null;
109
110        // handle non-existent ChemComp codes and do not repeatedly attempt to add these.
111        for (String str : unavailable) {
112            if (recordName.equals(str)) return getEmptyChemComp(recordName);
113        }
114
115        // Try to pull from zip, if fail then download.
116        ChemComp cc = getFromZip(recordName);
117        if (cc == null) {
118            s_logger.info("File {} not found in archive. Attempting download from PDB.", recordName);
119            cc = downloadAndAdd(recordName);
120        }
121
122        // If a null record or an empty chemcomp, return a default ChemComp and blacklist.
123        if (cc == null || (null == cc.getName() && cc.getAtoms().size() == 0)) {
124            s_logger.info("Unable to find or download {} - excluding from future searches.", recordName);
125            unavailable.add(recordName);
126            return getEmptyChemComp(recordName);
127        }
128        return cc;
129    }
130
131    /** Use DownloadChemCompProvider to grab a gzipped cif record from the PDB.
132     *  Zip all downloaded cif.gz files into the dictionary.
133     *
134     * @param recordName is the three-letter chemical component code (i.e. residue name).
135     * @return ChemComp matching recordName
136     */
137    private ChemComp downloadAndAdd(String recordName){
138        final ChemComp cc = m_dlProvider.getChemComp(recordName);
139
140        // final File [] files = finder(m_tempDir.resolve("chemcomp").toString(), "cif.gz");
141        final File [] files = new File[1];
142        Path cif = m_tempDir.resolve("chemcomp").resolve(recordName + ".cif.gz");
143        files[0] = cif.toFile();
144        if (files[0] != null) {
145            addToZipFileSystem(m_zipFile, files, m_zipRootDir);
146            if (m_removeCif) for (File f : files) f.delete();
147        }
148        return cc;
149    }
150
151    /**
152     * Cleanup chemical component (.cif.gz) files downloaded to tmpdir.
153     * @param tempdir : path to temporary directory for chemical components
154     */
155    public static void purgeTempFiles(String tempdir) {
156        if (tempdir == null) return;
157
158        s_logger.info("Removing: "+tempdir);
159        Path dlPath = Paths.get(tempdir).resolve("chemcomp");
160        File[] chemCompOutFiles = finder(dlPath.toString(), "cif.gz");
161        if (null != chemCompOutFiles) for (File f : chemCompOutFiles) f.delete();
162        dlPath.toFile().delete();
163    }
164
165    /**
166     * Return an empty ChemComp group for a three-letter resName.
167     * @param resName
168     * @return
169     */
170    private ChemComp getEmptyChemComp(String resName){
171        String pdbName = ""; // Empty string is default
172        if (null != resName && resName.length() >= 3) {
173            pdbName = resName.substring(0,3);
174        }
175        final ChemComp comp = new ChemComp();
176        comp.setOneLetterCode("?");
177        comp.setThreeLetterCode(pdbName);
178        comp.setPolymerType(PolymerType.unknown);
179        comp.setResidueType(ResidueType.atomn);
180        return comp;
181    }
182
183    /**
184     * Return File(s) in dirName that match suffix.
185     * @param dirName
186     * @param suffix
187     * @return
188     */
189    static private File[] finder(String dirName, final String suffix) {
190        if (null == dirName || null == suffix) {
191            return null;
192        }
193
194        final File dir = new File(dirName);
195        return dir.listFiles((dir1, filename) -> filename.endsWith(suffix));
196    }
197
198    /**
199     * This is synchronized, along with addToFileSystem to prevent simulatenous reading/writing.
200     * @param recordName to find in zipfile.
201     * @return ChemComp if found or null if missing.
202     */
203    private synchronized ChemComp getFromZip(String recordName) {
204        ChemComp cc = null;
205        if (!m_zipFile.toFile().exists()) return cc;
206        final String filename = "chemcomp/" + recordName + ".cif.gz";
207
208        // try with resources block to read from the filesystem.
209        // Don't remove the (ClassLoader) cast! It is required for openjdk 11.
210        try (FileSystem fs = FileSystems.newFileSystem(m_zipFile, (ClassLoader)null)) {
211            Path cif = fs.getPath(filename);
212
213            if (Files.exists(cif)) {
214                s_logger.debug("reading {} from {}", recordName, m_zipFile);
215                final ChemicalComponentDictionary dict = ChemCompConverter.fromPath(cif);
216                cc = dict.getChemComp(recordName);
217            }
218        } catch (IOException e) {
219            s_logger.error("Unable to read from zip file : {}", e.getMessage());
220        }
221
222        return cc;
223    }
224
225    /**
226     * Add an array of files to a zip archive.
227     * Synchronized to prevent simultaneous reading/writing.
228     *
229     * @param zipFile is a destination zip archive
230     * @param files is an array of files to be added
231     * @param pathWithinArchive is the path within the archive to add files to
232     * @return true if successfully appended these files.
233     */
234    private synchronized boolean addToZipFileSystem(Path zipFile, File[] files, Path pathWithinArchive) {
235        boolean ret = false;
236
237        /* URIs in Java 7 cannot have spaces, must use Path instead
238         * and so, cannot use the properties map to describe need to create
239         * a new zip archive.  ZipChemCompProvider.initilizeZip to creates the
240         * missing zip file */
241
242                /*
243                // convert the filename to a URI
244                String uriString = "jar:file:" + zipFile.toUri().getPath();
245                final URI uri = URI.create(uriString);
246
247                // if filesystem doesn't exist, create one.
248                final Map<String, String> env = new HashMap<>();
249                // Create a new zip if one isn't present.
250                if (!zipFile.toFile().exists()) {
251                        System.out.println("Need to create " + zipFile.toString());
252                }
253                env.put("create", String.valueOf(!zipFile.toFile().exists()));
254                // Specify the encoding as UTF -8
255                env.put("encoding", "UTF-8");
256                */
257
258        // Copy in each file.
259        // Don't remove the (ClassLoader) cast! It is required for openjdk 11.
260        try (FileSystem zipfs = FileSystems.newFileSystem(zipFile, (ClassLoader)null)) {
261            Files.createDirectories(pathWithinArchive);
262            for (File f : files) {
263                if (!f.isDirectory() && f.exists()) {
264                    Path externalFile = f.toPath();
265                    Path pathInZipFile = zipfs.getPath(pathWithinArchive.resolve(f.getName()).toString());
266                    Files.copy(externalFile, pathInZipFile,
267                            StandardCopyOption.REPLACE_EXISTING);
268                }
269            }
270            ret = true;
271        } catch (IOException ex) {
272            s_logger.error("Unable to add entries to Chemical Component zip archive : {}", ex.getMessage());
273            ret = false;
274        }
275        return ret;
276    }
277}