001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure.io.mmcif;
022
023import java.io.BufferedOutputStream;
024import java.io.File;
025import java.io.FileOutputStream;
026import java.io.FilenameFilter;
027import java.io.IOException;
028import java.io.InputStream;
029import java.nio.file.FileSystem;
030import java.nio.file.FileSystems;
031import java.nio.file.Files;
032import java.nio.file.Path;
033import java.nio.file.Paths;
034import java.nio.file.StandardCopyOption;
035import java.util.HashSet;
036import java.util.Set;
037import java.util.zip.GZIPInputStream;
038import java.util.zip.ZipEntry;
039import java.util.zip.ZipOutputStream;
040
041import org.biojava.nbio.structure.io.mmcif.chem.PolymerType;
042import org.biojava.nbio.structure.io.mmcif.chem.ResidueType;
043import org.biojava.nbio.structure.io.mmcif.model.ChemComp;
044import org.slf4j.Logger;
045import org.slf4j.LoggerFactory;
046
047/** This chemical component provider retrieves and caches chemical component definition files from a
048 * zip archive specified in its construction.  If the archive does not contain the record, an attempt is
049 * made to download it using DownloadChemCompProvider. The downloaded file is then added to the archive.
050 *
051 * The class is thread-safe and the same ZipChemCompProvider should be used by all threads to prevent
052 * simultaneous read or write to the zip archive.  A zip archive will be created if missing.
053 *
054 * @author edlunde
055 * @author larsonm
056 * @since 12/05/12
057 * updated 3/5/2016 for Java 7 ZipFileSystem
058 */
059public class ZipChemCompProvider implements ChemCompProvider{
060        private static final Logger s_logger = LoggerFactory.getLogger(ZipChemCompProvider.class);
061
062        private final Path m_tempDir;  // Base path where $m_zipRootDir/ will be downloaded to.
063        private final Path m_zipRootDir;
064        private final Path m_zipFile;
065        private final DownloadChemCompProvider m_dlProvider;
066
067        private boolean m_removeCif;
068
069        // Missing IDs from library that cannot be download added here to prevent delays.
070        private Set<String> unavailable = new HashSet<String>();
071
072        /**
073         * ZipChemCompProvider is a Chemical Component provider that stores chemical components
074         * in a zip archive.  Missing chemical components are downloaded and appended to the
075         * archive.  If non-existent a new zip archive will be created.
076         *
077         * @param chemicalComponentDictionaryFile : path to zip archive for chemical components.
078         * @param tempDir : path for temporary directory, (null) defaults to path in property "java.io.tmpdir".
079         * @throws IOException
080         */
081        public ZipChemCompProvider(String chemicalComponentDictionaryFile, String tempDir) throws IOException {
082                this.m_zipFile = Paths.get(chemicalComponentDictionaryFile);
083
084                // Use a default temporary directory if not passed a value.
085                if (tempDir == null || tempDir.equals("")) {
086                        this.m_tempDir = Paths.get(System.getProperty("java.io.tmpdir"));
087                } else {
088                        this.m_tempDir = Paths.get(tempDir);
089                }
090
091                this.m_zipRootDir = Paths.get("chemcomp");
092
093                // Setup an instance of the download chemcomp provider.
094                this.m_dlProvider = new DownloadChemCompProvider(m_tempDir.toString());
095                this.m_removeCif = true;
096                initializeZip();
097        }
098
099        // See comments in addToZipFileSystem for why initialization is required with
100        // ZipFileSystems - due to URI issues in Java7.
101        private void initializeZip() throws IOException {
102                s_logger.info("Using chemical component dictionary: " + m_zipFile.toString());
103                final File f = m_zipFile.toFile();
104                if (!f.exists()) {
105                        s_logger.info("Creating missing zip archive: " + m_zipFile.toString());
106                        FileOutputStream fo = new FileOutputStream(f);
107                        ZipOutputStream zip = new ZipOutputStream(new BufferedOutputStream(fo));
108                        try {
109                                zip.putNextEntry(new ZipEntry("chemcomp/"));
110                                zip.closeEntry();
111                        } finally {
112                                zip.close();
113                        }
114                }
115        }
116
117        /**
118         * Remove downloaded .cif.gz after adding to zip archive?
119         * Default is true.
120         * @param doRemove
121         */
122        public void setRemoveCif(boolean doRemove) {
123                m_removeCif = doRemove;
124        }
125
126        /* (non-Javadoc)
127         * @see org.biojava.nbio.structure.io.mmcif.ChemCompProvider#getChemComp(java.lang.String)
128         *
129         * @param recordName : three letter PDB name for a residue
130         * @return ChemComp from .zip or ChemComp from repository.  Will return empty ChemComp when unable to find a residue and will return null if not provided a valid recordName.
131         */
132        @Override
133        public ChemComp getChemComp(String recordName) {
134                if (null == recordName) return null;
135
136                // handle non-existent ChemComp codes and do not repeatedly attempt to add these.
137                for (String str : unavailable) {
138                        if (recordName.equals(str)) return getEmptyChemComp(recordName);
139                }
140
141                // Try to pull from zip, if fail then download.
142                ChemComp cc = getFromZip(recordName);
143                if (cc == null) {
144                        s_logger.info("File "+recordName+" not found in archive. Attempting download from PDB.");
145                        cc = downloadAndAdd(recordName);
146                }
147
148                // If a null record or an empty chemcomp, return a default ChemComp and blacklist.
149                if (cc == null || (null == cc.getName() && cc.getAtoms().size() == 0)) {
150                        s_logger.info("Unable to find or download " + recordName + " - excluding from future searches.");
151                        unavailable.add(recordName);
152                        return getEmptyChemComp(recordName);
153                }
154                return cc;
155        }
156
157        /** Use DownloadChemCompProvider to grab a gzipped cif record from the PDB.
158         *  Zip all downloaded cif.gz files into the dictionary.
159         *
160         * @param recordName is the three-letter chemical component code (i.e. residue name).
161         * @return ChemComp matching recordName
162         */
163        private ChemComp downloadAndAdd(String recordName){
164                final ChemComp cc = m_dlProvider.getChemComp(recordName);
165
166                // final File [] files = finder(m_tempDir.resolve("chemcomp").toString(), "cif.gz");
167                final File [] files = new File[1];
168                Path cif = m_tempDir.resolve("chemcomp").resolve(recordName + ".cif.gz");
169                files[0] = cif.toFile();
170                if (files[0] != null) {
171                        addToZipFileSystem(m_zipFile, files, m_zipRootDir);
172                        if (m_removeCif) for (File f : files) f.delete();
173                }
174                return cc;
175        }
176
177        /**
178         * Cleanup chemical component (.cif.gz) files downloaded to tmpdir.
179         * @param tempdir : path to temporary directory for chemical components
180         */
181        public static void purgeTempFiles(String tempdir) {
182                if (tempdir == null) return;
183
184                s_logger.info("Removing: "+tempdir);
185                Path dlPath = Paths.get(tempdir).resolve("chemcomp");
186                File[] chemCompOutFiles = finder(dlPath.toString(), "cif.gz");
187                if (null != chemCompOutFiles) for (File f : chemCompOutFiles) f.delete();
188                dlPath.toFile().delete();
189        }
190
191        /**
192         * Return an empty ChemComp group for a three-letter resName.
193         * @param resName
194         * @return
195         */
196        private ChemComp getEmptyChemComp(String resName){
197                String pdbName = ""; // Empty string is default
198                if (null != resName && resName.length() >= 3) {
199                        pdbName = resName.substring(0,3);
200                }
201                final ChemComp comp = new ChemComp();
202                comp.setOne_letter_code("?");
203                comp.setThree_letter_code(pdbName);
204                comp.setPolymerType(PolymerType.unknown);
205                comp.setResidueType(ResidueType.atomn);
206                return comp;
207        }
208
209        /**
210         * Return File(s) in dirName that match suffix.
211         * @param dirName
212         * @param suffix
213         * @return
214         */
215        static private File[] finder( String dirName, final String suffix){
216                if (null == dirName || null == suffix) {
217                        return null;
218                }
219
220                final File dir = new File(dirName);
221                return dir.listFiles(new FilenameFilter() {
222                        @Override
223                        public boolean accept(File dir, String filename)
224                        { return filename.endsWith(suffix); }
225                } );
226        }
227
228        /**
229         * This is synchronized, along with addToFileSystem to prevent simulatenous reading/writing.
230         * @param recordName to find in zipfile.
231         * @return ChemComp if found or null if missing.
232         */
233        private synchronized ChemComp getFromZip(String recordName) {
234                ChemComp cc = null;
235                if (!m_zipFile.toFile().exists()) return cc;
236                final String filename = "chemcomp/" + recordName+".cif.gz";
237
238                // try with resources block to read from the filesystem.
239                try (FileSystem fs = FileSystems.newFileSystem(m_zipFile, null)) {
240                        Path cif = fs.getPath(filename);
241
242                        if (Files.exists(cif)) {
243                                final InputStream zipStream = Files.newInputStream(cif);
244                                final InputStream inputStream = new GZIPInputStream(zipStream);
245                                s_logger.debug("reading " + recordName + " from " + m_zipFile);
246                                final MMcifParser parser = new SimpleMMcifParser();
247                                final ChemCompConsumer consumer = new ChemCompConsumer();
248                                parser.addMMcifConsumer(consumer);
249                                parser.parse(inputStream);
250                                inputStream.close();
251
252                                final ChemicalComponentDictionary dict = consumer.getDictionary();
253                                cc = dict.getChemComp(recordName);
254                        }
255                } catch (IOException e) {
256                        s_logger.error("Unable to read from zip file : " + e.getMessage());
257                }
258
259                return cc;
260        }
261
262        /**
263         * Add an array of files to a zip archive.
264         * Synchronized to prevent simultaneous reading/writing.
265         *
266         * @param zipFile is a destination zip archive
267         * @param files is an array of files to be added
268         * @param pathWithinArchive is the path within the archive to add files to
269         * @return true if successfully appended these files.
270         */
271        private synchronized boolean addToZipFileSystem(Path zipFile, File[] files, Path pathWithinArchive) {
272                boolean ret = false;
273
274                /* URIs in Java 7 cannot have spaces, must use Path instead
275                 * and so, cannot use the properties map to describe need to create
276                 * a new zip archive.  ZipChemCompProvider.initilizeZip to creates the
277                 * missing zip file */
278
279                /*
280                // convert the filename to a URI
281                String uriString = "jar:file:" + zipFile.toUri().getPath();
282                final URI uri = URI.create(uriString);
283
284                // if filesystem doesn't exist, create one.
285                final Map<String, String> env = new HashMap<>();
286                // Create a new zip if one isn't present.
287                if (!zipFile.toFile().exists()) {
288                        System.out.println("Need to create " + zipFile.toString());
289                }
290                env.put("create", String.valueOf(!zipFile.toFile().exists()));
291                // Specify the encoding as UTF -8
292                env.put("encoding", "UTF-8");
293                */
294
295                // Copy in each file.
296                try (FileSystem zipfs = FileSystems.newFileSystem(zipFile, null)) {
297                        Files.createDirectories(pathWithinArchive);
298                        for (File f : files) {
299                                if (!f.isDirectory() && f.exists()) {
300                                        Path externalFile = f.toPath();
301                                        Path pathInZipFile = zipfs.getPath(pathWithinArchive.resolve(f.getName()).toString());
302                                        Files.copy(externalFile, pathInZipFile,
303                                                        StandardCopyOption.REPLACE_EXISTING);
304                                }
305                        }
306                        ret = true;
307                } catch (IOException ex) {
308                        s_logger.error("Unable to add entries to Chemical Component zip archive : " + ex.getMessage());
309                        ret = false;
310                }
311                return ret;
312        }
313}