001package org.biojava.nbio.structure.chem;
002
003import org.biojava.nbio.core.util.InputStreamProvider;
004import org.biojava.nbio.structure.align.util.URLConnectionTools;
005import org.biojava.nbio.structure.align.util.UserConfiguration;
006import org.biojava.nbio.structure.io.LocalPDBDirectory;
007import org.biojava.nbio.structure.io.cif.ChemCompConverter;
008import org.rcsb.cif.ParsingException;
009import org.slf4j.Logger;
010import org.slf4j.LoggerFactory;
011
012import java.io.BufferedReader;
013import java.io.File;
014import java.io.FileOutputStream;
015import java.io.FilenameFilter;
016import java.io.IOException;
017import java.io.InputStreamReader;
018import java.io.PrintWriter;
019import java.io.StringWriter;
020import java.net.URL;
021import java.net.URLConnection;
022import java.nio.file.Files;
023import java.nio.file.Paths;
024import java.nio.file.StandardCopyOption;
025import java.util.ArrayList;
026import java.util.List;
027import java.util.concurrent.atomic.AtomicBoolean;
028import java.util.regex.Matcher;
029import java.util.regex.Pattern;
030import java.util.zip.GZIPOutputStream;
031
032/**
033 * This provider of chemical components can download and cache chemical component definition files from the RCSB PDB web
 * site. It is the default way to access these definitions. If this provider is called the first time, it will download
035 * and install all chemical component definitions in a local directory. Once the definition files have been installed,
036 * it has quick startup time and low memory requirements.
037 *
038 * An alternative provider, that keeps all definitions in memory is the {@link AllChemCompProvider}. Another provider,
039 * that does not require any network access, but only can support a limited set of chemical component definitions, is
040 * the {@link ReducedChemCompProvider}.
041 *
042 * @author Andreas Prlic
043 */
044public class DownloadChemCompProvider implements ChemCompProvider {
045    private static final Logger logger = LoggerFactory.getLogger(DownloadChemCompProvider.class);
046
047    private static final String NEWLINE = System.getProperty("line.separator");
048
049    public static final String CHEM_COMP_CACHE_DIRECTORY = "chemcomp";
050    public static final String DEFAULT_SERVER_URL = "https://files.rcsb.org/ligands/download/";
051    public static final String DEFAULT_CHEMCOMP_PATHURL_TEMPLATE = "{ccd_id}.cif";
052
053    /**
054     * The base URL to which the full path specified via {@link #setChemCompPathUrlTemplate(String)} is appended.
055     * It is assumed that it has a trailing slash.
056     */
057    public static String serverBaseUrl = DEFAULT_SERVER_URL;
058
059    private static File path;
060
061    private static String chemCompPathUrlTemplate = DEFAULT_CHEMCOMP_PATHURL_TEMPLATE;
062
063    static final Pattern CCD_ID_TEMPLATE_REGEX = Pattern.compile("\\{ccd_id(?::(\\d+_\\d+|[-+]?\\d+))?}");
064
065
066    // flags to make sure there is only one thread running that is loading the dictionary
067    static AtomicBoolean loading = new AtomicBoolean(false);
068
069    static final List<String> protectedIDs = new ArrayList<>();
070    static {
071        protectedIDs.add("CON");
072        protectedIDs.add("PRN");
073        protectedIDs.add("AUX");
074        protectedIDs.add("NUL");
075    }
076
077    private static ChemCompProvider fallback = null; // Fallback provider if the download fails
078
079    /**
080     * by default we will download only some of the files. User has to request that all files should be downloaded...
081     */
082    boolean downloadAll = false;
083
084    public DownloadChemCompProvider() {
085        this(null);
086    }
087
088    public DownloadChemCompProvider(String cacheFilePath) {
089        logger.debug("Initialising DownloadChemCompProvider");
090
091        // note that path is static, so this is just to make sure that all non-static methods will have path initialised
092        if (cacheFilePath != null) {
093            path = new File(cacheFilePath);
094        }
095    }
096
097    /**
098     * Set the base URL for the location of all chemical component CIF files, to which the chemCompPathUrlTemplate
099     * is appended, settable in {@link #setChemCompPathUrlTemplate(String)}. A trailing slash is appended
100     * if not present.
101     */
102    public static void setServerBaseUrl(String serverBaseUrl) {
103        if (!serverBaseUrl.endsWith("/")) {
104            serverBaseUrl = serverBaseUrl + "/";
105        }
106        DownloadChemCompProvider.serverBaseUrl = serverBaseUrl;
107    }
108
109    /**
110     * Set the path to append to the serverBaseUrl (settable in {@link #setServerBaseUrl(String)}).
111     * The string can contain placeholders that will be expanded at runtime:
112     * <ul>
113     * <li>"{ccd_id}" to be replaced by the chemical component identifier, in capitals</li>
114     * <li>"{ccd_id:beginIndex-endIndex}" to be replaced by a substring of the chemical component identifier in capitals,
115     * with indices following the same convention as {@link String#substring(int, int)} </li>
116     * <li>"{ccd_id:index}" to be replaced by a substring of the chemical component identifier in capitals,
117     * with index either a positive or negative integer to substring from left or right of the string respectively.</li>
118     * </ul>
119     * If any of the indices are off-bounds, then the full chemical component identifier is replaced
120     */
121    public static void setChemCompPathUrlTemplate(String chemCompPathUrlTemplate) {
122        DownloadChemCompProvider.chemCompPathUrlTemplate = chemCompPathUrlTemplate;
123    }
124
125    /**
126     * Get this provider's cache path
127     * @return
128     */
129    public static File getPath() {
130        if (path == null) {
131            UserConfiguration config = new UserConfiguration();
132            path = new File(config.getCacheFilePath());
133        }
134        return path;
135    }
136
137    /**
138     * Checks if the chemical components already have been installed into the PDB directory.
139     * If not, will download the chemical components definitions file and split it up into small
140     * subfiles.
141     */
142    public void checkDoFirstInstall() {
143        if (!downloadAll) {
144            return;
145        }
146
147        // this makes sure there is a file separator between every component,
148        // if path has a trailing file separator or not, it will work for both cases
149        File dir = new File(getPath(), CHEM_COMP_CACHE_DIRECTORY);
150        File f = new File(dir, "components.cif.gz");
151
152        if (!f.exists()) {
153            downloadAllDefinitions();
154        } else {
155            // file exists.. did it get extracted?
156            FilenameFilter filter = (dir1, file) -> file.endsWith(".cif.gz");
157            String[] files = dir.list(filter);
158            if (files.length < 500) {
159                // not all did get unpacked
160                try {
161                    split();
162                } catch (IOException e) {
163                    logger.error("Could not split file {} into individual chemical component files. Error: {}",
164                            f.toString(), e.getMessage());
165                }
166            }
167        }
168    }
169
170    private void split() throws IOException {
171        logger.info("Installing individual chem comp files ...");
172
173        File dir = new File(getPath(), CHEM_COMP_CACHE_DIRECTORY);
174        File f = new File(dir, "components.cif.gz");
175
176        int counter = 0;
177        InputStreamProvider prov = new InputStreamProvider();
178
179        try (BufferedReader buf = new BufferedReader (new InputStreamReader(prov.getInputStream(f)))) {
180            String line;
181            line = buf.readLine ();
182            StringWriter writer = new StringWriter();
183
184            String currentID = null;
185            while (line != null) {
186                if (line.startsWith("data_")) {
187                    // a new record found!
188
189                    if (currentID != null) {
190                        writeID(writer.toString(), currentID);
191                        counter++;
192                    }
193
194                    currentID = line.substring(5);
195                    writer = new StringWriter();
196                }
197
198                writer.append(line);
199                writer.append(NEWLINE);
200
201                line = buf.readLine();
202            }
203
204            // write the last record...
205            writeID(writer.toString(), currentID);
206            counter++;
207        }
208
209        logger.info("Created {} chemical component files.", counter);
210    }
211
212    /**
213     * Output chemical contents to a file
214     * @param contents File contents
215     * @param currentID Chemical ID, used to determine the filename
216     * @throws IOException
217     */
218    private void writeID(String contents, String currentID) throws IOException {
219        String localName = getLocalFileName(currentID);
220        try (PrintWriter pw = new PrintWriter(new GZIPOutputStream(new FileOutputStream(localName)))) {
221            pw.print(contents);
222            pw.flush();
223        }
224    }
225
226    /**
227     * Loads the definitions for this {@link ChemComp} from a local file and instantiates a new object.
228     *
229     * @param recordName the ID of the {@link ChemComp}
230     * @return a new {@link ChemComp} definition.
231     */
232    @Override
233    public ChemComp getChemComp(String recordName) {
234        // make sure we work with upper case records
235        recordName = recordName.toUpperCase().trim();
236
237        boolean haveFile = true;
238        if ("?".equals(recordName)) {
239            return null;
240        }
241
242        if (fileIsAbsent(recordName)) {
243            // check if we should install all components
244            checkDoFirstInstall();
245        }
246        if (fileIsAbsent(recordName)) {
247            // we previously have installed already the definitions,
248            // just do an incrememntal update
249            haveFile = downloadChemCompRecord(recordName);
250        }
251
252        // Added check that download was successful and chemical component is available.
253        if (haveFile) {
254            String filename = getLocalFileName(recordName);
255            try {
256                ChemComp chemComp;
257                try {
258                    ChemicalComponentDictionary dict = ChemCompConverter.fromPath(Paths.get(filename));
259                    chemComp = dict.getChemComp(recordName);
260                } catch (ParsingException e) {
261                    // happens for corrupt files
262                    chemComp = null;
263                }
264
265                // May be null if the file was corrupt. Fall back on ReducedChemCompProvider in that case
266                if (chemComp != null) {
267                    return chemComp;
268                }
269            } catch (IOException e) {
270                logger.warn("Could not download chemical component file {} for {}. Error: {}. Now trying to use the " +
271                                "local chemical component definitions.", filename, recordName, e.getMessage());
272            }
273        }
274
275        // see https://github.com/biojava/biojava/issues/315
276        // probably a network error happened. Try to use the ReducedChemCOmpProvider
277        if (fallback == null) {
278            fallback = new ReducedChemCompProvider();
279        }
280
281        logger.warn("Falling back to ReducedChemCompProvider for {}. This could indicate a network error.", recordName);
282        return fallback.getChemComp(recordName);
283    }
284
285    /**
286     * Returns the file name that contains the definition for this {@link ChemComp}
287     *
288     * @param recordName the ID of the {@link ChemComp}
289     * @return full path to the file
290     */
291    public static String getLocalFileName(String recordName) {
292        if (protectedIDs.contains(recordName)) {
293            recordName = "_" + recordName;
294        }
295
296        File f = new File(getPath(), CHEM_COMP_CACHE_DIRECTORY);
297        if (!f.exists()) {
298            logger.info("Creating directory {}", f);
299
300            boolean success = f.mkdir();
301            // we've checked in initPath that path is writable, so there's no need to check if it succeeds
302            // in the unlikely case that in the meantime it isn't writable at least we log an error
303            if (!success) {
304                logger.error("Directory {} could not be created", f);
305            }
306        }
307
308        File theFile = new File(f, recordName + ".cif.gz");
309        return theFile.toString();
310    }
311
312    private static boolean fileIsAbsent(String recordName) {
313        String fileName = getLocalFileName(recordName);
314        File f = new File(fileName);
315
316        // delete files that are too short to have contents
317        if (f.length() < LocalPDBDirectory.MIN_PDB_FILE_SIZE) {
318            // Delete defensively.
319            // Note that if delete is unsuccessful, we re-download the file anyways
320            f.delete();
321            return true;
322        }
323
324        return !f.exists();
325    }
326
327    /**
328     * Expands the given path URL template, replacing the placeholders as specified in {@link #setChemCompPathUrlTemplate(String)}
329     * by the ccdId given (or its substrings, if indices are present in the template)
330     * @param templateStr the template string with placeholders for ccd ids
331     * @param ccdId the ccd id to replace (in full or a substring)
332     * @return the input templateStr with placeholders replaced
333     */
334    static String expandPathUrlTemplate(String templateStr, String ccdId) {
335        Matcher m = CCD_ID_TEMPLATE_REGEX.matcher(templateStr);
336        StringBuilder output = new StringBuilder();
337        int lastIndex = 0;
338        while (m.find()) {
339            String repString = ccdId;
340            String indicesStr = m.group(1);
341            try {
342                if (indicesStr == null) {
343                    // no substringing
344                    repString = ccdId;
345                } else if (!indicesStr.contains("_")) {
346                    // left/right substring
347                    int idx = Integer.parseInt(indicesStr);
348                    if (idx < 0) { // right substring
349                        repString = ccdId.substring(ccdId.length() + idx);
350                    } else { // left substring
351                        repString = ccdId.substring(0, idx);
352                    }
353                } else if (indicesStr.contains("_")) {
354                    // start and end index
355                    String[] tokens = indicesStr.split("_");
356                    int begIdx = Integer.parseInt(tokens[0]);
357                    int endIdx = Integer.parseInt(tokens[1]);
358                    repString = ccdId.substring(begIdx, endIdx);
359                }
360            } catch (IndexOutOfBoundsException e) {
361                // we don't set repString, it keeps original value ccdId
362                logger.debug("Indices included in path URL template {} are out of bounds for string {}", templateStr, ccdId);
363            }
364            output.append(templateStr, lastIndex, m.start()).append(repString);
365
366            lastIndex = m.end();
367            // TODO when we upgrade to java 11, use the new methods introduced in java 9, see https://stackoverflow.com/questions/9605716/java-regular-expression-find-and-replace
368        }
369        if (lastIndex < templateStr.length()) {
370            output.append(templateStr, lastIndex, templateStr.length());
371        }
372        return output.toString();
373    }
374
375    /**
376     * @param recordName : three-letter name
377     * @return true if successful download
378     */
379    private static boolean downloadChemCompRecord(String recordName) {
380        String localName = getLocalFileName(recordName);
381        File newFile;
382        try {
383            newFile = Files.createTempFile("chemcomp" + recordName,"cif").toFile();
384            logger.debug("Will write chem comp file to temp file {}", newFile.toString());
385        } catch(IOException e) {
386            logger.error("Could not write to temp directory {} to create the chemical component download temp file", System.getProperty("java.io.tmpdir"));
387            return false;
388        }
389
390        String u = serverBaseUrl + expandPathUrlTemplate(chemCompPathUrlTemplate, recordName);
391
392        logger.debug("Downloading chem comp definition from {}", u);
393
394        URL url = null;
395        try {
396            url = new URL(u);
397            URLConnection uconn = URLConnectionTools.openURLConnection(url);
398
399            try (PrintWriter pw = new PrintWriter(new GZIPOutputStream(new FileOutputStream(newFile)));
400                 BufferedReader fileBuffer = new BufferedReader(new InputStreamReader(uconn.getInputStream()))) {
401                String line;
402                boolean success = false;
403                while ((line = fileBuffer.readLine()) != null) {
404                    pw.println(line);
405                    success = true;
406                }
407                if(!success) {
408                        throw new IOException("Malformed URL or no content found in "+url.toString());
409                }
410
411                pw.flush();
412            }
413            // Now we move this across to where it actually wants to be
414            Files.move(newFile.toPath(), Paths.get(localName), StandardCopyOption.REPLACE_EXISTING);
415
416            return true;
417        } catch (IOException e) {
418            logger.error("Could not download {} OR store locally to {} Error ={}",
419                    url,
420                    localName,
421                    e.getMessage());
422            newFile.delete();
423        }
424        return false;
425    }
426
427    private void downloadAllDefinitions() {
428        if (loading.get()) {
429            logger.info("Waiting for other thread to install chemical components...");
430        }
431
432        while (loading.get()) {
433            // another thread is already downloading the components definitions
434            // wait for the other thread to finish...
435            try {
436                // wait half a second
437                Thread.sleep(500);
438            } catch (InterruptedException e) {
439                //e.printStackTrace();
440                logger.error("Thread interrupted "+e.getMessage());
441            }
442
443            logger.info("Another thread installed the chemical components.");
444            return;
445        }
446
447        loading.set(true);
448        long timeS = System.currentTimeMillis();
449
450        logger.info("Performing first installation of chemical components.");
451        logger.info("Downloading components.cif.gz ...");
452
453        try {
454            AllChemCompProvider.downloadFile();
455        } catch (IOException e) {
456            logger.error("Could not download the all chemical components file. Error: {}. "
457                    + "Chemical components information won't be available", e.getMessage());
458            // no point in trying to split if the file could not be downloaded
459            loading.set(false);
460            return;
461        }
462        try {
463            split();
464        } catch (IOException e) {
465            logger.error("Could not split all chem comp file into individual chemical component files. Error: {}",
466                    e.getMessage());
467            // no point in reporting time
468            loading.set(false);
469            return;
470        }
471        long timeE = System.currentTimeMillis();
472        logger.info("time to install chem comp dictionary: " + (timeE - timeS) / 1000 + " sec.");
473        loading.set(false);
474    }
475
476    /**
477     * By default this provider will download only some of the {@link ChemComp} files.
478     * The user has to request that all files should be downloaded by setting this parameter to true.
479     *
480     *  @return flag if the all components should be downloaded and installed at startup. (default: false)
481     */
482    public boolean isDownloadAll() {
483        return downloadAll;
484    }
485
486    /** By default this provider will download only some of the {@link ChemComp} files.
487     * The user has to request that all files should be downloaded by setting this parameter to true.
488     *
489     * @param downloadAll if the all components should be downloaded and installed at startup. (default: false)
490     */
491    public void setDownloadAll(boolean downloadAll) {
492        this.downloadAll = downloadAll;
493    }
494}