001package org.biojava.nbio.structure.chem;
002
import org.biojava.nbio.core.util.InputStreamProvider;
import org.biojava.nbio.structure.align.util.URLConnectionTools;
import org.biojava.nbio.structure.align.util.UserConfiguration;
import org.biojava.nbio.structure.io.LocalPDBDirectory;
import org.biojava.nbio.structure.io.cif.ChemCompConverter;
import org.rcsb.cif.ParsingException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPOutputStream;
031
032/**
033 * This provider of chemical components can download and cache chemical component definition files from the RCSB PDB web
034 * site. It is the default way to access these definitions. If this provider is called he first time, it will download
035 * and install all chemical component definitions in a local directory. Once the definition files have been installed,
036 * it has quick startup time and low memory requirements.
037 *
038 * An alternative provider, that keeps all definitions in memory is the {@link AllChemCompProvider}. Another provider,
039 * that does not require any network access, but only can support a limited set of chemical component definitions, is
040 * the {@link ReducedChemCompProvider}.
041 *
042 * @author Andreas Prlic
043 */
044public class DownloadChemCompProvider implements ChemCompProvider {
045    private static final Logger logger = LoggerFactory.getLogger(DownloadChemCompProvider.class);
046
047    private static final String NEWLINE = System.getProperty("line.separator");
048
049    public static final String CHEM_COMP_CACHE_DIRECTORY = "chemcomp";
050    public static final String DEFAULT_SERVER_URL = "https://files.rcsb.org/ligands/download/";
051    public static final String DEFAULT_CHEMCOMP_PATHURL_TEMPLATE = "{ccd_id}.cif";
052
053    /**
054     * The base URL to which the full path specified via {@link #setChemCompPathUrlTemplate(String)} is appended.
055     * It is assumed that it has a trailing slash.
056     */
057    public static String serverBaseUrl = DEFAULT_SERVER_URL;
058
059    private static File path;
060
061    private static String chemCompPathUrlTemplate = DEFAULT_CHEMCOMP_PATHURL_TEMPLATE;
062
063    static final Pattern CCD_ID_TEMPLATE_REGEX = Pattern.compile("\\{ccd_id(?::(\\d+_\\d+|[-+]?\\d+))?}");
064
065
066    // flags to make sure there is only one thread running that is loading the dictionary
067    static AtomicBoolean loading = new AtomicBoolean(false);
068
069    static final List<String> protectedIDs = new ArrayList<>();
070    static {
071        protectedIDs.add("CON");
072        protectedIDs.add("PRN");
073        protectedIDs.add("AUX");
074        protectedIDs.add("NUL");
075    }
076
077    private static ChemCompProvider fallback = null; // Fallback provider if the download fails
078
079    /**
080     * by default we will download only some of the files. User has to request that all files should be downloaded...
081     */
082    boolean downloadAll = false;
083
084    public DownloadChemCompProvider() {
085        this(null);
086    }
087
088    public DownloadChemCompProvider(String cacheFilePath) {
089        logger.debug("Initialising DownloadChemCompProvider");
090
091        // note that path is static, so this is just to make sure that all non-static methods will have path initialised
092        if (cacheFilePath != null) {
093            path = new File(cacheFilePath);
094        }
095    }
096
097    /**
098     * Set the base URL for the location of all chemical component CIF files, to which the chemCompPathUrlTemplate
099     * is appended, settable in {@link #setChemCompPathUrlTemplate(String)}. A trailing slash is appended
100     * if not present.
101     */
102    public static void setServerBaseUrl(String serverBaseUrl) {
103        if (!serverBaseUrl.endsWith("/")) {
104            serverBaseUrl = serverBaseUrl + "/";
105        }
106        DownloadChemCompProvider.serverBaseUrl = serverBaseUrl;
107    }
108
109    /**
110     * Set the path to append to the serverBaseUrl (settable in {@link #setServerBaseUrl(String)}).
111     * The string can contain placeholders that will be expanded at runtime:
112     * <li>"{ccd_id}" to be replaced by the chemical component identifier, in capitals</li>
113     * <li>"{ccd_id:beginIndex-endIndex}" to be replaced by a substring of the chemical component identifier in capitals,
114     * with indices following the same convention as {@link String#substring(int, int)} </li>
115     * <li>"{ccd_id:index}" to be replaced by a substring of the chemical component identifier in capitals,
116     * with index either a positive or negative integer to substring from left or right of the string respectively.</li>
117     * If any of the indices are off-bounds, then the full chemical component identifier is replaced
118     */
119    public static void setChemCompPathUrlTemplate(String chemCompPathUrlTemplate) {
120        DownloadChemCompProvider.chemCompPathUrlTemplate = chemCompPathUrlTemplate;
121    }
122
123    /**
124     * Get this provider's cache path
125     * @return
126     */
127    public static File getPath() {
128        if (path == null) {
129            UserConfiguration config = new UserConfiguration();
130            path = new File(config.getCacheFilePath());
131        }
132        return path;
133    }
134
135    /**
136     * Checks if the chemical components already have been installed into the PDB directory.
137     * If not, will download the chemical components definitions file and split it up into small
138     * subfiles.
139     */
140    public void checkDoFirstInstall() {
141        if (!downloadAll) {
142            return;
143        }
144
145        // this makes sure there is a file separator between every component,
146        // if path has a trailing file separator or not, it will work for both cases
147        File dir = new File(getPath(), CHEM_COMP_CACHE_DIRECTORY);
148        File f = new File(dir, "components.cif.gz");
149
150        if (!f.exists()) {
151            downloadAllDefinitions();
152        } else {
153            // file exists.. did it get extracted?
154            FilenameFilter filter = (dir1, file) -> file.endsWith(".cif.gz");
155            String[] files = dir.list(filter);
156            if (files.length < 500) {
157                // not all did get unpacked
158                try {
159                    split();
160                } catch (IOException e) {
161                    logger.error("Could not split file {} into individual chemical component files. Error: {}",
162                            f.toString(), e.getMessage());
163                }
164            }
165        }
166    }
167
168    private void split() throws IOException {
169        logger.info("Installing individual chem comp files ...");
170
171        File dir = new File(getPath(), CHEM_COMP_CACHE_DIRECTORY);
172        File f = new File(dir, "components.cif.gz");
173
174        int counter = 0;
175        InputStreamProvider prov = new InputStreamProvider();
176
177        try (BufferedReader buf = new BufferedReader (new InputStreamReader(prov.getInputStream(f)))) {
178            String line;
179            line = buf.readLine ();
180            StringWriter writer = new StringWriter();
181
182            String currentID = null;
183            while (line != null) {
184                if (line.startsWith("data_")) {
185                    // a new record found!
186
187                    if (currentID != null) {
188                        writeID(writer.toString(), currentID);
189                        counter++;
190                    }
191
192                    currentID = line.substring(5);
193                    writer = new StringWriter();
194                }
195
196                writer.append(line);
197                writer.append(NEWLINE);
198
199                line = buf.readLine();
200            }
201
202            // write the last record...
203            writeID(writer.toString(), currentID);
204            counter++;
205        }
206
207        logger.info("Created {} chemical component files.", counter);
208    }
209
210    /**
211     * Output chemical contents to a file
212     * @param contents File contents
213     * @param currentID Chemical ID, used to determine the filename
214     * @throws IOException
215     */
216    private void writeID(String contents, String currentID) throws IOException {
217        String localName = getLocalFileName(currentID);
218        try (PrintWriter pw = new PrintWriter(new GZIPOutputStream(new FileOutputStream(localName)))) {
219            pw.print(contents);
220            pw.flush();
221        }
222    }
223
224    /**
225     * Loads the definitions for this {@link ChemComp} from a local file and instantiates a new object.
226     *
227     * @param recordName the ID of the {@link ChemComp}
228     * @return a new {@link ChemComp} definition.
229     */
230    @Override
231    public ChemComp getChemComp(String recordName) {
232        // make sure we work with upper case records
233        recordName = recordName.toUpperCase().trim();
234
235        boolean haveFile = true;
236        if (recordName.equals("?")) {
237            return null;
238        }
239
240        if (fileIsAbsent(recordName)) {
241            // check if we should install all components
242            checkDoFirstInstall();
243        }
244        if (fileIsAbsent(recordName)) {
245            // we previously have installed already the definitions,
246            // just do an incrememntal update
247            haveFile = downloadChemCompRecord(recordName);
248        }
249
250        // Added check that download was successful and chemical component is available.
251        if (haveFile) {
252            String filename = getLocalFileName(recordName);
253            try {
254                ChemComp chemComp;
255                try {
256                    ChemicalComponentDictionary dict = ChemCompConverter.fromPath(Paths.get(filename));
257                    chemComp = dict.getChemComp(recordName);
258                } catch (ParsingException e) {
259                    // happens for corrupt files
260                    chemComp = null;
261                }
262
263                // May be null if the file was corrupt. Fall back on ReducedChemCompProvider in that case
264                if (chemComp != null) {
265                    return chemComp;
266                }
267            } catch (IOException e) {
268                logger.warn("Could not download chemical component file {} for {}. Error: {}. Now trying to use the " +
269                                "local chemical component definitions.", filename, recordName, e.getMessage());
270            }
271        }
272
273        // see https://github.com/biojava/biojava/issues/315
274        // probably a network error happened. Try to use the ReducedChemCOmpProvider
275        if (fallback == null) {
276            fallback = new ReducedChemCompProvider();
277        }
278
279        logger.warn("Falling back to ReducedChemCompProvider for {}. This could indicate a network error.", recordName);
280        return fallback.getChemComp(recordName);
281    }
282
283    /**
284     * Returns the file name that contains the definition for this {@link ChemComp}
285     *
286     * @param recordName the ID of the {@link ChemComp}
287     * @return full path to the file
288     */
289    public static String getLocalFileName(String recordName) {
290        if (protectedIDs.contains(recordName)) {
291            recordName = "_" + recordName;
292        }
293
294        File f = new File(getPath(), CHEM_COMP_CACHE_DIRECTORY);
295        if (!f.exists()) {
296            logger.info("Creating directory {}", f);
297
298            boolean success = f.mkdir();
299            // we've checked in initPath that path is writable, so there's no need to check if it succeeds
300            // in the unlikely case that in the meantime it isn't writable at least we log an error
301            if (!success) {
302                logger.error("Directory {} could not be created", f);
303            }
304        }
305
306        File theFile = new File(f, recordName + ".cif.gz");
307        return theFile.toString();
308    }
309
310    private static boolean fileIsAbsent(String recordName) {
311        String fileName = getLocalFileName(recordName);
312        File f = new File(fileName);
313
314        // delete files that are too short to have contents
315        if (f.length() < LocalPDBDirectory.MIN_PDB_FILE_SIZE) {
316            // Delete defensively.
317            // Note that if delete is unsuccessful, we re-download the file anyways
318            f.delete();
319            return true;
320        }
321
322        return !f.exists();
323    }
324
325    /**
326     * Expands the given path URL template, replacing the placeholders as specified in {@link #setChemCompPathUrlTemplate(String)}
327     * by the ccdId given (or its substrings, if indices are present in the template)
328     * @param templateStr the template string with placeholders for ccd ids
329     * @param ccdId the ccd id to replace (in full or a substring)
330     * @return the input templateStr with placeholders replaced
331     */
332    static String expandPathUrlTemplate(String templateStr, String ccdId) {
333        Matcher m = CCD_ID_TEMPLATE_REGEX.matcher(templateStr);
334        StringBuilder output = new StringBuilder();
335        int lastIndex = 0;
336        while (m.find()) {
337            String repString = ccdId;
338            String indicesStr = m.group(1);
339            try {
340                if (indicesStr == null) {
341                    // no substringing
342                    repString = ccdId;
343                } else if (!indicesStr.contains("_")) {
344                    // left/right substring
345                    int idx = Integer.parseInt(indicesStr);
346                    if (idx < 0) { // right substring
347                        repString = ccdId.substring(ccdId.length() + idx);
348                    } else { // left substring
349                        repString = ccdId.substring(0, idx);
350                    }
351                } else if (indicesStr.contains("_")) {
352                    // start and end index
353                    String[] tokens = indicesStr.split("_");
354                    int begIdx = Integer.parseInt(tokens[0]);
355                    int endIdx = Integer.parseInt(tokens[1]);
356                    repString = ccdId.substring(begIdx, endIdx);
357                }
358            } catch (IndexOutOfBoundsException e) {
359                // we don't set repString, it keeps original value ccdId
360                logger.debug("Indices included in path URL template {} are out of bounds for string {}", templateStr, ccdId);
361            }
362            output.append(templateStr, lastIndex, m.start()).append(repString);
363
364            lastIndex = m.end();
365            // TODO when we upgrade to java 11, use the new methods introduced in java 9, see https://stackoverflow.com/questions/9605716/java-regular-expression-find-and-replace
366        }
367        if (lastIndex < templateStr.length()) {
368            output.append(templateStr, lastIndex, templateStr.length());
369        }
370        return output.toString();
371    }
372
373    /**
374     * @param recordName : three-letter name
375     * @return true if successful download
376     */
377    private static boolean downloadChemCompRecord(String recordName) {
378        String localName = getLocalFileName(recordName);
379        File newFile;
380        try {
381            newFile = File.createTempFile("chemcomp" + recordName, "cif");
382            logger.debug("Will write chem comp file to temp file {}", newFile.toString());
383        } catch(IOException e) {
384            logger.error("Could not write to temp directory {} to create the chemical component download temp file", System.getProperty("java.io.tmpdir"));
385            return false;
386        }
387
388        String u = serverBaseUrl + expandPathUrlTemplate(chemCompPathUrlTemplate, recordName);
389
390        logger.debug("Downloading chem comp definition from {}", u);
391
392        URL url = null;
393        try {
394            url = new URL(u);
395            URLConnection uconn = URLConnectionTools.openURLConnection(url);
396
397            try (PrintWriter pw = new PrintWriter(new GZIPOutputStream(new FileOutputStream(newFile)));
398                 BufferedReader fileBuffer = new BufferedReader(new InputStreamReader(uconn.getInputStream()))) {
399                String line;
400                boolean success = false;
401                while ((line = fileBuffer.readLine()) != null) {
402                    pw.println(line);
403                    success = true;
404                }
405                if(!success) {
406                        throw new IOException("Malformed URL or no content found in "+url.toString());
407                }
408
409                pw.flush();
410            }
411            // Now we move this across to where it actually wants to be
412            Files.move(newFile.toPath(), Paths.get(localName), StandardCopyOption.REPLACE_EXISTING);
413
414            return true;
415        } catch (IOException e) {
416            logger.error("Could not download {} OR store locally to {} Error ={}",
417                    url,
418                    localName,
419                    e.getMessage());
420            newFile.delete();
421        }
422        return false;
423    }
424
425    private void downloadAllDefinitions() {
426        if (loading.get()) {
427            logger.info("Waiting for other thread to install chemical components...");
428        }
429
430        while (loading.get()) {
431            // another thread is already downloading the components definitions
432            // wait for the other thread to finish...
433            try {
434                // wait half a second
435                Thread.sleep(500);
436            } catch (InterruptedException e) {
437                //e.printStackTrace();
438                logger.error("Thread interrupted "+e.getMessage());
439            }
440
441            logger.info("Another thread installed the chemical components.");
442            return;
443        }
444
445        loading.set(true);
446        long timeS = System.currentTimeMillis();
447
448        logger.info("Performing first installation of chemical components.");
449        logger.info("Downloading components.cif.gz ...");
450
451        try {
452            AllChemCompProvider.downloadFile();
453        } catch (IOException e) {
454            logger.error("Could not download the all chemical components file. Error: {}. "
455                    + "Chemical components information won't be available", e.getMessage());
456            // no point in trying to split if the file could not be downloaded
457            loading.set(false);
458            return;
459        }
460        try {
461            split();
462        } catch (IOException e) {
463            logger.error("Could not split all chem comp file into individual chemical component files. Error: {}",
464                    e.getMessage());
465            // no point in reporting time
466            loading.set(false);
467            return;
468        }
469        long timeE = System.currentTimeMillis();
470        logger.info("time to install chem comp dictionary: " + (timeE - timeS) / 1000 + " sec.");
471        loading.set(false);
472    }
473
474    /**
475     * By default this provider will download only some of the {@link ChemComp} files.
476     * The user has to request that all files should be downloaded by setting this parameter to true.
477     *
478     *  @return flag if the all components should be downloaded and installed at startup. (default: false)
479     */
480    public boolean isDownloadAll() {
481        return downloadAll;
482    }
483
484    /** By default this provider will download only some of the {@link ChemComp} files.
485     * The user has to request that all files should be downloaded by setting this parameter to true.
486     *
487     * @param downloadAll if the all components should be downloaded and installed at startup. (default: false)
488     */
489    public void setDownloadAll(boolean downloadAll) {
490        this.downloadAll = downloadAll;
491    }
492}