001/** 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on Dec 7, 2013 021 * Created by Douglas Myers-Turnbull 022 * 023 */ 024package org.biojava.nbio.structure.io.sifts; 025 026import org.biojava.nbio.structure.align.util.UserConfiguration; 027import org.biojava.nbio.core.sequence.io.util.IOUtils; 028import org.slf4j.Logger; 029import org.slf4j.LoggerFactory; 030 031import java.io.*; 032import java.net.MalformedURLException; 033import java.net.URL; 034import java.util.Collection; 035import java.util.HashMap; 036import java.util.Map; 037import java.util.Map.Entry; 038import java.util.Set; 039import java.util.zip.GZIPInputStream; 040 041/** 042 * A mapping between UniProt entries and PDB chains. 043 * For example 044 * <pre> 045 * SiftsChainToUniprot sifts = SiftsChainToUniprot.load(); 046 * SiftsChainEntry entry1 = sifts.getByUniProtId("P04585"); 047 * System.out.println(entry1.getPdbId() + "." + entry1.getChainName()); // 1hiv.A 048 * System.out.println(entry1.getPdbStart() + "-" + entry1.getPdbStop()); // 1-99 049 * SiftsChainEntry entry2 = sifts.getByChainId("1hiv", "A"); 050 * System.out.println(entry1.equals(entry2)); // true 051 * </pre> 052 * See SIFTS project documentation: https://www.ebi.ac.uk/pdbe/docs/sifts/ 053 * @author dmyersturnbull 054 * @see SiftsChainEntry 055 * @since 3.0.7 056 */ 057public class SiftsChainToUniprotMapping { 058 059 private final static Logger logger = LoggerFactory.getLogger(SiftsChainToUniprotMapping.class); 060 061 062 protected static File DEFAULT_FILE; 063 064 private static final String DEFAULT_FILENAME = "pdb_chain_uniprot.tsv"; 065 private static final URL DEFAULT_URL; 066 067 static { 068 try { 069 DEFAULT_URL = new URL("ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_uniprot.tsv.gz"); 070 } catch (MalformedURLException e) { 071 throw new RuntimeException(e); 072 } 073 } 074 075 /** 076 * Loads the SIFTS mapping. 077 * Attempts to load the mapping file in the PDB cache directory. 078 * If the file does not exist or could not be parsed, downloads and stores a GZ-compressed file. 079 * @return 080 * @throws IOException If the local file could not be read and could not be downloaded 081 */ 082 public static SiftsChainToUniprotMapping load() throws IOException { 083 return load(false); 084 } 085 086 /** 087 * Loads the SIFTS mapping. 088 * Attempts to load the mapping file in the PDB cache directory. 089 * If the file does not exist or could not be parsed, downloads and stores a GZ-compressed file. 090 * @param useOnlyLocal If true, will throw an IOException if the file needs to be downloaded 091 * @return 092 * @throws IOException If the local file could not be read and could not be downloaded (including if onlyLocal is true) 093 */ 094 public static SiftsChainToUniprotMapping load(boolean useOnlyLocal) throws IOException { 095 096 UserConfiguration config = new UserConfiguration(); 097 File cacheDir = new File(config.getCacheFilePath()); 098 099 DEFAULT_FILE = new File(cacheDir, DEFAULT_FILENAME); 100 101 102 if (!DEFAULT_FILE.exists() || DEFAULT_FILE.length() == 0) { 103 if (useOnlyLocal) throw new IOException(DEFAULT_FILE + " does not exist, and did not download"); 104 download(); 105 } 106 try { 107 return build(); 108 } catch (IOException e) { 109 logger.info("Caught IOException while reading {}. Error: {}",DEFAULT_FILE,e.getMessage()); 110 if (useOnlyLocal) throw new IOException(DEFAULT_FILE + " could not be read, and did not redownload"); 111 download(); 112 return build(); 113 } 114 } 115 116 /** 117 * Builds the mapping by reading SIFTS the tsv file set in {@link #DEFAULT_FILE} variable. 118 * @return 119 * @throws IOException 120 */ 121 protected static SiftsChainToUniprotMapping build() throws IOException { 122 SiftsChainToUniprotMapping sifts = new SiftsChainToUniprotMapping(); 123 BufferedReader br = new BufferedReader(new FileReader(DEFAULT_FILE)); 124 String line = ""; 125 while ((line = br.readLine()) != null) { 126 if (line.isEmpty() || line.startsWith("#") || line.startsWith("PDB")) continue; 127 String[] parts = line.split("\t"); 128 String pdbId = parts[0]; 129 String chainId = parts[1]; 130 String uniProtId = parts[2]; 131 String seqresStart = parts[3]; 132 String seqresEnd = parts[4]; 133 String pdbStart = parts[5]; 134 String pdbEnd = parts[6]; 135 String uniprotStart = parts[7]; 136 String uniprotEnd = parts[8]; 137 SiftsChainEntry entry = new SiftsChainEntry(pdbId, chainId, uniProtId, seqresStart, seqresEnd, 138 pdbStart, pdbEnd, uniprotStart, uniprotEnd); 139 sifts.byChainId.put(pdbId + "." + chainId, entry); 140 sifts.byUniProtId.put(uniProtId, entry); 141 } 142 br.close(); 143 return sifts; 144 } 145 146 private static void download() throws IOException { 147 148 logger.info("Downloading {} to {}",DEFAULT_URL.toString(),DEFAULT_FILE); 149 150 InputStream in = null; 151 OutputStream out = null; 152 153 in = new GZIPInputStream(DEFAULT_URL.openStream()); 154 out = new FileOutputStream(DEFAULT_FILE); 155 IOUtils.copy(in, out); 156 157 } 158 159 private Map<String, SiftsChainEntry> byChainId = new HashMap<String, SiftsChainEntry>(); 160 161 private Map<String, SiftsChainEntry> byUniProtId = new HashMap<String, SiftsChainEntry>(); 162 163 private SiftsChainToUniprotMapping() { 164 165 } 166 167 public Set<Entry<String, SiftsChainEntry>> chainEntrySet() { 168 return byChainId.entrySet(); 169 } 170 171 public boolean containsChainId(String pdbId, String chainId) { 172 return byChainId.containsKey(pdbId + "." + chainId); 173 } 174 175 public boolean containsUniProtId(String uniProtId) { 176 return byUniProtId.containsKey(uniProtId); 177 } 178 179 public SiftsChainEntry getByChainId(String pdbId, String chainId) { 180 return byChainId.get(pdbId + "." + chainId); 181 } 182 183 public SiftsChainEntry getByUniProtId(String uniProtId) { 184 return byUniProtId.get(uniProtId); 185 } 186 187 public Set<String> keySet() { 188 return byChainId.keySet(); 189 } 190 191 /** 192 * Returns the number of mapped entries. 193 */ 194 public int size() { 195 return byChainId.size(); 196 } 197 198 public Set<Entry<String, SiftsChainEntry>> uniProtEntrySet() { 199 return byChainId.entrySet(); 200 } 201 202 public Collection<SiftsChainEntry> values() { 203 return byChainId.values(); 204 } 205}