/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.phosphosite;

import org.biojava.nbio.structure.align.util.AtomCache;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.net.URL;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.List;

/**
 * Downloads the PhosphoSitePlus® site datasets and locates the local copies.
 *
 * <p>PhosphoSitePlus® is licensed under the Creative Commons
 * Attribution-NonCommercial-ShareAlike 3.0 Unported License and is freely
 * available for non-commercial purposes from
 *
 * <p>http://www.phosphosite.org/staticDownloads.do
 *
 * <p>Please acknowledge "PhosphoSitePlus®, www.phosphosite.org" at appropriate locations.
 *
 * <p>Please cite: “Hornbeck PV, Kornhauser JM, Tkachev S, Zhang B, Skrzypek E, Murray B,
 * Latham V, Sullivan M (2012) PhosphoSitePlus: a comprehensive resource for investigating
 * the structure and function of experimentally determined post-translational modifications
 * in man and mouse. Nucleic Acids Res. 40(Database issue), D261–70.”
 *
 * <p>Created by ap3 on 31/10/2014.
 */
public class Dataset {

	private static final Logger logger = LoggerFactory.getLogger(Dataset.class);

	/** Remote location of the acetylation site dataset. */
	public static final String ACETYLATION = "https://www.phosphosite.org/downloads/Acetylation_site_dataset.gz";

	/** Remote location of the disease-associated sites dataset. */
	public static final String DISEASE_ASSOC = "https://www.phosphosite.org/downloads/Disease-associated_sites.gz";

	/** Remote location of the methylation site dataset. */
	public static final String METHYLATION = "https://www.phosphosite.org/downloads/Methylation_site_dataset.gz";

	/** Remote location of the phosphorylation site dataset. */
	public static final String PHOSPHORYLATION = "https://www.phosphosite.org/downloads/Phosphorylation_site_dataset.gz";

	/** Remote location of the regulatory sites dataset. */
	public static final String REGULATORY = "https://www.phosphosite.org/downloads/Regulatory_sites.gz";

	/** Remote location of the sumoylation site dataset. */
	public static final String SUMOYLATION = "https://www.phosphosite.org/downloads/Sumoylation_site_dataset.gz";

	/** Remote location of the ubiquitination site dataset. */
	public static final String UBIQUITINATION = "https://www.phosphosite.org/downloads/Ubiquitination_site_dataset.gz";

	/**
	 * Creates a dataset helper. The instance holds no state of its own.
	 */
	public Dataset() {
	}

	/**
	 * Lists the URLs of all remote dataset files known to this class.
	 *
	 * @return a freshly allocated array of the dataset URLs
	 */
	private String[] getRemoteFiles() {
		return new String[] {
				ACETYLATION, DISEASE_ASSOC, METHYLATION, PHOSPHORYLATION,
				REGULATORY, SUMOYLATION, UBIQUITINATION
		};
	}

	/**
	 * Maps a remote dataset URL to the file it is stored as inside {@code dir}.
	 * The local name is everything after the last {@code '/'} of the URL.
	 *
	 * @param dir       the local dataset directory
	 * @param remoteUrl the URL of the remote dataset file
	 * @return the local file (which may or may not exist yet)
	 */
	private static File toLocalFile(File dir, String remoteUrl) {
		// lastIndexOf yields -1 when there is no slash, so "+ 1" keeps the whole string.
		String fileName = remoteUrl.substring(remoteUrl.lastIndexOf('/') + 1);
		return new File(dir, fileName);
	}

	/**
	 * Returns the dataset files that are already present in the local directory.
	 * Datasets that have not been downloaded are left out of the result.
	 *
	 * @return the existing local dataset files; empty if none are present
	 */
	public File[] getLocalFiles() {

		File dir = getLocalDir();

		List<File> files = new ArrayList<>();
		for (String remote : getRemoteFiles()) {
			File localFile = toLocalFile(dir, remote);
			if (localFile.exists()) {
				files.add(localFile);
			}
		}

		return files.toArray(new File[0]);
	}

	/**
	 * Returns the directory used for the local dataset copies: the
	 * {@code phosphosite} sub-directory of the {@link AtomCache} cache path.
	 * The directory is not created by this method.
	 *
	 * @return the local dataset directory
	 */
	public File getLocalDir() {
		AtomCache cache = new AtomCache();
		return new File(cache.getCachePath(), "phosphosite");
	}

	/**
	 * Downloads every remote dataset file that is not yet present in the local
	 * directory, creating that directory first if necessary.
	 *
	 * <p>The download is best-effort: a failure for one file is logged and the
	 * remaining files are still attempted. If the local directory cannot be
	 * created, an error is logged and nothing is downloaded.
	 */
	public void download() {

		logger.warn("Downloading data from www.phosphosite.org. Data is under CC-BY-NC-SA license. Please link to site and cite: ");
		logger.warn("Hornbeck PV, Kornhauser JM, Tkachev S, Zhang B, Skrzypek E, Murray B, Latham V, Sullivan M (2012) PhosphoSitePlus: a comprehensive resource for investigating the structure and function of experimentally determined post-translational modifications in man and mouse. Nucleic Acids Res. 40(Database issue), D261–70.");

		File dir = getLocalDir();

		// mkdirs() also creates a missing parent cache directory. The trailing
		// isDirectory() check covers the directory appearing concurrently.
		if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
			logger.error("Could not create local directory {}; nothing was downloaded", dir);
			return;
		}

		for (String remote : getRemoteFiles()) {
			try {
				File localFile = toLocalFile(dir, remote);
				if (!localFile.exists()) {
					downloadFile(new URL(remote), localFile);
				}
			} catch (IOException | RuntimeException e) {
				// Deliberately best-effort: report the failure and go on with the next file.
				logger.error("Could not download {}", remote, e);
			}
		}
	}

	/**
	 * Downloads the content of a URL into a local file.
	 *
	 * <p>The data is first written to a temporary file and only copied to
	 * {@code localFile} once the transfer has completed, so an interrupted
	 * download does not leave a truncated file at the destination. The
	 * temporary file is removed afterwards, whether or not the download
	 * succeeded.
	 *
	 * @param u         the URL to read from
	 * @param localFile the destination file; replaced if it already exists
	 * @throws IOException if the temporary file cannot be created, the URL
	 *                     cannot be read, or the destination cannot be written
	 */
	public void downloadFile(URL u, File localFile) throws IOException {

		logger.info("Downloading {}", u);

		File tmp = File.createTempFile("tmp", "phosphosite");
		try {
			// createTempFile has already created the file, hence REPLACE_EXISTING.
			try (InputStream in = u.openStream()) {
				Files.copy(in, tmp.toPath(), StandardCopyOption.REPLACE_EXISTING);
			}

			// now copy tmp file to localFile
			copyFile(tmp, localFile);
		} finally {
			if (!tmp.delete()) {
				logger.warn("Could not delete temporary file {}", tmp);
			}
		}
	}

	/**
	 * Copies a file, replacing the destination if it already exists.
	 *
	 * @param src the file to copy
	 * @param dst the destination file
	 * @throws IOException if the copy fails
	 */
	public static void copyFile(File src, File dst) throws IOException {
		Files.copy(src.toPath(), dst.toPath(), StandardCopyOption.REPLACE_EXISTING);
	}

	/**
	 * Demo entry point: downloads the datasets, parses each local file and logs
	 * the sites belonging to UniProt accessions P50225 and P48025.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {

		Dataset ds = new Dataset();

		ds.download();

		try {

			for (File f : ds.getLocalFiles()) {

				logger.info(f.getAbsolutePath());

				List<Site> sites = Site.parseSites(f);

				logger.info("Got {} sites", sites.size());
				for (Site s : sites) {
					// Constant-first equals: no NullPointerException if a site has no accession.
					if ("P50225".equals(s.getUniprot()) || "P48025".equals(s.getUniprot())) {
						logger.info(s.toString());
					}
				}

			}

		} catch (Exception e) {
			logger.error("Could not parse the local PhosphoSitePlus files", e);
		}
	}

}