001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.phosphosite; 022 023import java.io.BufferedInputStream; 024import java.io.File; 025import java.io.FileOutputStream; 026import java.io.IOException; 027import java.io.InputStream; 028import java.net.URL; 029import java.nio.file.Files; 030import java.nio.file.StandardCopyOption; 031import java.util.Arrays; 032import java.util.List; 033import java.util.stream.Collectors; 034 035import org.biojava.nbio.structure.align.util.AtomCache; 036import org.slf4j.Logger; 037import org.slf4j.LoggerFactory; 038 039/** 040 * Phosphosite is available under the PhosphoSitePlus® is licensed under Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License and is freely available for non-commercial purposes from 041 * 042 * http://www.phosphosite.org/staticDownloads.do 043 * 044 * Please acknowledge PhosphoSitePlus®, www.phosphosite.org" at appropriate locations. 045 * 046 * Please cite : “Hornbeck PV, Kornhauser JM, Tkachev S, Zhang B, Skrzypek E, Murray B, Latham V, Sullivan M (2012) PhosphoSitePlus: a comprehensive resource for investigating the structure and function of experimentally determined post-translational modifications in man and mouse. Nucleic Acids Res. 40(Database issue), D261–70.”. 047 * 048 * 049 * 050 * Created by ap3 on 31/10/2014. 051 */ 052public class Dataset { 053 054 private static final Logger logger = LoggerFactory.getLogger(Dataset.class); 055 056 public static final String ACETYLATION = "https://www.phosphosite.org/downloads/Acetylation_site_dataset.gz"; 057 058 public static final String DISEASE_ASSOC = "https://www.phosphosite.org/downloads/Disease-associated_sites.gz"; 059 060 public static final String METHYLATION = "https://www.phosphosite.org/downloads/Methylation_site_dataset.gz"; 061 062 public static final String PHOSPHORYLATION = "https://www.phosphosite.org/downloads/Phosphorylation_site_dataset.gz"; 063 064 public static final String REGULATORY = "https://www.phosphosite.org/downloads/Regulatory_sites.gz"; 065 066 public static final String SUMOYLATION = "https://www.phosphosite.org/downloads/Sumoylation_site_dataset.gz"; 067 068 public static final String UBIQUITINATION = "https://www.phosphosite.org/downloads/Ubiquitination_site_dataset.gz"; 069 070 071 public Dataset(){ 072 073 074 } 075 076 private String[] getRemoteFiles(){ 077 String[] files = new String[]{ACETYLATION,DISEASE_ASSOC,METHYLATION,PHOSPHORYLATION,REGULATORY,SUMOYLATION,UBIQUITINATION}; 078 079 080 return files; 081 } 082 083 public File[] getLocalFiles(){ 084 String[] rfiles = getRemoteFiles(); 085 File dir = getLocalDir(); 086 List<File> files = Arrays.stream(rfiles).map(remoteFileName -> remoteFileName.substring(remoteFileName.lastIndexOf("/"))) 087 .map(localFile -> new File(dir+"/"+localFile)) 088 .filter(file -> file.exists()) 089 .collect(Collectors.toList()); 090 091 return files.toArray(new File[files.size()]); 092 } 093 094 095 public File getLocalDir(){ 096 AtomCache cache = new AtomCache(); 097 098 String path = cache.getCachePath(); 099 100 File dir = new File(path+"/phosphosite"); 101 102 return dir; 103 } 104 105 public void download(){ 106 107 logger.warn("Downloading data from www.phosposite.org. Data is under CC-BY-NC-SA license. Please link to site and cite: "); 108 logger.warn("Hornbeck PV, Kornhauser JM, Tkachev S, Zhang B, Skrzypek E, Murray B, Latham V, Sullivan M (2012) PhosphoSitePlus: a comprehensive resource for investigating the structure and function of experimentally determined post-translational modifications in man and mouse. Nucleic Acids Res. 40(Database issue), D261–70."); 109 110 File dir = getLocalDir(); 111 112 if ( ! dir.exists()) { 113 114 // need to download all... 115 116 dir.mkdir(); 117 118 119 } 120 121 String[] files = getRemoteFiles(); 122 123 for ( String f : files){ 124 125 try { 126 127 128 int slashIndex = f.lastIndexOf("/"); 129 130 String fileName = f.substring(slashIndex); 131 132 File localFile = new File(dir+"/" + fileName); 133 134 if ( ! localFile.exists()){ 135 136 URL u = new URL(f); 137 downloadFile(u, localFile); 138 } 139 140 141 } catch (Exception e){ 142 143 e.printStackTrace(); 144 } 145 146 147 } 148 149 } 150 151 public void downloadFile(URL u, File localFile) throws IOException { 152 153 logger.info("Downloading " + u); 154 155 File tmp = Files.createTempFile("tmp","phosphosite").toFile(); 156 157 InputStream is = u.openStream(); 158 159 BufferedInputStream in = new BufferedInputStream(is); 160 161 FileOutputStream w = new FileOutputStream(tmp); 162 163 int i= 0; 164 byte[] bytesIn = new byte[300000]; 165 while ((i = in.read(bytesIn)) >= 0) { 166 w.write(bytesIn,0,i); 167 } 168 in.close(); 169 w.close(); 170 171 172 // now copy tmp file to localFile 173 copyFile(tmp, localFile); 174 175 } 176 177 178 179 public static void copyFile(File src, File dst) throws IOException 180 { 181 182 Files.copy(src.toPath(), dst.toPath(), StandardCopyOption.REPLACE_EXISTING); 183 184 } 185 186 187 public static void main(String[] args) { 188 189 Dataset ds = new Dataset(); 190 191 ds.download(); 192 193 try { 194 195 for (File f : ds.getLocalFiles()) { 196 197 logger.info(f.getAbsolutePath()); 198 199 List<Site> sites = Site.parseSites(f); 200 201 logger.info("Got " + sites.size() + " sites"); 202 for (Site s : sites) { 203 if ("P50225".equals(s.getUniprot()) || "P48025".equals(s.getUniprot())) { 204 logger.info(s.toString()); 205 } 206 } 207 208 } 209 210 211 } catch (Exception e) { 212 e.printStackTrace(); 213 } 214 } 215 216}