001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.phosphosite; 022 023import org.biojava.nbio.structure.align.util.AtomCache; 024import org.slf4j.Logger; 025import org.slf4j.LoggerFactory; 026 027import java.io.*; 028import java.net.URL; 029import java.nio.file.Files; 030import java.nio.file.StandardCopyOption; 031import java.util.ArrayList; 032import java.util.Arrays; 033import java.util.List; 034import java.util.stream.Collectors; 035import java.util.stream.Stream; 036 037/** 038 * Phosphosite is available under the PhosphoSitePlus® is licensed under Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License and is freely available for non-commercial purposes from 039 * 040 * http://www.phosphosite.org/staticDownloads.do 041 * 042 * Please acknowledge PhosphoSitePlus®, www.phosphosite.org" at appropriate locations. 043 * 044 * Please cite : “Hornbeck PV, Kornhauser JM, Tkachev S, Zhang B, Skrzypek E, Murray B, Latham V, Sullivan M (2012) PhosphoSitePlus: a comprehensive resource for investigating the structure and function of experimentally determined post-translational modifications in man and mouse. Nucleic Acids Res. 40(Database issue), D261–70.”. 045 * 046 * 047 * 048 * Created by ap3 on 31/10/2014. 049 */ 050public class Dataset { 051 052 private static final Logger logger = LoggerFactory.getLogger(Dataset.class); 053 054 public static final String ACETYLATION = "https://www.phosphosite.org/downloads/Acetylation_site_dataset.gz"; 055 056 public static final String DISEASE_ASSOC = "https://www.phosphosite.org/downloads/Disease-associated_sites.gz"; 057 058 public static final String METHYLATION = "https://www.phosphosite.org/downloads/Methylation_site_dataset.gz"; 059 060 public static final String PHOSPHORYLATION = "https://www.phosphosite.org/downloads/Phosphorylation_site_dataset.gz"; 061 062 public static final String REGULATORY = "https://www.phosphosite.org/downloads/Regulatory_sites.gz"; 063 064 public static final String SUMOYLATION = "https://www.phosphosite.org/downloads/Sumoylation_site_dataset.gz"; 065 066 public static final String UBIQUITINATION = "https://www.phosphosite.org/downloads/Ubiquitination_site_dataset.gz"; 067 068 069 public Dataset(){ 070 071 072 } 073 074 private String[] getRemoteFiles(){ 075 String[] files = new String[]{ACETYLATION,DISEASE_ASSOC,METHYLATION,PHOSPHORYLATION,REGULATORY,SUMOYLATION,UBIQUITINATION}; 076 077 078 return files; 079 } 080 081 public File[] getLocalFiles(){ 082 String[] rfiles = getRemoteFiles(); 083 File dir = getLocalDir(); 084 List<File> files = Arrays.stream(rfiles).map(remoteFileName -> remoteFileName.substring(remoteFileName.lastIndexOf("/"))) 085 .map(localFile -> new File(dir+"/"+localFile)) 086 .filter(file -> file.exists()) 087 .collect(Collectors.toList()); 088 089 return files.toArray(new File[files.size()]); 090 } 091 092 093 public File getLocalDir(){ 094 AtomCache cache = new AtomCache(); 095 096 String path = cache.getCachePath(); 097 098 File dir = new File(path+"/phosphosite"); 099 100 return dir; 101 } 102 103 public void download(){ 104 105 logger.warn("Downloading data from www.phosposite.org. Data is under CC-BY-NC-SA license. Please link to site and cite: "); 106 logger.warn("Hornbeck PV, Kornhauser JM, Tkachev S, Zhang B, Skrzypek E, Murray B, Latham V, Sullivan M (2012) PhosphoSitePlus: a comprehensive resource for investigating the structure and function of experimentally determined post-translational modifications in man and mouse. Nucleic Acids Res. 40(Database issue), D261–70."); 107 108 File dir = getLocalDir(); 109 110 if ( ! dir.exists()) { 111 112 // need to download all... 113 114 dir.mkdir(); 115 116 117 } 118 119 String[] files = getRemoteFiles(); 120 121 for ( String f : files){ 122 123 try { 124 125 126 int slashIndex = f.lastIndexOf("/"); 127 128 String fileName = f.substring(slashIndex); 129 130 File localFile = new File(dir+"/" + fileName); 131 132 if ( ! localFile.exists()){ 133 134 URL u = new URL(f); 135 downloadFile(u, localFile); 136 } 137 138 139 } catch (Exception e){ 140 141 e.printStackTrace(); 142 } 143 144 145 } 146 147 } 148 149 public void downloadFile(URL u, File localFile) throws IOException { 150 151 logger.info("Downloading " + u); 152 153 File tmp = File.createTempFile("tmp","phosphosite"); 154 155 InputStream is = u.openStream(); 156 157 BufferedInputStream in = new BufferedInputStream(is); 158 159 FileOutputStream w = new FileOutputStream(tmp); 160 161 int i= 0; 162 byte[] bytesIn = new byte[300000]; 163 while ((i = in.read(bytesIn)) >= 0) { 164 w.write(bytesIn,0,i); 165 } 166 in.close(); 167 w.close(); 168 169 170 // now copy tmp file to localFile 171 copyFile(tmp, localFile); 172 173 } 174 175 176 177 public static void copyFile(File src, File dst) throws IOException 178 { 179 180 Files.copy(src.toPath(), dst.toPath(), StandardCopyOption.REPLACE_EXISTING); 181 182 } 183 184 185 public static void main(String[] args) { 186 187 Dataset ds = new Dataset(); 188 189 ds.download(); 190 191 try { 192 193 for (File f : ds.getLocalFiles()) { 194 195 logger.info(f.getAbsolutePath()); 196 197 List<Site> sites = Site.parseSites(f); 198 199 logger.info("Got " + sites.size() + " sites"); 200 for (Site s : sites) { 201 if (s.getUniprot().equals("P50225") || s.getUniprot().equals("P48025")) { 202 logger.info(s.toString()); 203 } 204 } 205 206 } 207 208 209 } catch (Exception e) { 210 e.printStackTrace(); 211 } 212 } 213 214}