001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * @author Karl Nicholas <github:karlnicholas> 015 * 016 * For more information on the BioJava project and its aims, 017 * or to join the biojava-l mailing list, visit the home page 018 * at: 019 * 020 * http://www.biojava.org/ 021 * 022 * Created on 08-08-2013 023 * 024 */ 025package org.biojava.nbio.core.sequence.loader; 026 027import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 028import org.biojava.nbio.core.sequence.AccessionID; 029import org.biojava.nbio.core.sequence.DNASequence; 030import org.biojava.nbio.core.sequence.ProteinSequence; 031import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 032import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 033import org.biojava.nbio.core.sequence.compound.DNACompoundSet; 034import org.biojava.nbio.core.sequence.compound.NucleotideCompound; 035import org.biojava.nbio.core.sequence.features.*; 036import org.biojava.nbio.core.sequence.io.GenbankSequenceParser; 037import org.biojava.nbio.core.sequence.io.GenericGenbankHeaderParser; 038import org.biojava.nbio.core.sequence.template.AbstractSequence; 039import org.biojava.nbio.core.sequence.template.Compound; 040import org.biojava.nbio.core.sequence.template.CompoundSet; 041import org.slf4j.Logger; 042import org.slf4j.LoggerFactory; 043 044import java.io.*; 045import java.net.URL; 046import java.net.URLConnection; 047import java.util.List; 048import java.util.Map; 049 050/** 051 * @author Karl Nicholas <github:karlnicholas> 052 * @author Jacek Grzebyta <github:jgrzebyta> 053 */ 054public class GenbankProxySequenceReader<C extends Compound> extends StringProxySequenceReader<C> implements FeaturesKeyWordInterface, DatabaseReferenceInterface, FeatureRetriever { 055 056 private static final Logger logger = LoggerFactory.getLogger(GenbankProxySequenceReader.class); 057 058 private static final String eutilBaseURL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"; // 059 private String genbankDirectoryCache = null; 060 private GenbankSequenceParser<AbstractSequence<C>, C> genbankParser; 061 private GenericGenbankHeaderParser<AbstractSequence<C>, C> headerParser; 062 private String header; 063 private Map<String, List<AbstractFeature<AbstractSequence<C>, C>>> features; 064 065 066 /** 067 * 068 * @throws InterruptedException 069 * @throws IOException 070 * @throws CompoundNotFoundException 071 */ 072 public GenbankProxySequenceReader( 073 String genbankDirectoryCache, 074 String accessionID, 075 CompoundSet<C> compoundSet ) throws IOException, InterruptedException, CompoundNotFoundException { 076 077 setGenbankDirectoryCache(genbankDirectoryCache); 078 setCompoundSet(compoundSet); 079 080 String db = compoundSet instanceof AminoAcidCompoundSet ? "protein" : "nuccore"; 081 082 InputStream inStream = getBufferedInputStream(accessionID, db); 083 genbankParser = new GenbankSequenceParser<AbstractSequence<C>, C>(); 084 085 setContents(genbankParser.getSequence(new BufferedReader(new InputStreamReader(inStream)), 0)); 086 headerParser = genbankParser.getSequenceHeaderParser(); 087 header = genbankParser.getHeader(); 088 features = genbankParser.getFeatures(); 089 090 if (compoundSet.getClass().equals(AminoAcidCompoundSet.class)) { 091 if (!genbankParser.getCompoundType().equals(compoundSet)) { 092 logger.error("Declared compount type {} does not mach the real: {}", genbankParser.getCompoundType().toString(), compoundSet.toString()); 093 throw new IOException("Wrong declared compound type for: " + accessionID); 094 } 095 } 096 097 inStream.close(); 098 } 099 100 private BufferedInputStream getBufferedInputStream(String accessionID, String db) throws IOException, InterruptedException { 101 BufferedInputStream inStream = null; 102 if (genbankDirectoryCache != null && genbankDirectoryCache.length() > 0) { 103 File f = new File(genbankDirectoryCache + File.separatorChar + accessionID + ".gb"); 104 if (f.exists()) { 105 logger.debug("Reading: {}", f.toString()); 106 inStream = new BufferedInputStream(new FileInputStream(f)); 107 } else { 108 InputStream in = getEutilsInputStream(accessionID, db); 109 copyInputStreamToFile(in, f); 110 inStream = new BufferedInputStream(new FileInputStream(f)); 111 } 112 } else { 113 inStream = new BufferedInputStream(getEutilsInputStream(accessionID, db)); 114 } 115 return inStream; 116 } 117 118 private void copyInputStreamToFile(InputStream in, File f) throws IOException, InterruptedException { 119 FileOutputStream out = new FileOutputStream(f); 120 byte[] buffer = new byte[1024]; 121 int len = in.read(buffer); 122 while (len != -1) { 123 out.write(buffer, 0, len); 124 len = in.read(buffer); 125 if (Thread.interrupted()) { 126 in.close(); 127 out.close(); 128 throw new InterruptedException(); 129 } 130 } 131 in.close(); 132 out.close(); 133 } 134 135 private InputStream getEutilsInputStream(String accessionID, String db) throws IOException { 136 String genbankURL = eutilBaseURL + "efetch.fcgi?db=" + db + "&id=" + accessionID + "&rettype=gb&retmode=text"; 137 logger.trace("Loading: {}", genbankURL); 138 URL genbank = new URL(genbankURL); 139 URLConnection genbankConnection = genbank.openConnection(); 140 return genbankConnection.getInputStream(); 141 } 142 143 /** 144 * Local directory cache of Genbank that can be downloaded 145 * 146 * @return the uniprotDirectoryCache 147 */ 148 public String getGenbankDirectoryCache() { 149 return genbankDirectoryCache; 150 } 151 152 /** 153 * @param genbankDirectoryCache 154 */ 155 public void setGenbankDirectoryCache(String genbankDirectoryCache) { 156 if (genbankDirectoryCache != null) { 157 File f = new File(genbankDirectoryCache); 158 if (!f.exists()) { 159 f.mkdirs(); 160 } 161 } 162 this.genbankDirectoryCache = genbankDirectoryCache; 163 } 164 165 public String getHeader() { 166 return header; 167 } 168 169 public GenericGenbankHeaderParser<AbstractSequence<C>, C> getHeaderParser() { 170 return headerParser; 171 } 172 @Override 173 public Map<String, List<AbstractFeature<AbstractSequence<C>, C>>> getFeatures() { 174 return features; 175 } 176 177 @Override 178 public Map<String, List<DBReferenceInfo>> getDatabaseReferences() { 179 return genbankParser.getDatabaseReferences(); 180 } 181 182 @Override 183 public List<String> getKeyWords() { 184 return genbankParser.getKeyWords(); 185 } 186 187 public static void main(String[] args) throws Throwable { 188 189 GenbankProxySequenceReader<AminoAcidCompound> genbankProteinReader 190 = new GenbankProxySequenceReader<AminoAcidCompound>("/tmp", "NP_000257", AminoAcidCompoundSet.getAminoAcidCompoundSet()); 191 ProteinSequence proteinSequence = new ProteinSequence(genbankProteinReader); 192 genbankProteinReader.getHeaderParser().parseHeader(genbankProteinReader.getHeader(), proteinSequence); 193 logger.info("Sequence ({},{})={}...", proteinSequence.getAccession(), proteinSequence.getLength(), proteinSequence.getSequenceAsString().substring(0, 10)); 194 logger.info("Keywords: {}", genbankProteinReader.getKeyWords()); 195 logger.info("DatabaseReferences: {}", genbankProteinReader.getDatabaseReferences()); 196 proteinSequence.getFeatures(); 197 198 GenbankProxySequenceReader<NucleotideCompound> genbankDNAReader 199 = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "NM_001126", DNACompoundSet.getDNACompoundSet()); 200 DNASequence dnaSequence = new DNASequence(genbankDNAReader); 201 genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence); 202 dnaSequence.setAccession(new AccessionID("NM_001126")); 203 logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10)); 204 logger.info("Keywords: {}", genbankDNAReader.getKeyWords()); 205 logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences()); 206 207 genbankDNAReader 208 = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "NM_000266", DNACompoundSet.getDNACompoundSet()); 209 dnaSequence = new DNASequence(genbankDNAReader); 210 genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence); 211 logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10)); 212 logger.info("Keywords: {}", genbankDNAReader.getKeyWords()); 213 logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences()); 214 215 genbankDNAReader 216 = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "AV254721", DNACompoundSet.getDNACompoundSet()); 217 dnaSequence = new DNASequence(genbankDNAReader); 218 genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence); 219 logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10)); 220 logger.info("Keywords: {}", genbankDNAReader.getKeyWords()); 221 logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences()); 222 223 genbankDNAReader 224 = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "AV254721.2", DNACompoundSet.getDNACompoundSet()); 225 dnaSequence = new DNASequence(genbankDNAReader); 226 genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence); 227 logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10)); 228 logger.info("Keywords: {}", genbankDNAReader.getKeyWords()); 229 logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences()); 230 231 genbankDNAReader 232 = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "U49845", DNACompoundSet.getDNACompoundSet()); 233 dnaSequence = new DNASequence(genbankDNAReader); 234 genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence); 235 logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10)); 236 logger.info("Keywords: {}", genbankDNAReader.getKeyWords()); 237 logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences()); 238 239 genbankDNAReader 240 = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "GI:1293613", DNACompoundSet.getDNACompoundSet()); 241 dnaSequence = new DNASequence(genbankDNAReader); 242 genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence); 243 logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10)); 244 logger.info("Keywords: {}", genbankDNAReader.getKeyWords()); 245 logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences()); 246 247 genbankDNAReader 248 = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "14109166", DNACompoundSet.getDNACompoundSet()); 249 dnaSequence = new DNASequence(genbankDNAReader); 250 genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence); 251 logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10)); 252 logger.info("Keywords: {}", genbankDNAReader.getKeyWords()); 253 logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences()); 254 255 /* 256 GenbankProxySequenceReader genbankProxyReader = new GenbankProxySequenceReader("/tmp"); 257 Sequence<?> sequence; 258 259 sequence = genbankProxyReader.getDNASequence(new AccessionID("NM_001126")); 260 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 261 262 sequence = genbankProxyReader.getDNASequence(new AccessionID("NM_000266")); 263 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 264 265 sequence = genbankProxyReader.getProteinSequence(new AccessionID("NP_000257")); 266 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 267 268 sequence = genbankProxyReader.getProteinSequence(new AccessionID("AV254721")); 269 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 270 271 sequence = genbankProxyReader.getProteinSequence(new AccessionID("AV254721.2")); 272 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 273 274 sequence = genbankProxyReader.getProteinSequence(new AccessionID("U49845")); 275 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 276 277 sequence = genbankProxyReader.getProteinSequence(new AccessionID("GI:1293613")); 278 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 279 280 sequence = genbankProxyReader.getProteinSequence(new AccessionID("14109166")); 281 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 282 */ 283 } 284}