001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * @author Karl Nicholas <github:karlnicholas> 015 * 016 * For more information on the BioJava project and its aims, 017 * or to join the biojava-l mailing list, visit the home page 018 * at: 019 * 020 * http://www.biojava.org/ 021 * 022 * Created on 08-08-2013 023 * 024 */ 025package org.biojava.nbio.core.sequence.loader; 026 027import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 028import org.biojava.nbio.core.sequence.AccessionID; 029import org.biojava.nbio.core.sequence.DNASequence; 030import org.biojava.nbio.core.sequence.ProteinSequence; 031import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 032import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 033import org.biojava.nbio.core.sequence.compound.DNACompoundSet; 034import org.biojava.nbio.core.sequence.compound.NucleotideCompound; 035import org.biojava.nbio.core.sequence.features.AbstractFeature; 036import org.biojava.nbio.core.sequence.features.DBReferenceInfo; 037import org.biojava.nbio.core.sequence.features.DatabaseReferenceInterface; 038import org.biojava.nbio.core.sequence.features.FeatureRetriever; 039import org.biojava.nbio.core.sequence.features.FeaturesKeyWordInterface; 040import org.biojava.nbio.core.sequence.io.GenbankSequenceParser; 041import org.biojava.nbio.core.sequence.io.GenericGenbankHeaderParser; 042import org.biojava.nbio.core.sequence.template.AbstractSequence; 043import org.biojava.nbio.core.sequence.template.Compound; 044import org.biojava.nbio.core.sequence.template.CompoundSet; 045import org.slf4j.Logger; 046import org.slf4j.LoggerFactory; 047 048import java.io.BufferedInputStream; 049import java.io.BufferedReader; 050import java.io.File; 051import java.io.FileInputStream; 052import java.io.FileOutputStream; 053import java.io.IOException; 054import java.io.InputStream; 055import java.io.InputStreamReader; 056import java.net.URL; 057import java.net.URLConnection; 058import java.util.ArrayList; 059import java.util.HashMap; 060import java.util.LinkedHashMap; 061 062/** 063 * @author Karl Nicholas <github:karlnicholas> 064 * @author Jacek Grzebyta <github:jgrzebyta> 065 */ 066public class GenbankProxySequenceReader<C extends Compound> extends StringProxySequenceReader<C> implements FeaturesKeyWordInterface, DatabaseReferenceInterface, FeatureRetriever { 067 068 private static final Logger logger = LoggerFactory.getLogger(GenbankProxySequenceReader.class); 069 070 private static final String eutilBaseURL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"; // 071 private String genbankDirectoryCache = null; 072 private GenbankSequenceParser<AbstractSequence<C>, C> genbankParser; 073 private GenericGenbankHeaderParser<AbstractSequence<C>, C> headerParser; 074 private String header; 075 private HashMap<String, ArrayList<AbstractFeature>> features; 076 077 078 /** 079 * 080 * @throws InterruptedException 081 * @throws IOException 082 * @throws CompoundNotFoundException 083 */ 084 public GenbankProxySequenceReader( 085 String genbankDirectoryCache, 086 String accessionID, 087 CompoundSet<C> compoundSet ) throws IOException, InterruptedException, CompoundNotFoundException { 088 089 setGenbankDirectoryCache(genbankDirectoryCache); 090 setCompoundSet(compoundSet); 091 092 String db = compoundSet instanceof AminoAcidCompoundSet ? "protein" : "nuccore"; 093 094 InputStream inStream = getBufferedInputStream(accessionID, db); 095 genbankParser = new GenbankSequenceParser<AbstractSequence<C>, C>(); 096 097 setContents(genbankParser.getSequence(new BufferedReader(new InputStreamReader(inStream)), 0)); 098 headerParser = genbankParser.getSequenceHeaderParser(); 099 header = genbankParser.getHeader(); 100 features = genbankParser.getFeatures(); 101 102 if (compoundSet.getClass().equals(AminoAcidCompoundSet.class)) { 103 if (!genbankParser.getCompoundType().equals(compoundSet)) { 104 logger.error("Declared compount type {} does not mach the real: {}", genbankParser.getCompoundType().toString(), compoundSet.toString()); 105 throw new IOException("Wrong declared compound type for: " + accessionID); 106 } 107 } 108 109 inStream.close(); 110 } 111 112 private BufferedInputStream getBufferedInputStream(String accessionID, String db) throws IOException, InterruptedException { 113 BufferedInputStream inStream = null; 114 if (genbankDirectoryCache != null && genbankDirectoryCache.length() > 0) { 115 File f = new File(genbankDirectoryCache + File.separatorChar + accessionID + ".gb"); 116 if (f.exists()) { 117 logger.debug("Reading: {}", f.toString()); 118 inStream = new BufferedInputStream(new FileInputStream(f)); 119 } else { 120 InputStream in = getEutilsInputStream(accessionID, db); 121 copyInputStreamToFile(in, f); 122 inStream = new BufferedInputStream(new FileInputStream(f)); 123 } 124 } else { 125 inStream = new BufferedInputStream(getEutilsInputStream(accessionID, db)); 126 } 127 return inStream; 128 } 129 130 private void copyInputStreamToFile(InputStream in, File f) throws IOException, InterruptedException { 131 FileOutputStream out = new FileOutputStream(f); 132 byte[] buffer = new byte[1024]; 133 int len = in.read(buffer); 134 while (len != -1) { 135 out.write(buffer, 0, len); 136 len = in.read(buffer); 137 if (Thread.interrupted()) { 138 in.close(); 139 out.close(); 140 throw new InterruptedException(); 141 } 142 } 143 in.close(); 144 out.close(); 145 } 146 147 private InputStream getEutilsInputStream(String accessionID, String db) throws IOException { 148 String genbankURL = eutilBaseURL + "efetch.fcgi?db=" + db + "&id=" + accessionID + "&rettype=gb&retmode=text"; 149 logger.trace("Loading: {}", genbankURL); 150 URL genbank = new URL(genbankURL); 151 URLConnection genbankConnection = genbank.openConnection(); 152 return genbankConnection.getInputStream(); 153 } 154 155 /** 156 * Local directory cache of Genbank that can be downloaded 157 * 158 * @return the uniprotDirectoryCache 159 */ 160 public String getGenbankDirectoryCache() { 161 return genbankDirectoryCache; 162 } 163 164 /** 165 * @param genbankDirectoryCache 166 */ 167 public void setGenbankDirectoryCache(String genbankDirectoryCache) { 168 if (genbankDirectoryCache != null) { 169 File f = new File(genbankDirectoryCache); 170 if (!f.exists()) { 171 f.mkdirs(); 172 } 173 } 174 this.genbankDirectoryCache = genbankDirectoryCache; 175 } 176 177 public String getHeader() { 178 return header; 179 } 180 181 public GenericGenbankHeaderParser<AbstractSequence<C>, C> getHeaderParser() { 182 return headerParser; 183 } 184 @Override 185 public HashMap<String, ArrayList<AbstractFeature>> getFeatures() { 186 return features; 187 } 188 189 @Override 190 public LinkedHashMap<String, ArrayList<DBReferenceInfo>> getDatabaseReferences() { 191 return genbankParser.getDatabaseReferences(); 192 } 193 194 @Override 195 public ArrayList<String> getKeyWords() { 196 return genbankParser.getKeyWords(); 197 } 198 199 public static void main(String[] args) throws Throwable { 200 201 GenbankProxySequenceReader<AminoAcidCompound> genbankProteinReader 202 = new GenbankProxySequenceReader<AminoAcidCompound>("/tmp", "NP_000257", AminoAcidCompoundSet.getAminoAcidCompoundSet()); 203 ProteinSequence proteinSequence = new ProteinSequence(genbankProteinReader); 204 genbankProteinReader.getHeaderParser().parseHeader(genbankProteinReader.getHeader(), proteinSequence); 205 logger.info("Sequence ({},{})={}...", proteinSequence.getAccession(), proteinSequence.getLength(), proteinSequence.getSequenceAsString().substring(0, 10)); 206 logger.info("Keywords: {}", genbankProteinReader.getKeyWords()); 207 logger.info("DatabaseReferences: {}", genbankProteinReader.getDatabaseReferences()); 208 proteinSequence.getFeatures(); 209 210 GenbankProxySequenceReader<NucleotideCompound> genbankDNAReader 211 = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "NM_001126", DNACompoundSet.getDNACompoundSet()); 212 DNASequence dnaSequence = new DNASequence(genbankDNAReader); 213 genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence); 214 dnaSequence.setAccession(new AccessionID("NM_001126")); 215 logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10)); 216 logger.info("Keywords: {}", genbankDNAReader.getKeyWords()); 217 logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences()); 218 219 genbankDNAReader 220 = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "NM_000266", DNACompoundSet.getDNACompoundSet()); 221 dnaSequence = new DNASequence(genbankDNAReader); 222 genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence); 223 logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10)); 224 logger.info("Keywords: {}", genbankDNAReader.getKeyWords()); 225 logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences()); 226 227 genbankDNAReader 228 = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "AV254721", DNACompoundSet.getDNACompoundSet()); 229 dnaSequence = new DNASequence(genbankDNAReader); 230 genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence); 231 logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10)); 232 logger.info("Keywords: {}", genbankDNAReader.getKeyWords()); 233 logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences()); 234 235 genbankDNAReader 236 = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "AV254721.2", DNACompoundSet.getDNACompoundSet()); 237 dnaSequence = new DNASequence(genbankDNAReader); 238 genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence); 239 logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10)); 240 logger.info("Keywords: {}", genbankDNAReader.getKeyWords()); 241 logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences()); 242 243 genbankDNAReader 244 = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "U49845", DNACompoundSet.getDNACompoundSet()); 245 dnaSequence = new DNASequence(genbankDNAReader); 246 genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence); 247 logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10)); 248 logger.info("Keywords: {}", genbankDNAReader.getKeyWords()); 249 logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences()); 250 251 genbankDNAReader 252 = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "GI:1293613", DNACompoundSet.getDNACompoundSet()); 253 dnaSequence = new DNASequence(genbankDNAReader); 254 genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence); 255 logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10)); 256 logger.info("Keywords: {}", genbankDNAReader.getKeyWords()); 257 logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences()); 258 259 genbankDNAReader 260 = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "14109166", DNACompoundSet.getDNACompoundSet()); 261 dnaSequence = new DNASequence(genbankDNAReader); 262 genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence); 263 logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10)); 264 logger.info("Keywords: {}", genbankDNAReader.getKeyWords()); 265 logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences()); 266 267 /* 268 GenbankProxySequenceReader genbankProxyReader = new GenbankProxySequenceReader("/tmp"); 269 Sequence<?> sequence; 270 271 sequence = genbankProxyReader.getDNASequence(new AccessionID("NM_001126")); 272 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 273 274 sequence = genbankProxyReader.getDNASequence(new AccessionID("NM_000266")); 275 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 276 277 sequence = genbankProxyReader.getProteinSequence(new AccessionID("NP_000257")); 278 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 279 280 sequence = genbankProxyReader.getProteinSequence(new AccessionID("AV254721")); 281 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 282 283 sequence = genbankProxyReader.getProteinSequence(new AccessionID("AV254721.2")); 284 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 285 286 sequence = genbankProxyReader.getProteinSequence(new AccessionID("U49845")); 287 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 288 289 sequence = genbankProxyReader.getProteinSequence(new AccessionID("GI:1293613")); 290 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 291 292 sequence = genbankProxyReader.getProteinSequence(new AccessionID("14109166")); 293 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 294 */ 295 } 296}