001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * @author Karl Nicholas <github:karlnicholas> 015 * 016 * For more information on the BioJava project and its aims, 017 * or to join the biojava-l mailing list, visit the home page 018 * at: 019 * 020 * http://www.biojava.org/ 021 * 022 * Created on 08-08-2013 023 * 024 */ 025package org.biojava.nbio.core.sequence.loader; 026 027import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 028import org.biojava.nbio.core.sequence.AccessionID; 029import org.biojava.nbio.core.sequence.DNASequence; 030import org.biojava.nbio.core.sequence.ProteinSequence; 031import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 032import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 033import org.biojava.nbio.core.sequence.compound.DNACompoundSet; 034import org.biojava.nbio.core.sequence.compound.NucleotideCompound; 035import org.biojava.nbio.core.sequence.features.*; 036import org.biojava.nbio.core.sequence.io.GenbankSequenceParser; 037import org.biojava.nbio.core.sequence.io.GenericGenbankHeaderParser; 038import org.biojava.nbio.core.sequence.template.AbstractSequence; 039import org.biojava.nbio.core.sequence.template.Compound; 040import org.biojava.nbio.core.sequence.template.CompoundSet; 041import org.slf4j.Logger; 042import org.slf4j.LoggerFactory; 043 044import java.io.*; 045import java.net.URL; 046import java.net.URLConnection; 047import java.util.ArrayList; 048import java.util.HashMap; 049import java.util.LinkedHashMap; 050 051/** 052 * @author Karl Nicholas <github:karlnicholas> 053 * @author Jacek Grzebyta <github:jgrzebyta> 054 */ 055public class GenbankProxySequenceReader<C extends Compound> extends StringProxySequenceReader<C> implements FeaturesKeyWordInterface, DatabaseReferenceInterface, FeatureRetriever { 056 057 private final static Logger logger = LoggerFactory.getLogger(GenbankProxySequenceReader.class); 058 059 private static final String eutilBaseURL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"; // 060 private String genbankDirectoryCache = null; 061 private GenbankSequenceParser<AbstractSequence<C>, C> genbankParser; 062 private GenericGenbankHeaderParser<AbstractSequence<C>, C> headerParser; 063 private String header; 064 private HashMap<String, ArrayList<AbstractFeature>> features; 065 066 067 /** 068 * 069 * @throws InterruptedException 070 * @throws IOException 071 * @throws CompoundNotFoundException 072 */ 073 public GenbankProxySequenceReader( 074 String genbankDirectoryCache, 075 String accessionID, 076 CompoundSet<C> compoundSet ) throws IOException, InterruptedException, CompoundNotFoundException { 077 078 setGenbankDirectoryCache(genbankDirectoryCache); 079 setCompoundSet(compoundSet); 080 081 String db = compoundSet instanceof AminoAcidCompoundSet ? "protein" : "nuccore"; 082 083 InputStream inStream = getBufferedInputStream(accessionID, db); 084 genbankParser = new GenbankSequenceParser<AbstractSequence<C>, C>(); 085 086 setContents(genbankParser.getSequence(new BufferedReader(new InputStreamReader(inStream)), 0)); 087 headerParser = genbankParser.getSequenceHeaderParser(); 088 header = genbankParser.getHeader(); 089 features = genbankParser.getFeatures(); 090 091 if (compoundSet.getClass().equals(AminoAcidCompoundSet.class)) { 092 if (!genbankParser.getCompoundType().equals(compoundSet)) { 093 logger.error("Declared compount type {} does not mach the real: {}", genbankParser.getCompoundType().toString(), compoundSet.toString()); 094 throw new IOException("Wrong declared compound type for: " + accessionID); 095 } 096 } 097 098 inStream.close(); 099 } 100 101 private BufferedInputStream getBufferedInputStream(String accessionID, String db) throws IOException, InterruptedException { 102 BufferedInputStream inStream = null; 103 if (genbankDirectoryCache != null && genbankDirectoryCache.length() > 0) { 104 File f = new File(genbankDirectoryCache + File.separatorChar + accessionID + ".gb"); 105 if (f.exists()) { 106 logger.debug("Reading: {}", f.toString()); 107 inStream = new BufferedInputStream(new FileInputStream(f)); 108 } else { 109 InputStream in = getEutilsInputStream(accessionID, db); 110 copyInputStreamToFile(in, f); 111 inStream = new BufferedInputStream(new FileInputStream(f)); 112 } 113 } else { 114 inStream = new BufferedInputStream(getEutilsInputStream(accessionID, db)); 115 } 116 return inStream; 117 } 118 119 private void copyInputStreamToFile(InputStream in, File f) throws IOException, InterruptedException { 120 FileOutputStream out = new FileOutputStream(f); 121 byte[] buffer = new byte[1024]; 122 int len = in.read(buffer); 123 while (len != -1) { 124 out.write(buffer, 0, len); 125 len = in.read(buffer); 126 if (Thread.interrupted()) { 127 in.close(); 128 out.close(); 129 throw new InterruptedException(); 130 } 131 } 132 in.close(); 133 out.close(); 134 } 135 136 private InputStream getEutilsInputStream(String accessionID, String db) throws IOException { 137 String genbankURL = eutilBaseURL + "efetch.fcgi?db=" + db + "&id=" + accessionID + "&rettype=gb&retmode=text"; 138 logger.trace("Loading: {}", genbankURL); 139 URL genbank = new URL(genbankURL); 140 URLConnection genbankConnection = genbank.openConnection(); 141 return genbankConnection.getInputStream(); 142 } 143 144 /** 145 * Local directory cache of Genbank that can be downloaded 146 * 147 * @return the uniprotDirectoryCache 148 */ 149 public String getGenbankDirectoryCache() { 150 return genbankDirectoryCache; 151 } 152 153 /** 154 * @param genbankDirectoryCache 155 */ 156 public void setGenbankDirectoryCache(String genbankDirectoryCache) { 157 if (genbankDirectoryCache != null) { 158 File f = new File(genbankDirectoryCache); 159 if (!f.exists()) { 160 f.mkdirs(); 161 } 162 } 163 this.genbankDirectoryCache = genbankDirectoryCache; 164 } 165 166 public String getHeader() { 167 return header; 168 } 169 170 public GenericGenbankHeaderParser<AbstractSequence<C>, C> getHeaderParser() { 171 return headerParser; 172 } 173 @Override 174 public HashMap<String, ArrayList<AbstractFeature>> getFeatures() { 175 return features; 176 } 177 178 @Override 179 public LinkedHashMap<String, ArrayList<DBReferenceInfo>> getDatabaseReferences() { 180 return genbankParser.getDatabaseReferences(); 181 } 182 183 @Override 184 public ArrayList<String> getKeyWords() { 185 return genbankParser.getKeyWords(); 186 } 187 188 public static void main(String[] args) throws Throwable { 189 190 GenbankProxySequenceReader<AminoAcidCompound> genbankProteinReader 191 = new GenbankProxySequenceReader<AminoAcidCompound>("/tmp", "NP_000257", AminoAcidCompoundSet.getAminoAcidCompoundSet()); 192 ProteinSequence proteinSequence = new ProteinSequence(genbankProteinReader); 193 genbankProteinReader.getHeaderParser().parseHeader(genbankProteinReader.getHeader(), proteinSequence); 194 logger.info("Sequence ({},{})={}...", proteinSequence.getAccession(), proteinSequence.getLength(), proteinSequence.getSequenceAsString().substring(0, 10)); 195 logger.info("Keywords: {}", genbankProteinReader.getKeyWords()); 196 logger.info("DatabaseReferences: {}", genbankProteinReader.getDatabaseReferences()); 197 proteinSequence.getFeatures(); 198 199 GenbankProxySequenceReader<NucleotideCompound> genbankDNAReader 200 = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "NM_001126", DNACompoundSet.getDNACompoundSet()); 201 DNASequence dnaSequence = new DNASequence(genbankDNAReader); 202 genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence); 203 dnaSequence.setAccession(new AccessionID("NM_001126")); 204 logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10)); 205 logger.info("Keywords: {}", genbankDNAReader.getKeyWords()); 206 logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences()); 207 208 genbankDNAReader 209 = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "NM_000266", DNACompoundSet.getDNACompoundSet()); 210 dnaSequence = new DNASequence(genbankDNAReader); 211 genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence); 212 logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10)); 213 logger.info("Keywords: {}", genbankDNAReader.getKeyWords()); 214 logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences()); 215 216 genbankDNAReader 217 = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "AV254721", DNACompoundSet.getDNACompoundSet()); 218 dnaSequence = new DNASequence(genbankDNAReader); 219 genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence); 220 logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10)); 221 logger.info("Keywords: {}", genbankDNAReader.getKeyWords()); 222 logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences()); 223 224 genbankDNAReader 225 = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "AV254721.2", DNACompoundSet.getDNACompoundSet()); 226 dnaSequence = new DNASequence(genbankDNAReader); 227 genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence); 228 logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10)); 229 logger.info("Keywords: {}", genbankDNAReader.getKeyWords()); 230 logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences()); 231 232 genbankDNAReader 233 = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "U49845", DNACompoundSet.getDNACompoundSet()); 234 dnaSequence = new DNASequence(genbankDNAReader); 235 genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence); 236 logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10)); 237 logger.info("Keywords: {}", genbankDNAReader.getKeyWords()); 238 logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences()); 239 240 genbankDNAReader 241 = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "GI:1293613", DNACompoundSet.getDNACompoundSet()); 242 dnaSequence = new DNASequence(genbankDNAReader); 243 genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence); 244 logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10)); 245 logger.info("Keywords: {}", genbankDNAReader.getKeyWords()); 246 logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences()); 247 248 genbankDNAReader 249 = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "14109166", DNACompoundSet.getDNACompoundSet()); 250 dnaSequence = new DNASequence(genbankDNAReader); 251 genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence); 252 logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10)); 253 logger.info("Keywords: {}", genbankDNAReader.getKeyWords()); 254 logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences()); 255 256 /* 257 GenbankProxySequenceReader genbankProxyReader = new GenbankProxySequenceReader("/tmp"); 258 Sequence<?> sequence; 259 260 sequence = genbankProxyReader.getDNASequence(new AccessionID("NM_001126")); 261 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 262 263 sequence = genbankProxyReader.getDNASequence(new AccessionID("NM_000266")); 264 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 265 266 sequence = genbankProxyReader.getProteinSequence(new AccessionID("NP_000257")); 267 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 268 269 sequence = genbankProxyReader.getProteinSequence(new AccessionID("AV254721")); 270 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 271 272 sequence = genbankProxyReader.getProteinSequence(new AccessionID("AV254721.2")); 273 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 274 275 sequence = genbankProxyReader.getProteinSequence(new AccessionID("U49845")); 276 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 277 278 sequence = genbankProxyReader.getProteinSequence(new AccessionID("GI:1293613")); 279 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 280 281 sequence = genbankProxyReader.getProteinSequence(new AccessionID("14109166")); 282 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "..."); 283 */ 284 } 285}