/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * @author Karl Nicholas <github:karlnicholas>
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on 08-08-2013
 *
 */
package org.biojava.nbio.core.sequence.loader;

import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava.nbio.core.sequence.features.*;
import org.biojava.nbio.core.sequence.io.GenbankSequenceParser;
import org.biojava.nbio.core.sequence.io.GenericGenbankHeaderParser;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.Compound;
import org.biojava.nbio.core.sequence.template.CompoundSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Map;

/**
 * Sequence reader that obtains a GenBank flat-file record for an accession ID,
 * either from a local cache directory or from the NCBI E-utilities
 * {@code efetch} service, and exposes the parsed sequence, header, features,
 * database references and keywords.
 *
 * @author Karl Nicholas <github:karlnicholas>
 * @author Jacek Grzebyta <github:jgrzebyta>
 */
public class GenbankProxySequenceReader<C extends Compound> extends StringProxySequenceReader<C> implements FeaturesKeyWordInterface, DatabaseReferenceInterface, FeatureRetriever {

    private static final Logger logger = LoggerFactory.getLogger(GenbankProxySequenceReader.class);

    private static final String eutilBaseURL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/";
    private String genbankDirectoryCache = null;
    private GenbankSequenceParser<AbstractSequence<C>, C> genbankParser;
    private GenericGenbankHeaderParser<AbstractSequence<C>, C> headerParser;
    private String header;
    private Map<String, List<AbstractFeature<AbstractSequence<C>, C>>> features;


    /**
     * Loads and parses the GenBank record for {@code accessionID}.
     * <p>
     * The record is read from {@code <genbankDirectoryCache>/<accessionID>.gb}
     * when a cache directory is configured and that file exists; otherwise it
     * is fetched from E-utilities (and stored in the cache directory when one
     * is configured). The E-utilities database is {@code protein} when
     * {@code compoundSet} is an {@link AminoAcidCompoundSet} and
     * {@code nuccore} otherwise.
     *
     * @param genbankDirectoryCache local cache directory, or {@code null}/empty to disable caching
     * @param accessionID           accession of the record to load
     * @param compoundSet           compound set used to interpret the sequence
     * @throws InterruptedException      if the thread is interrupted while the record is
     *                                   being downloaded into the cache
     * @throws IOException               if the record cannot be read, or if an amino-acid
     *                                   compound set was supplied but the parser reports a
     *                                   different compound type
     * @throws CompoundNotFoundException if the parsed sequence cannot be set as contents
     */
    public GenbankProxySequenceReader(
            String genbankDirectoryCache,
            String accessionID,
            CompoundSet<C> compoundSet) throws IOException, InterruptedException, CompoundNotFoundException {

        setGenbankDirectoryCache(genbankDirectoryCache);
        setCompoundSet(compoundSet);

        String db = compoundSet instanceof AminoAcidCompoundSet ? "protein" : "nuccore";

        genbankParser = new GenbankSequenceParser<>();

        // try-with-resources guarantees the underlying file/network stream is
        // released even when parsing or the compound-type check below fails.
        try (InputStream inStream = getBufferedInputStream(accessionID, db);
             BufferedReader reader = new BufferedReader(new InputStreamReader(inStream))) {

            setContents(genbankParser.getSequence(reader, 0));
            headerParser = genbankParser.getSequenceHeaderParser();
            header = genbankParser.getHeader();
            features = genbankParser.getFeatures();

            if (compoundSet.getClass().equals(AminoAcidCompoundSet.class)) {
                if (!genbankParser.getCompoundType().equals(compoundSet)) {
                    logger.error("Declared compount type {} does not mach the real: {}", genbankParser.getCompoundType().toString(), compoundSet.toString());
                    throw new IOException("Wrong declared compound type for: " + accessionID);
                }
            }
        }
    }

    /**
     * Opens a buffered stream over the GenBank record for {@code accessionID}.
     * With a cache directory configured the stream always reads from the cache
     * file, downloading it first when it does not exist yet; without one the
     * stream reads directly from E-utilities.
     *
     * @param accessionID accession of the record
     * @param db          E-utilities database name
     * @return a buffered stream positioned at the start of the record; the caller must close it
     * @throws IOException          if the cache file or remote resource cannot be opened or copied
     * @throws InterruptedException if the thread is interrupted while filling the cache
     */
    private BufferedInputStream getBufferedInputStream(String accessionID, String db) throws IOException, InterruptedException {
        if (genbankDirectoryCache == null || genbankDirectoryCache.length() == 0) {
            return new BufferedInputStream(getEutilsInputStream(accessionID, db));
        }

        File f = new File(genbankDirectoryCache + File.separatorChar + accessionID + ".gb");
        if (f.exists()) {
            logger.debug("Reading: {}", f.toString());
        } else {
            InputStream in = getEutilsInputStream(accessionID, db);
            copyInputStreamToFile(in, f);
        }
        return new BufferedInputStream(new FileInputStream(f));
    }

    /**
     * Copies {@code in} to file {@code f} and closes {@code in}.
     * <p>
     * If the copy does not complete (I/O failure or thread interruption) the
     * partially written file is removed, so that a later lookup does not
     * mistake a truncated download for a valid cached record.
     *
     * @param in stream to copy; always closed by this method
     * @param f  destination file
     * @throws IOException          if reading or writing fails
     * @throws InterruptedException if the thread's interrupt flag is found set during the copy
     */
    private void copyInputStreamToFile(InputStream in, File f) throws IOException, InterruptedException {
        boolean complete = false;
        // "source" is declared first so that it is closed even when the
        // FileOutputStream cannot be created.
        try (InputStream source = in; FileOutputStream out = new FileOutputStream(f)) {
            byte[] buffer = new byte[1024];
            int len = source.read(buffer);
            while (len != -1) {
                out.write(buffer, 0, len);
                len = source.read(buffer);
                if (Thread.interrupted()) {
                    throw new InterruptedException();
                }
            }
            complete = true;
        } finally {
            // Both streams are already closed here, so the file can be deleted.
            if (!complete && f.exists() && !f.delete()) {
                logger.warn("Could not delete incomplete cache file: {}", f);
            }
        }
    }

    /**
     * Opens a connection to the E-utilities {@code efetch} endpoint requesting
     * the record in GenBank text format.
     *
     * @param accessionID accession placed in the {@code id} query parameter
     * @param db          value of the {@code db} query parameter
     * @return the response stream; the caller must close it
     * @throws IOException if the URL is malformed or the connection cannot be opened
     */
    private InputStream getEutilsInputStream(String accessionID, String db) throws IOException {
        String genbankURL = eutilBaseURL + "efetch.fcgi?db=" + db + "&id=" + accessionID + "&rettype=gb&retmode=text";
        logger.trace("Loading: {}", genbankURL);
        URL genbank = new URL(genbankURL);
        URLConnection genbankConnection = genbank.openConnection();
        return genbankConnection.getInputStream();
    }

    /**
     * Local directory cache of Genbank that can be downloaded
     *
     * @return the genbankDirectoryCache, or {@code null} if none was set
     */
    public String getGenbankDirectoryCache() {
        return genbankDirectoryCache;
    }

    /**
     * Sets the local cache directory, creating it (including missing parents)
     * when it does not exist. Failure to create the directory is logged but
     * does not prevent the value from being stored.
     *
     * @param genbankDirectoryCache path of the cache directory, or {@code null} for none
     */
    public void setGenbankDirectoryCache(String genbankDirectoryCache) {
        if (genbankDirectoryCache != null) {
            File f = new File(genbankDirectoryCache);
            if (!f.exists() && !f.mkdirs()) {
                logger.warn("Could not create Genbank cache directory: {}", f);
            }
        }
        this.genbankDirectoryCache = genbankDirectoryCache;
    }

    /**
     * @return the header obtained from the Genbank parser during construction
     */
    public String getHeader() {
        return header;
    }

    /**
     * @return the header parser obtained from the Genbank parser during construction
     */
    public GenericGenbankHeaderParser<AbstractSequence<C>, C> getHeaderParser() {
        return headerParser;
    }

    /**
     * @return the features obtained from the Genbank parser during construction
     */
    @Override
    public Map<String, List<AbstractFeature<AbstractSequence<C>, C>>> getFeatures() {
        return features;
    }

    /**
     * @return the database references reported by the Genbank parser
     */
    @Override
    public Map<String, List<DBReferenceInfo>> getDatabaseReferences() {
        return genbankParser.getDatabaseReferences();
    }

    /**
     * @return the keywords reported by the Genbank parser
     */
    @Override
    public List<String> getKeyWords() {
        return genbankParser.getKeyWords();
    }

}