001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * @author Karl Nicholas <github:karlnicholas>
015 *
016 * For more information on the BioJava project and its aims,
017 * or to join the biojava-l mailing list, visit the home page
018 * at:
019 *
020 *      http://www.biojava.org/
021 *
022 * Created on 08-08-2013
023 *
024 */
025package org.biojava.nbio.core.sequence.loader;
026
027import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
028import org.biojava.nbio.core.sequence.AccessionID;
029import org.biojava.nbio.core.sequence.DNASequence;
030import org.biojava.nbio.core.sequence.ProteinSequence;
031import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
032import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
033import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
034import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
035import org.biojava.nbio.core.sequence.features.*;
036import org.biojava.nbio.core.sequence.io.GenbankSequenceParser;
037import org.biojava.nbio.core.sequence.io.GenericGenbankHeaderParser;
038import org.biojava.nbio.core.sequence.template.AbstractSequence;
039import org.biojava.nbio.core.sequence.template.Compound;
040import org.biojava.nbio.core.sequence.template.CompoundSet;
041import org.slf4j.Logger;
042import org.slf4j.LoggerFactory;
043
044import java.io.*;
045import java.net.URL;
046import java.net.URLConnection;
047import java.util.List;
048import java.util.Map;
049
050/**
051 * @author Karl Nicholas <github:karlnicholas>
052 * @author Jacek Grzebyta <github:jgrzebyta>
053 */
054public class GenbankProxySequenceReader<C extends Compound> extends StringProxySequenceReader<C> implements FeaturesKeyWordInterface, DatabaseReferenceInterface, FeatureRetriever {
055
056        private static final Logger logger = LoggerFactory.getLogger(GenbankProxySequenceReader.class);
057
058        private static final String eutilBaseURL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"; //
059        private String genbankDirectoryCache = null;
060        private GenbankSequenceParser<AbstractSequence<C>, C> genbankParser;
061        private GenericGenbankHeaderParser<AbstractSequence<C>, C> headerParser;
062        private String header;
063        private Map<String, List<AbstractFeature<AbstractSequence<C>, C>>> features;
064
065
066        /**
067         *
068         * @throws InterruptedException
069         * @throws IOException
070         * @throws CompoundNotFoundException
071         */
072        public GenbankProxySequenceReader(
073                        String genbankDirectoryCache,
074                        String accessionID,
075                        CompoundSet<C> compoundSet ) throws IOException, InterruptedException, CompoundNotFoundException {
076
077                setGenbankDirectoryCache(genbankDirectoryCache);
078                setCompoundSet(compoundSet);
079
080                String db = compoundSet instanceof AminoAcidCompoundSet ? "protein" : "nuccore";
081
082                InputStream inStream = getBufferedInputStream(accessionID, db);
083                genbankParser = new GenbankSequenceParser<AbstractSequence<C>, C>();
084
085                setContents(genbankParser.getSequence(new BufferedReader(new InputStreamReader(inStream)), 0));
086                headerParser = genbankParser.getSequenceHeaderParser();
087                header = genbankParser.getHeader();
088                features = genbankParser.getFeatures();
089
090                if (compoundSet.getClass().equals(AminoAcidCompoundSet.class)) {
091                        if (!genbankParser.getCompoundType().equals(compoundSet)) {
092                                logger.error("Declared compount type {} does not mach the real: {}", genbankParser.getCompoundType().toString(), compoundSet.toString());
093                                throw new IOException("Wrong declared compound type for: " + accessionID);
094                        }
095                }
096
097                inStream.close();
098        }
099
100        private BufferedInputStream getBufferedInputStream(String accessionID, String db) throws IOException, InterruptedException {
101                BufferedInputStream inStream = null;
102                if (genbankDirectoryCache != null && genbankDirectoryCache.length() > 0) {
103                        File f = new File(genbankDirectoryCache + File.separatorChar + accessionID + ".gb");
104                        if (f.exists()) {
105                                logger.debug("Reading: {}", f.toString());
106                                inStream = new BufferedInputStream(new FileInputStream(f));
107                        } else {
108                                InputStream in = getEutilsInputStream(accessionID, db);
109                                copyInputStreamToFile(in, f);
110                                inStream = new BufferedInputStream(new FileInputStream(f));
111                        }
112                } else {
113                        inStream = new BufferedInputStream(getEutilsInputStream(accessionID, db));
114                }
115                return inStream;
116        }
117
118        private void copyInputStreamToFile(InputStream in, File f) throws IOException, InterruptedException {
119                FileOutputStream out = new FileOutputStream(f);
120                byte[] buffer = new byte[1024];
121                int len = in.read(buffer);
122                while (len != -1) {
123                        out.write(buffer, 0, len);
124                        len = in.read(buffer);
125                        if (Thread.interrupted()) {
126                                in.close();
127                                out.close();
128                                throw new InterruptedException();
129                        }
130                }
131                in.close();
132                out.close();
133        }
134
135        private InputStream getEutilsInputStream(String accessionID, String db) throws IOException {
136                String genbankURL = eutilBaseURL + "efetch.fcgi?db=" + db + "&id=" + accessionID + "&rettype=gb&retmode=text";
137                logger.trace("Loading: {}", genbankURL);
138                URL genbank = new URL(genbankURL);
139                URLConnection genbankConnection = genbank.openConnection();
140                return genbankConnection.getInputStream();
141        }
142
143        /**
144         * Local directory cache of Genbank that can be downloaded
145         *
146         * @return the uniprotDirectoryCache
147         */
148        public String getGenbankDirectoryCache() {
149                return genbankDirectoryCache;
150        }
151
152        /**
153         * @param genbankDirectoryCache
154         */
155        public void setGenbankDirectoryCache(String genbankDirectoryCache) {
156                if (genbankDirectoryCache != null) {
157                        File f = new File(genbankDirectoryCache);
158                        if (!f.exists()) {
159                                f.mkdirs();
160                        }
161                }
162                this.genbankDirectoryCache = genbankDirectoryCache;
163        }
164
165        public String getHeader() {
166                return header;
167        }
168
169        public GenericGenbankHeaderParser<AbstractSequence<C>, C> getHeaderParser() {
170                return headerParser;
171        }
172        @Override
173        public Map<String, List<AbstractFeature<AbstractSequence<C>, C>>> getFeatures() {
174                return features;
175        }
176
177        @Override
178        public Map<String, List<DBReferenceInfo>> getDatabaseReferences() {
179                return genbankParser.getDatabaseReferences();
180        }
181
182        @Override
183        public List<String> getKeyWords() {
184                return genbankParser.getKeyWords();
185        }
186
187        public static void main(String[] args) throws Throwable {
188
189                GenbankProxySequenceReader<AminoAcidCompound> genbankProteinReader
190                                = new GenbankProxySequenceReader<AminoAcidCompound>("/tmp", "NP_000257", AminoAcidCompoundSet.getAminoAcidCompoundSet());
191                ProteinSequence proteinSequence = new ProteinSequence(genbankProteinReader);
192                genbankProteinReader.getHeaderParser().parseHeader(genbankProteinReader.getHeader(), proteinSequence);
193                logger.info("Sequence ({},{})={}...", proteinSequence.getAccession(), proteinSequence.getLength(), proteinSequence.getSequenceAsString().substring(0, 10));
194                logger.info("Keywords: {}", genbankProteinReader.getKeyWords());
195                logger.info("DatabaseReferences: {}", genbankProteinReader.getDatabaseReferences());
196                proteinSequence.getFeatures();
197
198                GenbankProxySequenceReader<NucleotideCompound> genbankDNAReader
199                                = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "NM_001126", DNACompoundSet.getDNACompoundSet());
200                DNASequence dnaSequence = new DNASequence(genbankDNAReader);
201                genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence);
202                dnaSequence.setAccession(new AccessionID("NM_001126"));
203                logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10));
204                logger.info("Keywords: {}", genbankDNAReader.getKeyWords());
205                logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences());
206
207                genbankDNAReader
208                                = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "NM_000266", DNACompoundSet.getDNACompoundSet());
209                dnaSequence = new DNASequence(genbankDNAReader);
210                genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence);
211                logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10));
212                logger.info("Keywords: {}", genbankDNAReader.getKeyWords());
213                logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences());
214
215                genbankDNAReader
216                                = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "AV254721", DNACompoundSet.getDNACompoundSet());
217                dnaSequence = new DNASequence(genbankDNAReader);
218                genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence);
219                logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10));
220                logger.info("Keywords: {}", genbankDNAReader.getKeyWords());
221                logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences());
222
223                genbankDNAReader
224                                = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "AV254721.2", DNACompoundSet.getDNACompoundSet());
225                dnaSequence = new DNASequence(genbankDNAReader);
226                genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence);
227                logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10));
228                logger.info("Keywords: {}", genbankDNAReader.getKeyWords());
229                logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences());
230
231                genbankDNAReader
232                                = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "U49845", DNACompoundSet.getDNACompoundSet());
233                dnaSequence = new DNASequence(genbankDNAReader);
234                genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence);
235                logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10));
236                logger.info("Keywords: {}", genbankDNAReader.getKeyWords());
237                logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences());
238
239                genbankDNAReader
240                                = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "GI:1293613", DNACompoundSet.getDNACompoundSet());
241                dnaSequence = new DNASequence(genbankDNAReader);
242                genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence);
243                logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10));
244                logger.info("Keywords: {}", genbankDNAReader.getKeyWords());
245                logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences());
246
247                genbankDNAReader
248                                = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "14109166", DNACompoundSet.getDNACompoundSet());
249                dnaSequence = new DNASequence(genbankDNAReader);
250                genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence);
251                logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10));
252                logger.info("Keywords: {}", genbankDNAReader.getKeyWords());
253                logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences());
254
255                /*
256                 GenbankProxySequenceReader genbankProxyReader = new GenbankProxySequenceReader("/tmp");
257                 Sequence<?> sequence;
258
259                 sequence = genbankProxyReader.getDNASequence(new AccessionID("NM_001126"));
260                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
261
262                 sequence = genbankProxyReader.getDNASequence(new AccessionID("NM_000266"));
263                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
264
265                 sequence = genbankProxyReader.getProteinSequence(new AccessionID("NP_000257"));
266                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
267
268                 sequence = genbankProxyReader.getProteinSequence(new AccessionID("AV254721"));
269                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
270
271                 sequence = genbankProxyReader.getProteinSequence(new AccessionID("AV254721.2"));
272                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
273
274                 sequence = genbankProxyReader.getProteinSequence(new AccessionID("U49845"));
275                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
276
277                 sequence = genbankProxyReader.getProteinSequence(new AccessionID("GI:1293613"));
278                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
279
280                 sequence = genbankProxyReader.getProteinSequence(new AccessionID("14109166"));
281                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
282                 */
283        }
284}