/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * @author Karl Nicholas <github:karlnicholas>
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on 08-08-2013
 *
 */
025package org.biojava.nbio.core.sequence.loader;
026
027import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
028import org.biojava.nbio.core.sequence.AccessionID;
029import org.biojava.nbio.core.sequence.DNASequence;
030import org.biojava.nbio.core.sequence.ProteinSequence;
031import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
032import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
033import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
034import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
035import org.biojava.nbio.core.sequence.features.*;
036import org.biojava.nbio.core.sequence.io.GenbankSequenceParser;
037import org.biojava.nbio.core.sequence.io.GenericGenbankHeaderParser;
038import org.biojava.nbio.core.sequence.template.AbstractSequence;
039import org.biojava.nbio.core.sequence.template.Compound;
040import org.biojava.nbio.core.sequence.template.CompoundSet;
041import org.slf4j.Logger;
042import org.slf4j.LoggerFactory;
043
044import java.io.*;
045import java.net.URL;
046import java.net.URLConnection;
047import java.util.ArrayList;
048import java.util.HashMap;
049import java.util.LinkedHashMap;
050
051/**
052 * @author Karl Nicholas <github:karlnicholas>
053 * @author Jacek Grzebyta <github:jgrzebyta>
054 */
055public class GenbankProxySequenceReader<C extends Compound> extends StringProxySequenceReader<C> implements FeaturesKeyWordInterface, DatabaseReferenceInterface, FeatureRetriever {
056
057        private final static Logger logger = LoggerFactory.getLogger(GenbankProxySequenceReader.class);
058
059        private static final String eutilBaseURL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"; //
060        private String genbankDirectoryCache = null;
061        private GenbankSequenceParser<AbstractSequence<C>, C> genbankParser;
062        private GenericGenbankHeaderParser<AbstractSequence<C>, C> headerParser;
063        private String header;
064        private HashMap<String, ArrayList<AbstractFeature>> features;
065
066
067        /**
068         *
069         * @throws InterruptedException
070         * @throws IOException
071         * @throws CompoundNotFoundException
072         */
073        public GenbankProxySequenceReader(
074                        String genbankDirectoryCache,
075                        String accessionID,
076                        CompoundSet<C> compoundSet ) throws IOException, InterruptedException, CompoundNotFoundException {
077
078                setGenbankDirectoryCache(genbankDirectoryCache);
079                setCompoundSet(compoundSet);
080
081                String db = compoundSet instanceof AminoAcidCompoundSet ? "protein" : "nuccore";
082
083                InputStream inStream = getBufferedInputStream(accessionID, db);
084                genbankParser = new GenbankSequenceParser<AbstractSequence<C>, C>();
085
086                setContents(genbankParser.getSequence(new BufferedReader(new InputStreamReader(inStream)), 0));
087                headerParser = genbankParser.getSequenceHeaderParser();
088                header = genbankParser.getHeader();
089                features = genbankParser.getFeatures();
090
091                if (compoundSet.getClass().equals(AminoAcidCompoundSet.class)) {
092                        if (!genbankParser.getCompoundType().equals(compoundSet)) {
093                                logger.error("Declared compount type {} does not mach the real: {}", genbankParser.getCompoundType().toString(), compoundSet.toString());
094                                throw new IOException("Wrong declared compound type for: " + accessionID);
095                        }
096                }
097
098                inStream.close();
099        }
100
101        private BufferedInputStream getBufferedInputStream(String accessionID, String db) throws IOException, InterruptedException {
102                BufferedInputStream inStream = null;
103                if (genbankDirectoryCache != null && genbankDirectoryCache.length() > 0) {
104                        File f = new File(genbankDirectoryCache + File.separatorChar + accessionID + ".gb");
105                        if (f.exists()) {
106                                logger.debug("Reading: {}", f.toString());
107                                inStream = new BufferedInputStream(new FileInputStream(f));
108                        } else {
109                                InputStream in = getEutilsInputStream(accessionID, db);
110                                copyInputStreamToFile(in, f);
111                                inStream = new BufferedInputStream(new FileInputStream(f));
112                        }
113                } else {
114                        inStream = new BufferedInputStream(getEutilsInputStream(accessionID, db));
115                }
116                return inStream;
117        }
118
119        private void copyInputStreamToFile(InputStream in, File f) throws IOException, InterruptedException {
120                FileOutputStream out = new FileOutputStream(f);
121                byte[] buffer = new byte[1024];
122                int len = in.read(buffer);
123                while (len != -1) {
124                        out.write(buffer, 0, len);
125                        len = in.read(buffer);
126                        if (Thread.interrupted()) {
127                                in.close();
128                                out.close();
129                                throw new InterruptedException();
130                        }
131                }
132                in.close();
133                out.close();
134        }
135
136        private InputStream getEutilsInputStream(String accessionID, String db) throws IOException {
137                String genbankURL = eutilBaseURL + "efetch.fcgi?db=" + db + "&id=" + accessionID + "&rettype=gb&retmode=text";
138                logger.trace("Loading: {}", genbankURL);
139                URL genbank = new URL(genbankURL);
140                URLConnection genbankConnection = genbank.openConnection();
141                return genbankConnection.getInputStream();
142        }
143
144        /**
145         * Local directory cache of Genbank that can be downloaded
146         *
147         * @return the uniprotDirectoryCache
148         */
149        public String getGenbankDirectoryCache() {
150                return genbankDirectoryCache;
151        }
152
153        /**
154         * @param genbankDirectoryCache
155         */
156        public void setGenbankDirectoryCache(String genbankDirectoryCache) {
157                if (genbankDirectoryCache != null) {
158                        File f = new File(genbankDirectoryCache);
159                        if (!f.exists()) {
160                                f.mkdirs();
161                        }
162                }
163                this.genbankDirectoryCache = genbankDirectoryCache;
164        }
165
166        public String getHeader() {
167                return header;
168        }
169
170        public GenericGenbankHeaderParser<AbstractSequence<C>, C> getHeaderParser() {
171                return headerParser;
172        }
173        @Override
174        public HashMap<String, ArrayList<AbstractFeature>> getFeatures() {
175                return features;
176        }
177
178        @Override
179        public LinkedHashMap<String, ArrayList<DBReferenceInfo>> getDatabaseReferences() {
180                return genbankParser.getDatabaseReferences();
181        }
182
183        @Override
184        public ArrayList<String> getKeyWords() {
185                return genbankParser.getKeyWords();
186        }
187
188        public static void main(String[] args) throws Throwable {
189
190                GenbankProxySequenceReader<AminoAcidCompound> genbankProteinReader
191                                = new GenbankProxySequenceReader<AminoAcidCompound>("/tmp", "NP_000257", AminoAcidCompoundSet.getAminoAcidCompoundSet());
192                ProteinSequence proteinSequence = new ProteinSequence(genbankProteinReader);
193                genbankProteinReader.getHeaderParser().parseHeader(genbankProteinReader.getHeader(), proteinSequence);
194                logger.info("Sequence ({},{})={}...", proteinSequence.getAccession(), proteinSequence.getLength(), proteinSequence.getSequenceAsString().substring(0, 10));
195                logger.info("Keywords: {}", genbankProteinReader.getKeyWords());
196                logger.info("DatabaseReferences: {}", genbankProteinReader.getDatabaseReferences());
197                proteinSequence.getFeatures();
198
199                GenbankProxySequenceReader<NucleotideCompound> genbankDNAReader
200                                = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "NM_001126", DNACompoundSet.getDNACompoundSet());
201                DNASequence dnaSequence = new DNASequence(genbankDNAReader);
202                genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence);
203                dnaSequence.setAccession(new AccessionID("NM_001126"));
204                logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10));
205                logger.info("Keywords: {}", genbankDNAReader.getKeyWords());
206                logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences());
207
208                genbankDNAReader
209                                = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "NM_000266", DNACompoundSet.getDNACompoundSet());
210                dnaSequence = new DNASequence(genbankDNAReader);
211                genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence);
212                logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10));
213                logger.info("Keywords: {}", genbankDNAReader.getKeyWords());
214                logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences());
215
216                genbankDNAReader
217                                = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "AV254721", DNACompoundSet.getDNACompoundSet());
218                dnaSequence = new DNASequence(genbankDNAReader);
219                genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence);
220                logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10));
221                logger.info("Keywords: {}", genbankDNAReader.getKeyWords());
222                logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences());
223
224                genbankDNAReader
225                                = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "AV254721.2", DNACompoundSet.getDNACompoundSet());
226                dnaSequence = new DNASequence(genbankDNAReader);
227                genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence);
228                logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10));
229                logger.info("Keywords: {}", genbankDNAReader.getKeyWords());
230                logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences());
231
232                genbankDNAReader
233                                = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "U49845", DNACompoundSet.getDNACompoundSet());
234                dnaSequence = new DNASequence(genbankDNAReader);
235                genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence);
236                logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10));
237                logger.info("Keywords: {}", genbankDNAReader.getKeyWords());
238                logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences());
239
240                genbankDNAReader
241                                = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "GI:1293613", DNACompoundSet.getDNACompoundSet());
242                dnaSequence = new DNASequence(genbankDNAReader);
243                genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence);
244                logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10));
245                logger.info("Keywords: {}", genbankDNAReader.getKeyWords());
246                logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences());
247
248                genbankDNAReader
249                                = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "14109166", DNACompoundSet.getDNACompoundSet());
250                dnaSequence = new DNASequence(genbankDNAReader);
251                genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence);
252                logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10));
253                logger.info("Keywords: {}", genbankDNAReader.getKeyWords());
254                logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences());
255
256                /*
257                 GenbankProxySequenceReader genbankProxyReader = new GenbankProxySequenceReader("/tmp");
258                 Sequence<?> sequence;
259
260                 sequence = genbankProxyReader.getDNASequence(new AccessionID("NM_001126"));
261                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
262
263                 sequence = genbankProxyReader.getDNASequence(new AccessionID("NM_000266"));
264                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
265
266                 sequence = genbankProxyReader.getProteinSequence(new AccessionID("NP_000257"));
267                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
268
269                 sequence = genbankProxyReader.getProteinSequence(new AccessionID("AV254721"));
270                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
271
272                 sequence = genbankProxyReader.getProteinSequence(new AccessionID("AV254721.2"));
273                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
274
275                 sequence = genbankProxyReader.getProteinSequence(new AccessionID("U49845"));
276                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
277
278                 sequence = genbankProxyReader.getProteinSequence(new AccessionID("GI:1293613"));
279                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
280
281                 sequence = genbankProxyReader.getProteinSequence(new AccessionID("14109166"));
282                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
283                 */
284        }
285}