001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * @author Karl Nicholas <github:karlnicholas>
015 *
016 * For more information on the BioJava project and its aims,
017 * or to join the biojava-l mailing list, visit the home page
018 * at:
019 *
020 *      http://www.biojava.org/
021 *
022 * Created on 08-08-2013
023 *
024 */
025package org.biojava.nbio.core.sequence.loader;
026
027import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
028import org.biojava.nbio.core.sequence.AccessionID;
029import org.biojava.nbio.core.sequence.DNASequence;
030import org.biojava.nbio.core.sequence.ProteinSequence;
031import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
032import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
033import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
034import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
035import org.biojava.nbio.core.sequence.features.AbstractFeature;
036import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
037import org.biojava.nbio.core.sequence.features.DatabaseReferenceInterface;
038import org.biojava.nbio.core.sequence.features.FeatureRetriever;
039import org.biojava.nbio.core.sequence.features.FeaturesKeyWordInterface;
040import org.biojava.nbio.core.sequence.io.GenbankSequenceParser;
041import org.biojava.nbio.core.sequence.io.GenericGenbankHeaderParser;
042import org.biojava.nbio.core.sequence.template.AbstractSequence;
043import org.biojava.nbio.core.sequence.template.Compound;
044import org.biojava.nbio.core.sequence.template.CompoundSet;
045import org.slf4j.Logger;
046import org.slf4j.LoggerFactory;
047
048import java.io.BufferedInputStream;
049import java.io.BufferedReader;
050import java.io.File;
051import java.io.FileInputStream;
052import java.io.FileOutputStream;
053import java.io.IOException;
054import java.io.InputStream;
055import java.io.InputStreamReader;
056import java.net.URL;
057import java.net.URLConnection;
058import java.util.ArrayList;
059import java.util.HashMap;
060import java.util.LinkedHashMap;
061
062/**
063 * @author Karl Nicholas <github:karlnicholas>
064 * @author Jacek Grzebyta <github:jgrzebyta>
065 */
066public class GenbankProxySequenceReader<C extends Compound> extends StringProxySequenceReader<C> implements FeaturesKeyWordInterface, DatabaseReferenceInterface, FeatureRetriever {
067
068        private static final Logger logger = LoggerFactory.getLogger(GenbankProxySequenceReader.class);
069
070        private static final String eutilBaseURL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"; //
071        private String genbankDirectoryCache = null;
072        private GenbankSequenceParser<AbstractSequence<C>, C> genbankParser;
073        private GenericGenbankHeaderParser<AbstractSequence<C>, C> headerParser;
074        private String header;
075        private HashMap<String, ArrayList<AbstractFeature>> features;
076
077
078        /**
079         *
080         * @throws InterruptedException
081         * @throws IOException
082         * @throws CompoundNotFoundException
083         */
084        public GenbankProxySequenceReader(
085                        String genbankDirectoryCache,
086                        String accessionID,
087                        CompoundSet<C> compoundSet ) throws IOException, InterruptedException, CompoundNotFoundException {
088
089                setGenbankDirectoryCache(genbankDirectoryCache);
090                setCompoundSet(compoundSet);
091
092                String db = compoundSet instanceof AminoAcidCompoundSet ? "protein" : "nuccore";
093
094                InputStream inStream = getBufferedInputStream(accessionID, db);
095                genbankParser = new GenbankSequenceParser<AbstractSequence<C>, C>();
096
097                setContents(genbankParser.getSequence(new BufferedReader(new InputStreamReader(inStream)), 0));
098                headerParser = genbankParser.getSequenceHeaderParser();
099                header = genbankParser.getHeader();
100                features = genbankParser.getFeatures();
101
102                if (compoundSet.getClass().equals(AminoAcidCompoundSet.class)) {
103                        if (!genbankParser.getCompoundType().equals(compoundSet)) {
104                                logger.error("Declared compount type {} does not mach the real: {}", genbankParser.getCompoundType().toString(), compoundSet.toString());
105                                throw new IOException("Wrong declared compound type for: " + accessionID);
106                        }
107                }
108
109                inStream.close();
110        }
111
112        private BufferedInputStream getBufferedInputStream(String accessionID, String db) throws IOException, InterruptedException {
113                BufferedInputStream inStream = null;
114                if (genbankDirectoryCache != null && genbankDirectoryCache.length() > 0) {
115                        File f = new File(genbankDirectoryCache + File.separatorChar + accessionID + ".gb");
116                        if (f.exists()) {
117                                logger.debug("Reading: {}", f.toString());
118                                inStream = new BufferedInputStream(new FileInputStream(f));
119                        } else {
120                                InputStream in = getEutilsInputStream(accessionID, db);
121                                copyInputStreamToFile(in, f);
122                                inStream = new BufferedInputStream(new FileInputStream(f));
123                        }
124                } else {
125                        inStream = new BufferedInputStream(getEutilsInputStream(accessionID, db));
126                }
127                return inStream;
128        }
129
130        private void copyInputStreamToFile(InputStream in, File f) throws IOException, InterruptedException {
131                FileOutputStream out = new FileOutputStream(f);
132                byte[] buffer = new byte[1024];
133                int len = in.read(buffer);
134                while (len != -1) {
135                        out.write(buffer, 0, len);
136                        len = in.read(buffer);
137                        if (Thread.interrupted()) {
138                                in.close();
139                                out.close();
140                                throw new InterruptedException();
141                        }
142                }
143                in.close();
144                out.close();
145        }
146
147        private InputStream getEutilsInputStream(String accessionID, String db) throws IOException {
148                String genbankURL = eutilBaseURL + "efetch.fcgi?db=" + db + "&id=" + accessionID + "&rettype=gb&retmode=text";
149                logger.trace("Loading: {}", genbankURL);
150                URL genbank = new URL(genbankURL);
151                URLConnection genbankConnection = genbank.openConnection();
152                return genbankConnection.getInputStream();
153        }
154
155        /**
156         * Local directory cache of Genbank that can be downloaded
157         *
158         * @return the uniprotDirectoryCache
159         */
160        public String getGenbankDirectoryCache() {
161                return genbankDirectoryCache;
162        }
163
164        /**
165         * @param genbankDirectoryCache
166         */
167        public void setGenbankDirectoryCache(String genbankDirectoryCache) {
168                if (genbankDirectoryCache != null) {
169                        File f = new File(genbankDirectoryCache);
170                        if (!f.exists()) {
171                                f.mkdirs();
172                        }
173                }
174                this.genbankDirectoryCache = genbankDirectoryCache;
175        }
176
177        public String getHeader() {
178                return header;
179        }
180
181        public GenericGenbankHeaderParser<AbstractSequence<C>, C> getHeaderParser() {
182                return headerParser;
183        }
184        @Override
185        public HashMap<String, ArrayList<AbstractFeature>> getFeatures() {
186                return features;
187        }
188
189        @Override
190        public LinkedHashMap<String, ArrayList<DBReferenceInfo>> getDatabaseReferences() {
191                return genbankParser.getDatabaseReferences();
192        }
193
194        @Override
195        public ArrayList<String> getKeyWords() {
196                return genbankParser.getKeyWords();
197        }
198
199        public static void main(String[] args) throws Throwable {
200
201                GenbankProxySequenceReader<AminoAcidCompound> genbankProteinReader
202                                = new GenbankProxySequenceReader<AminoAcidCompound>("/tmp", "NP_000257", AminoAcidCompoundSet.getAminoAcidCompoundSet());
203                ProteinSequence proteinSequence = new ProteinSequence(genbankProteinReader);
204                genbankProteinReader.getHeaderParser().parseHeader(genbankProteinReader.getHeader(), proteinSequence);
205                logger.info("Sequence ({},{})={}...", proteinSequence.getAccession(), proteinSequence.getLength(), proteinSequence.getSequenceAsString().substring(0, 10));
206                logger.info("Keywords: {}", genbankProteinReader.getKeyWords());
207                logger.info("DatabaseReferences: {}", genbankProteinReader.getDatabaseReferences());
208                proteinSequence.getFeatures();
209
210                GenbankProxySequenceReader<NucleotideCompound> genbankDNAReader
211                                = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "NM_001126", DNACompoundSet.getDNACompoundSet());
212                DNASequence dnaSequence = new DNASequence(genbankDNAReader);
213                genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence);
214                dnaSequence.setAccession(new AccessionID("NM_001126"));
215                logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10));
216                logger.info("Keywords: {}", genbankDNAReader.getKeyWords());
217                logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences());
218
219                genbankDNAReader
220                                = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "NM_000266", DNACompoundSet.getDNACompoundSet());
221                dnaSequence = new DNASequence(genbankDNAReader);
222                genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence);
223                logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10));
224                logger.info("Keywords: {}", genbankDNAReader.getKeyWords());
225                logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences());
226
227                genbankDNAReader
228                                = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "AV254721", DNACompoundSet.getDNACompoundSet());
229                dnaSequence = new DNASequence(genbankDNAReader);
230                genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence);
231                logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10));
232                logger.info("Keywords: {}", genbankDNAReader.getKeyWords());
233                logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences());
234
235                genbankDNAReader
236                                = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "AV254721.2", DNACompoundSet.getDNACompoundSet());
237                dnaSequence = new DNASequence(genbankDNAReader);
238                genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence);
239                logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10));
240                logger.info("Keywords: {}", genbankDNAReader.getKeyWords());
241                logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences());
242
243                genbankDNAReader
244                                = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "U49845", DNACompoundSet.getDNACompoundSet());
245                dnaSequence = new DNASequence(genbankDNAReader);
246                genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence);
247                logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10));
248                logger.info("Keywords: {}", genbankDNAReader.getKeyWords());
249                logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences());
250
251                genbankDNAReader
252                                = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "GI:1293613", DNACompoundSet.getDNACompoundSet());
253                dnaSequence = new DNASequence(genbankDNAReader);
254                genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence);
255                logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10));
256                logger.info("Keywords: {}", genbankDNAReader.getKeyWords());
257                logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences());
258
259                genbankDNAReader
260                                = new GenbankProxySequenceReader<NucleotideCompound>("/tmp", "14109166", DNACompoundSet.getDNACompoundSet());
261                dnaSequence = new DNASequence(genbankDNAReader);
262                genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence);
263                logger.info("Sequence ({},{})={}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10));
264                logger.info("Keywords: {}", genbankDNAReader.getKeyWords());
265                logger.info("DatabaseReferences: {}", genbankDNAReader.getDatabaseReferences());
266
267                /*
268                 GenbankProxySequenceReader genbankProxyReader = new GenbankProxySequenceReader("/tmp");
269                 Sequence<?> sequence;
270
271                 sequence = genbankProxyReader.getDNASequence(new AccessionID("NM_001126"));
272                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
273
274                 sequence = genbankProxyReader.getDNASequence(new AccessionID("NM_000266"));
275                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
276
277                 sequence = genbankProxyReader.getProteinSequence(new AccessionID("NP_000257"));
278                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
279
280                 sequence = genbankProxyReader.getProteinSequence(new AccessionID("AV254721"));
281                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
282
283                 sequence = genbankProxyReader.getProteinSequence(new AccessionID("AV254721.2"));
284                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
285
286                 sequence = genbankProxyReader.getProteinSequence(new AccessionID("U49845"));
287                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
288
289                 sequence = genbankProxyReader.getProteinSequence(new AccessionID("GI:1293613"));
290                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
291
292                 sequence = genbankProxyReader.getProteinSequence(new AccessionID("14109166"));
293                 System.out.println("Sequence" + "(" + sequence.getLength() + ")=" + sequence.getSequenceAsString().substring(0, 10) + "...");
294                 */
295        }
296}