/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
021package org.biojavax.bio.db.ncbi;
022
023import java.io.BufferedReader;
024import java.io.DataInputStream;
025import java.io.IOException;
026import java.io.InputStream;
027import java.io.InputStreamReader;
028import java.io.OutputStream;
029import java.io.OutputStreamWriter;
030import java.io.PrintWriter;
031import java.net.MalformedURLException;
032import java.net.Socket;
033import java.net.URL;
034import java.util.Iterator;
035import java.util.Set;
036
037import org.biojava.bio.BioException;
038import org.biojava.bio.seq.DNATools;
039import org.biojava.bio.seq.db.FetchURL;
040import org.biojava.bio.seq.db.IllegalIDException;
041import org.biojava.bio.seq.io.SymbolTokenization;
042import org.biojava.utils.ChangeVetoException;
043import org.biojavax.Namespace;
044import org.biojavax.RichObjectFactory;
045import org.biojavax.bio.db.AbstractRichSequenceDB;
046import org.biojavax.bio.db.HashRichSequenceDB;
047import org.biojavax.bio.db.RichSequenceDB;
048import org.biojavax.bio.db.RichSequenceDBLite;
049import org.biojavax.bio.seq.RichSequence;
050import org.biojavax.bio.seq.RichSequenceIterator;
051import org.biojavax.bio.seq.io.RichSequenceBuilderFactory;
052
053
054/**
055 * This class contains functions accessing DNA sequences in Genbank format.
056 * It adds methods to return RichSequences instead of plain Sequences.
057 *
058 * @author Lei Lai
059 * @author Matthew Pocock
060 * @author Laurent Jourdren
061 * @author Shuvankar Mukherjee
062 * @author Mark Schreiber
063 * @author Richard Holland
064 * @since 1.5
065 */
066public class GenbankRichSequenceDB extends AbstractRichSequenceDB implements RichSequenceDBLite {
067    
068    protected static final String urlBatchSequences = "http://www.ncbi.nlm.nih.gov:80/entrez/eutils/efetch.fcgi";
069
070    private String email = "anonymous@biojava.org";
071    private String tool = "biojavax";
072    
073    /**
074     * The default constructor delegates to the parent class. The constructor refers
075     * to RichObjectFactory.getDefaultNamespace() so make sure your factory is initialised
076     * before calling this constructor.
077     * Sets the default factory to THRESHOLD.
078     */
079    public GenbankRichSequenceDB() {
080        super();
081        this.setFactory(RichSequenceBuilderFactory.THRESHOLD); // threshold factory is efficient
082        this.setNamespace(RichObjectFactory.getDefaultNamespace()); // default namespace
083    }
084    
085    /**
086     * Get the URL object for locating sequence object using eutils.
087     * The default value of the return format of the sequence object is text.
088     **/
089    protected URL getAddress(String id) throws MalformedURLException {
090        FetchURL seqURL = new FetchURL("Genbank", "text");
091        String baseurl = seqURL.getbaseURL();
092        String db = seqURL.getDB();
093        String type = seqURL.getRetrievalType();
094        String mode = seqURL.getRetrievalMode();
095        String url = baseurl+db+"&id="+id+"&rettype="+type+"&retmode="+mode+"&tool="+getTool()+"&email="+getEmail();
096        return new URL(url);
097    }
098    
099    /**
100     * Create the Http Post Request to fetch (in batch mode) a list of sequence
101     * with Genbank.
102     * @param url URL of the request
103     * @param list List of sequence identifier
104     * @return The Post request.
105     */
106    protected String makeBatchRequest(URL url, Set list) {
107        StringBuffer params = new StringBuffer();
108        params.append("db=nucleotide&rettype=gb&id=");
109        
110        for (Iterator i = list.iterator(); i.hasNext();) {
111            String idSequence = (String)i.next();
112            params.append(idSequence);
113            if(i.hasNext()) params.append(",");
114        }
115
116        params.append("&email="+getEmail()+"&tool="+getTool());
117        
118        StringBuffer header = new StringBuffer();
119        header.append("POST ");
120        header.append(url.getPath());
121        header.append(
122                " HTTP/1.0\r\n"
123                + "Connection: close\r\n"
124                + "Accept: text/html, text/plain\r\n"
125                + "Host: ");
126        header.append(url.getHost());
127        header.append(
128                "\r\n"
129                + "User-Agent: Biojava/GenbankSequenceDB\r\n"
130                + "Content-Type: application/x-www-form-urlencoded\r\n"
131                + "Content-Length: ");
132        header.append(params.length());
133        header.append("\r\n\r\n");
134        
135        StringBuffer request = new StringBuffer();
136        request.append(header);
137        request.append(params);
138        
139        return request.toString();
140    }
141    
142    /**
143     * Given the appropriate Genbank ID, return the matching RichSequence object.
144     * @param id the Genbank ID to retrieve.
145     * @return the matching RichSequence object, or null if not found.
146     * @throws Exception if the sequence could not be retrieved for reasons other
147     * than the identifier not being found.
148     */
149    public RichSequence getRichSequence(String id) throws BioException, IllegalIDException {
150        try {
151            URL queryURL = getAddress(id); //get URL based on ID
152            
153            SymbolTokenization rParser = DNATools.getDNA().getTokenization("token"); //get SymbolTokenization
154            RichSequenceBuilderFactory seqFactory = this.getFactory();
155            Namespace ns = this.getNamespace();
156            
157            DataInputStream in = new DataInputStream(queryURL.openStream());
158            BufferedReader reader = new BufferedReader(new InputStreamReader(in));
159            RichSequenceIterator seqI = RichSequence.IOTools.readGenbank(reader, rParser, seqFactory, ns);
160            
161            return seqI.nextRichSequence();
162        } catch (MalformedURLException e) {
163            throw new BioException("Failed to create Genbank URL",e);
164        } catch (BioException e) {
165            throw new BioException("Failed to read Genbank sequence",e);
166        } catch (IOException e) {
167            throw new BioException("IO failure whilst reading from Genbank",e);
168        }
169    }    
170    
171    /**     
172     * Given the appropriate Genbank ID, return the matching RichSequence object. Additionally
173     * define a new Namespace for the received RichSequence object.
174     * @param id the Genbank ID to retrieve.
175     * @param nsp the Namespace to define.
176     * @return the matching RichSequence object, or null if not found.
177     * @throws Exception if the sequence could not be retrieved for reasons other
178     * than the identifier not being found.
179     */
180    public RichSequence getRichSequence(String id, Namespace nsp) throws BioException, IllegalIDException {
181        try {
182            URL queryURL = getAddress(id); //get URL based on ID
183            
184            SymbolTokenization rParser = DNATools.getDNA().getTokenization("token"); //get SymbolTokenization
185            RichSequenceBuilderFactory seqFactory = this.getFactory();
186            
187            DataInputStream in = new DataInputStream(queryURL.openStream());
188            BufferedReader reader = new BufferedReader(new InputStreamReader(in));
189            RichSequenceIterator seqI = RichSequence.IOTools.readGenbank(reader, rParser, seqFactory, nsp);
190            
191            return seqI.nextRichSequence();
192        } catch (MalformedURLException e) {
193            throw new BioException("Failed to create Genbank URL",e);
194        } catch (BioException e) {
195            throw new BioException("Failed to read Genbank sequence",e);
196        } catch (IOException e) {
197            throw new BioException("IO failure whilst reading from Genbank",e);
198        }
199    }
200    
201    
202    /**
203     * Retrieve rich sequences from a Genbank
204     *
205     * @param list List of NCBI sequence number (GI), accession, accession.version,
206     * fasta or seqid.
207     * @return The rich database object (HashSequenceDB) with downloaded rich sequences.
208     * You will need to cast the sequences you get from this database object into
209     * RichSequence objects if you want to access their full features.
210     */
211    public RichSequenceDB getRichSequences(Set list) throws BioException, IllegalIDException {
212        
213        return getRichSequences(list, null);
214    }
215    
216    /**
217     * Retrieve rich sequences from a Genbank
218     *
219     * @param list List of NCBI sequence number (GI), accession, accession.version,
220     * fasta or seqid.
221     * @param database Where to store rich sequences. If database is null, use an
222     * HashSequenceDB Object.
223     * @return The database object with downloaded rich sequences.
224     * You will need to cast the sequences you get from this database object into
225     * RichSequence objects if you want to access their full features.
226     */
227    public RichSequenceDB getRichSequences(Set list, RichSequenceDB database) throws BioException, IllegalIDException {
228        try {
229            if (database == null) database = new HashRichSequenceDB();
230            
231            URL url = new URL(urlBatchSequences);
232            int port = url.getPort();
233            String hostname = url.getHost();
234            
235            //Open the connection and the streams
236            Socket s = new Socket(hostname, port);
237            
238            InputStream sin = s.getInputStream();
239            BufferedReader fromServer = new BufferedReader(new InputStreamReader(sin));
240            OutputStream sout = s.getOutputStream();
241            PrintWriter toServer = new PrintWriter(new OutputStreamWriter(sout));
242            
243            // Put the Post request to the server
244            toServer.print(makeBatchRequest(url, list));
245            toServer.flush();
246            
247            // Delete response headers
248            boolean finEntete = false;
249            for (String l = null; ((l = fromServer.readLine()) != null) && (!finEntete);) {
250                if (l.equals("")) finEntete = true;
251            }
252            
253            SymbolTokenization rParser = DNATools.getDNA().getTokenization("token"); //get SymbolTokenization
254            RichSequenceBuilderFactory seqFactory = this.getFactory();
255            Namespace ns = this.getNamespace();
256            
257            RichSequenceIterator seqI = RichSequence.IOTools.readGenbank(fromServer, rParser, seqFactory, ns);
258            
259            while (seqI.hasNext()) {
260                try {
261                    database.addSequence(seqI.nextRichSequence());
262                } catch (ChangeVetoException ce) {
263                    throw new BioException("Unexpectedly couldn't add to the supplied RichSequenceDB", ce);
264                }
265            }
266            
267            return database;
268        } catch (MalformedURLException e) {
269            throw new BioException("Failed to create Genbank URL",e);
270        } catch (BioException e) {
271            throw new BioException("Failed to read Genbank sequence",e);
272        } catch (IOException e) {
273            throw new BioException("IO failure whilst reading from Genbank",e);
274        }
275    }
276    
277    public String getName() {
278        return "Genbank";
279    }
280    
281    public Set ids() {
282        throw new RuntimeException("Complete set of Genbank ids is unavailable.");
283    }
284    
285    /**
286     * Holds value of property factory.
287     */
288    private RichSequenceBuilderFactory factory;
289    
290    /**
291     * Getter for property factory.
292     * @return Value of property factory.
293     */
294    public RichSequenceBuilderFactory getFactory() {
295        
296        return this.factory;
297    }
298    
299    /**
300     * Setter for property factory.
301     * @param factory New value of property factory.
302     */
303    public void setFactory(RichSequenceBuilderFactory factory) {
304        
305        this.factory = factory;
306    }
307    
308    /**
309     * Holds value of property namespace.
310     */
311    private Namespace namespace;
312    
313    /**
314     * Getter for property namespace.
315     * @return Value of property namespace.
316     */
317    public Namespace getNamespace() {
318        
319        return this.namespace;
320    }
321    
322    /**
323     * Setter for property namespace.
324     * @param namespace New value of property namespace.
325     */
326    public void setNamespace(Namespace namespace) {
327        
328        this.namespace = namespace;
329    }
330
331    /** 
332     * Set the tool identifier for Entrez. Defaults to 'biojavax'.
333     * @param tool the new identifier.
334     */
335    public void setTool(String tool) {
336        this.tool = tool;
337    }
338
339    /** 
340     * Get the tool identifier for Entrez. Defaults to 'biojavax'.
341     * @return the identifier.
342     */
343    public String getTool() {
344        return this.tool;
345    }
346
347    /** 
348     * Set the email for Entrez. Defaults to 'anonymous@biojava.org'.
349     * @param email the new email.
350     */
351    public void setEmail(String email) {
352        this.email = email;
353    }
354
355    /** 
356     * Get the email for Entrez. Defaults to 'anonymous@biojava.org'.
357     * @return the email.
358     */
359    public String getEmail() {
360        return this.email;
361    }
362}