001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojavax.bio.db.ncbi;
022
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.Socket;
import java.net.URL;
import java.util.Iterator;
import java.util.Set;

import javax.net.ssl.SSLParameters;
import javax.net.ssl.SSLSocket;
import javax.net.ssl.SSLSocketFactory;

import org.biojava.bio.BioException;
import org.biojava.bio.seq.ProteinTools;
import org.biojava.bio.seq.db.FetchURL;
import org.biojava.bio.seq.db.IllegalIDException;
import org.biojava.bio.seq.io.SymbolTokenization;
import org.biojava.utils.ChangeVetoException;
import org.biojavax.Namespace;
import org.biojavax.RichObjectFactory;
import org.biojavax.bio.db.AbstractRichSequenceDB;
import org.biojavax.bio.db.HashRichSequenceDB;
import org.biojavax.bio.db.RichSequenceDB;
import org.biojavax.bio.db.RichSequenceDBLite;
import org.biojavax.bio.seq.RichSequence;
import org.biojavax.bio.seq.RichSequenceIterator;
import org.biojavax.bio.seq.io.RichSequenceBuilderFactory;
052
053
054/**
055 * This class contains functions accessing Peptide sequences in Genpept format.
056 * It adds methods to return RichSequences instead of plain Sequences.
057 *
058 * @author Lei Lai
059 * @author Matthew Pocock
060 * @author Laurent Jourdren
061 * @author Shuvankar Mukherjee
062 * @author Mark Schreiber
063 * @author Richard Holland
064 * @since 1.5
065 */
066public class GenpeptRichSequenceDB extends AbstractRichSequenceDB implements RichSequenceDBLite {
067    
068    protected static final String urlBatchSequences = "https://www.ncbi.nlm.nih.gov:80/entrez/eutils/efetch.fcgi";
069    
070    /**
071     * The default constructor delegates to the parent class. The constructor refers
072     * to RichObjectFactory.getDefaultNamespace() so make sure your factory is initialised
073     * before calling this constructor.
074     * Sets the default factory to THRESHOLD.
075     */
076    public GenpeptRichSequenceDB() {
077        super();
078        this.setFactory(RichSequenceBuilderFactory.THRESHOLD); // threshold factory is efficient
079        this.setNamespace(RichObjectFactory.getDefaultNamespace()); // default namespace
080    }
081    
082    /**
083     * Get the URL object for locating sequence object using eutils.
084     * The default value of the return format of the sequence object is text.
085     **/
086    protected URL getAddress(String id) throws MalformedURLException {
087        FetchURL seqURL = new FetchURL("Genpept", "text");
088        String baseurl = seqURL.getbaseURL();
089        String db = seqURL.getDB();
090        String type = seqURL.getRetrievalType();
091        String mode = seqURL.getRetrievalMode();
092        String url = baseurl+db+"&id="+id+"&rettype="+type+"&retmode="+mode;
093        return new URL(url);
094    }
095    
096    /**
097     * Create the Http Post Request to fetch (in batch mode) a list of sequence
098     * with Genbank.
099     * @param url URL of the request
100     * @param list List of sequence identifier
101     * @return The Post request.
102     */
103    protected String makeBatchRequest(URL url, Set list) {
104        StringBuffer params = new StringBuffer();
105        params.append("db=protein&rettype=gb&id=");
106        
107        for (Iterator i = list.iterator(); i.hasNext();) {
108            String idSequence = (String)i.next();
109            params.append(idSequence);
110            if(i.hasNext()) params.append(",");
111        }
112        
113        StringBuffer header = new StringBuffer();
114        header.append("POST ");
115        header.append(url.getPath());
116        header.append(
117                " HTTP/1.0\r\n"
118                + "Connection: close\r\n"
119                + "Accept: text/html, text/plain\r\n"
120                + "Host: ");
121        header.append(url.getHost());
122        header.append(
123                "\r\n"
124                + "User-Agent: Biojava/GenpeptSequenceDB\r\n"
125                + "Content-Type: application/x-www-form-urlencoded\r\n"
126                + "Content-Length: ");
127        header.append(params.length());
128        header.append("\r\n\r\n");
129        
130        StringBuffer request = new StringBuffer();
131        request.append(header);
132        request.append(params);
133        
134        return request.toString();
135    }
136    
137    /**
138     * Given the appropriate Genbank ID, return the matching RichSequence object.
139     * @param id the Genbank ID to retrieve.
140     * @return the matching RichSequence object, or null if not found.
141     * @throws Exception if the sequence could not be retrieved for reasons other
142     * than the identifier not being found.
143     */
144    public RichSequence getRichSequence(String id) throws BioException, IllegalIDException {
145        try {
146            URL queryURL = getAddress(id); //get URL based on ID
147            
148            SymbolTokenization rParser = ProteinTools.getTAlphabet().getTokenization("token"); //get SymbolTokenization
149            RichSequenceBuilderFactory seqFactory = this.getFactory();
150            Namespace ns = this.getNamespace();
151            
152            DataInputStream in = new DataInputStream(queryURL.openStream());
153            BufferedReader reader = new BufferedReader(new InputStreamReader(in));
154            RichSequenceIterator seqI = RichSequence.IOTools.readGenbank(reader, rParser, seqFactory, ns);
155            
156            return seqI.nextRichSequence();
157        } catch (MalformedURLException e) {
158            throw new BioException("Failed to create Genbank URL",e);
159        } catch (BioException e) {
160            throw new BioException("Failed to read Genbank sequence",e);
161        } catch (IOException e) {
162            throw new BioException("IO failure whilst reading from Genbank",e);
163        }
164    }    
165    
166    /**     
167     * Given the appropriate Genbank ID, return the matching RichSequence object. Additionally
168     * define a new Namespace for the received RichSequence object.
169     * @param id the Genbank ID to retrieve.
170     * @param nsp the Namespace to define.
171     * @return the matching RichSequence object, or null if not found.
172     * @throws Exception if the sequence could not be retrieved for reasons other
173     * than the identifier not being found.
174     */
175    public RichSequence getRichSequence(String id, Namespace nsp) throws BioException, IllegalIDException {
176        try {
177            URL queryURL = getAddress(id); //get URL based on ID
178            
179            SymbolTokenization rParser = ProteinTools.getTAlphabet().getTokenization("token"); //get SymbolTokenization
180            RichSequenceBuilderFactory seqFactory = this.getFactory();
181            
182            DataInputStream in = new DataInputStream(queryURL.openStream());
183            BufferedReader reader = new BufferedReader(new InputStreamReader(in));
184            RichSequenceIterator seqI = RichSequence.IOTools.readGenbank(reader, rParser, seqFactory, nsp);
185            
186            return seqI.nextRichSequence();
187        } catch (MalformedURLException e) {
188            throw new BioException("Failed to create Genbank URL",e);
189        } catch (BioException e) {
190            throw new BioException("Failed to read Genbank sequence",e);
191        } catch (IOException e) {
192            throw new BioException("IO failure whilst reading from Genbank",e);
193        }
194    }
195    
196    
197    /**
198     * Retrieve rich sequences from a Genbank
199     *
200     * @param list List of NCBI sequence number (GI), accession, accession.version,
201     * fasta or seqid.
202     * @return The rich database object (HashSequenceDB) with downloaded rich sequences.
203     * You will need to cast the sequences you get from this database object into
204     * RichSequence objects if you want to access their full features.
205     */
206    public RichSequenceDB getRichSequences(Set list) throws BioException, IllegalIDException {
207        
208        return getRichSequences(list, null);
209    }
210    
211    /**
212     * Retrieve rich sequences from a Genbank
213     *
214     * @param list List of NCBI sequence number (GI), accession, accession.version,
215     * fasta or seqid.
216     * @param database Where to store rich sequences. If database is null, use an
217     * HashSequenceDB Object.
218     * @return The database object with downloaded rich sequences.
219     * You will need to cast the sequences you get from this database object into
220     * RichSequence objects if you want to access their full features.
221     */
222    public RichSequenceDB getRichSequences(Set list, RichSequenceDB database) throws BioException, IllegalIDException {
223        try {
224            if (database == null) database = new HashRichSequenceDB();
225            
226            URL url = new URL(urlBatchSequences);
227            int port = url.getPort();
228            String hostname = url.getHost();
229            
230            //Open the connection and the streams
231            Socket s = new Socket(hostname, port);
232            
233            InputStream sin = s.getInputStream();
234            BufferedReader fromServer = new BufferedReader(new InputStreamReader(sin));
235            OutputStream sout = s.getOutputStream();
236            PrintWriter toServer = new PrintWriter(new OutputStreamWriter(sout));
237            
238            // Put the Post request to the server
239            toServer.print(makeBatchRequest(url, list));
240            toServer.flush();
241            
242            // Delete response headers
243            boolean finEntete = false;
244            for (String l = null; ((l = fromServer.readLine()) != null) && (!finEntete);) {
245                if (l.equals("")) finEntete = true;
246            }
247            
248            SymbolTokenization rParser = ProteinTools.getTAlphabet().getTokenization("token"); //get SymbolTokenization
249            RichSequenceBuilderFactory seqFactory = this.getFactory();
250            Namespace ns = this.getNamespace();
251            
252            RichSequenceIterator seqI = RichSequence.IOTools.readGenbank(fromServer, rParser, seqFactory, ns);
253            
254            while (seqI.hasNext()) {
255                try {
256                    database.addSequence(seqI.nextRichSequence());
257                } catch (ChangeVetoException ce) {
258                    throw new BioException("Unexpectedly couldn't add to the supplied RichSequenceDB", ce);
259                }
260            }
261            
262            return database;
263        } catch (MalformedURLException e) {
264            throw new BioException("Failed to create Genbank URL",e);
265        } catch (BioException e) {
266            throw new BioException("Failed to read Genbank sequence",e);
267        } catch (IOException e) {
268            throw new BioException("IO failure whilst reading from Genbank",e);
269        }
270    }
271    
272    public String getName() {
273        return "Genbank";
274    }
275    
276    public Set ids() {
277        throw new RuntimeException("Complete set of Genbank ids is unavailable.");
278    }
279    
280    /**
281     * Holds value of property factory.
282     */
283    private RichSequenceBuilderFactory factory;
284    
285    /**
286     * Getter for property factory.
287     * @return Value of property factory.
288     */
289    public RichSequenceBuilderFactory getFactory() {
290        
291        return this.factory;
292    }
293    
294    /**
295     * Setter for property factory.
296     * @param factory New value of property factory.
297     */
298    public void setFactory(RichSequenceBuilderFactory factory) {
299        
300        this.factory = factory;
301    }
302    
303    /**
304     * Holds value of property namespace.
305     */
306    private Namespace namespace;
307    
308    /**
309     * Getter for property namespace.
310     * @return Value of property namespace.
311     */
312    public Namespace getNamespace() {
313        
314        return this.namespace;
315    }
316    
317    /**
318     * Setter for property namespace.
319     * @param namespace New value of property namespace.
320     */
321    public void setNamespace(Namespace namespace) {
322        
323        this.namespace = namespace;
324    }
325}