001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.bio.seq.db;
022
023import java.io.BufferedReader;
024import java.io.DataInputStream;
025import java.io.IOException;
026import java.io.InputStream;
027import java.io.InputStreamReader;
028import java.io.OutputStream;
029import java.io.OutputStreamWriter;
030import java.io.PrintWriter;
031import java.net.MalformedURLException;
032import java.net.Socket;
033import java.net.URL;
034import java.util.Iterator;
035import java.util.Set;
036
037import org.biojava.bio.BioException;
038import org.biojava.bio.seq.DNATools;
039import org.biojava.bio.seq.Sequence;
040import org.biojava.bio.seq.SequenceIterator;
041import org.biojava.bio.seq.io.GenbankFormat;
042import org.biojava.bio.seq.io.SeqIOTools;
043import org.biojava.bio.seq.io.SequenceFormat;
044import org.biojava.bio.symbol.Alphabet;
045import org.biojava.utils.ChangeVetoException;
046
047/**
048 * This class contains functions accessing DNA sequences in Genbank format.
049 *
050 * @author Lei Lai
051 * @author Matthew Pocock
052 * @author Laurent Jourdren
053 * @author Shuvankar Mukherjee
054 * @author Mark Schreiber
055 * @author Richard Holland
056 * @George Waldon
057 */
058public class GenbankSequenceDB
059{
060  private static SequenceFormat format = new GenbankFormat();;//return format of the sequence
061  private static String DBName="Genbank";//predefined the database name -- Genbank
062  protected boolean IOExceptionFound=false;//check if IOException is found
063  protected boolean ExceptionFound=false;//check if any exception is found
064  protected static final String urlBatchSequences =
065    "http://www.ncbi.nlm.nih.gov:80/entrez/eutils/efetch.fcgi";
066
067  public GenbankSequenceDB() {
068  }
069
070  protected SequenceFormat getSequenceFormat()
071  {
072    return format;
073  }
074
075  protected Alphabet getAlphabet()
076  {
077    return DNATools.getDNA();
078  }
079
080  /**
081   * Get the URL object for locating sequence object using eutils.
082   * The default value of the return format of the sequence object is text.
083   **/
084  protected URL getAddress (String id) throws MalformedURLException
085  {
086        String defaultReturnFormat="text";
087        return getAddress(id,defaultReturnFormat);
088  }
089
090  /**
091   * Get the URL object for locating sequence object using eutils.
092   * User could specify the return format of the sequence object.
093   */
094  protected URL getAddress(String id, String format) throws MalformedURLException
095  {
096        FetchURL seqURL = new FetchURL(DBName, format);
097        String baseurl = seqURL.getbaseURL();
098        String db = seqURL.getDB();
099        String type = seqURL.getRetrievalType();
100        String mode = seqURL.getRetrievalMode();
101        String url = baseurl+db+"&id="+id+"&rettype="+type+"&retmode="+mode;
102        return new URL(url);
103  }
104
105  public String getName()
106  {
107    return DBName;
108  }
109
110  public Sequence getSequence(String id) throws Exception {
111    try {
112      IOExceptionFound = false;
113      ExceptionFound = false;
114      URL queryURL = getAddress(id); //get URL based on ID
115
116      //  System.err.println("query is "+ queryURL.toString());
117
118      //System.err.println("got data from " + queryURL);
119
120      DataInputStream in = new DataInputStream(queryURL.openStream());
121      BufferedReader reader = new BufferedReader(new InputStreamReader(in));
122      SequenceIterator seqI = SeqIOTools.readGenbank(reader);
123
124      return seqI.nextSequence();
125
126    } catch (Exception e) {
127      System.out.println("Exception found in GenbankSequenceDB -- getSequence");
128      System.out.println(e.toString());
129      ExceptionFound = true;
130      IOExceptionFound = true;
131      return null;
132    }
133  }
134
135  public boolean checkIOException()
136  {
137        return IOExceptionFound;
138  }
139
140  public boolean checkException()
141  {
142        return ExceptionFound;
143  }
144
145  /**
146   * Create the Http Post Request to fetch (in batch mode) a list of sequence
147   * with Genbank.
148   * @param url URL of the request
149   * @param list List of sequence identifier
150   * @return The Post request.
151   */
152  protected String makeBatchRequest(URL url, Set list) {
153
154    StringBuffer params = new StringBuffer();
155    params.append("db=nucleotide&rettype=gb&id=");
156
157    boolean b = true;
158    for (Iterator i = list.iterator(); b;) {
159      String idSequence = (String) i.next();
160      params.append(idSequence);
161      if(i.hasNext()){
162        params.append(",");
163      }else{
164        b =false;
165        //params.append("\r\n");
166      }
167    }
168
169    StringBuffer header = new StringBuffer();
170    header.append("POST ");
171    header.append(url.getPath());
172    header.append(
173      " HTTP/1.0\r\n"
174        + "Connection: close\r\n"
175        + "Accept: text/html, text/plain\r\n"
176        + "Host: ");
177
178    header.append(url.getHost());
179    header.append(
180      "\r\n"
181        + "User-Agent: Biojava/GenbankSequenceDB\r\n"
182        + "Content-Type: application/x-www-form-urlencoded\r\n"
183        + "Content-Length: ");
184    header.append(params.length());
185    header.append("\r\n\r\n");
186
187    StringBuffer request = new StringBuffer();
188    request.append(header);
189    request.append(params);
190
191    return request.toString();
192  }
193
194  /**
195   * Retrieve sequences from a Genbank
196   *
197   * @param list List of NCBI sequence number (GI), accession, accession.version,
198   * fasta or seqid.
199   * @return The database object (HashSequenceDB) with downloaded sequences.
200   */
201  public SequenceDB getSequences(Set list) throws BioException {
202
203    return getSequences(list, null);
204  }
205
206  /**
207   * Retrieve sequences from a Genbank
208   *
209   * @param list List of NCBI sequence number (GI), accession, accession.version,
210   * fasta or seqid.
211   * @param database Where to store sequences. if database is null, use an
212   * HashSequenceDB Objet.
213   * @return The database object with downloaded sequences.
214   */
215  public SequenceDB getSequences(Set list, SequenceDB database)
216    throws BioException {
217
218    if (database == null)
219      database = new HashSequenceDB();
220
221    try {
222
223      URL url = new URL(urlBatchSequences);
224      int port = url.getPort();
225      String hostname = url.getHost();
226
227      //Open the connection and the streams
228      Socket s = new Socket(hostname, port);
229
230      InputStream sin = s.getInputStream();
231      BufferedReader fromServer =
232        new BufferedReader(new InputStreamReader(sin));
233      OutputStream sout = s.getOutputStream();
234      PrintWriter toServer = new PrintWriter(new OutputStreamWriter(sout));
235
236      // Put the Post request to the server
237      toServer.print(makeBatchRequest(url, list));
238      toServer.flush();
239
240      // Delete response headers
241      boolean finEntete = false;
242      for (String l = null;
243        ((l = fromServer.readLine()) != null) && (!finEntete);
244        )
245        if (l.equals(""))
246          finEntete = true;
247
248      SequenceIterator seqI = SeqIOTools.readGenbank(fromServer);
249
250      while (seqI.hasNext())
251        database.addSequence(seqI.nextSequence());
252
253    } catch (MalformedURLException e) {
254      throw new BioException(e,"Exception found in GenbankSequenceDB -- getSequences");
255    } catch (IOException e) {
256      throw new BioException(e,"Exception found in GenbankSequenceDB -- getSequences");
257    } catch (BioException e) {
258      throw new BioException(e,"Exception found in GenbankSequenceDB -- getSequences");
259    } catch (ChangeVetoException e) {
260      throw new BioException(e,"Exception found in GenbankSequenceDB -- getSequences");
261    }
262
263    return database;
264  }
265}