/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojavax.bio.db.ncbi;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.Socket;
import java.net.URL;
import java.util.Iterator;
import java.util.Set;

import org.biojava.bio.BioException;
import org.biojava.bio.seq.DNATools;
import org.biojava.bio.seq.db.FetchURL;
import org.biojava.bio.seq.db.IllegalIDException;
import org.biojava.bio.seq.io.SymbolTokenization;
import org.biojava.utils.ChangeVetoException;
import org.biojavax.Namespace;
import org.biojavax.RichObjectFactory;
import org.biojavax.bio.db.AbstractRichSequenceDB;
import org.biojavax.bio.db.HashRichSequenceDB;
import org.biojavax.bio.db.RichSequenceDB;
import org.biojavax.bio.db.RichSequenceDBLite;
import org.biojavax.bio.seq.RichSequence;
import org.biojavax.bio.seq.RichSequenceIterator;
import org.biojavax.bio.seq.io.RichSequenceBuilderFactory;


/**
 * This class contains functions accessing DNA sequences in Genbank format.
 * It adds methods to return RichSequences instead of plain Sequences.
 *
 * @author Lei Lai
 * @author Matthew Pocock
 * @author Laurent Jourdren
 * @author Shuvankar Mukherjee
 * @author Mark Schreiber
 * @author Richard Holland
 * @since 1.5
 */
public class GenbankRichSequenceDB extends AbstractRichSequenceDB implements RichSequenceDBLite {

    /**
     * Address of the efetch service contacted by
     * {@link #getRichSequences(Set, RichSequenceDB)}.
     * <p>
     * NOTE(review): the scheme is https but the explicit port is 80, and the
     * batch method writes a plain-text HTTP/1.0 request to a raw socket on
     * that host and port (no TLS is negotiated). Confirm the endpoint still
     * accepts this before relying on batch retrieval.
     */
    protected static final String urlBatchSequences = "https://www.ncbi.nlm.nih.gov:80/entrez/eutils/efetch.fcgi";

    private String email = "anonymous@biojava.org";
    private String tool = "biojavax";

    /**
     * The default constructor delegates to the parent class. The constructor refers
     * to RichObjectFactory.getDefaultNamespace() so make sure your factory is initialised
     * before calling this constructor.
     * Sets the default factory to THRESHOLD.
     */
    public GenbankRichSequenceDB() {
        super();
        this.setFactory(RichSequenceBuilderFactory.THRESHOLD); // threshold factory is efficient
        this.setNamespace(RichObjectFactory.getDefaultNamespace()); // default namespace
    }

    /**
     * Get the URL object for locating a sequence object using eutils.
     * The base URL, database, retrieval type and retrieval mode are taken from
     * a {@code FetchURL("Genbank", "text")}; the id, tool and email are appended
     * as query parameters exactly as given (no URL-encoding is applied).
     *
     * @param id the identifier to place in the {@code id} query parameter.
     * @return the URL to fetch the record from.
     * @throws MalformedURLException if the assembled string is not a valid URL.
     **/
    protected URL getAddress(String id) throws MalformedURLException {
        FetchURL seqURL = new FetchURL("Genbank", "text");
        String baseurl = seqURL.getbaseURL();
        String db = seqURL.getDB();
        String type = seqURL.getRetrievalType();
        String mode = seqURL.getRetrievalMode();
        String url = baseurl + db + "&id=" + id + "&rettype=" + type + "&retmode=" + mode
                + "&tool=" + getTool() + "&email=" + getEmail();
        return new URL(url);
    }

    /**
     * Create the Http Post Request to fetch (in batch mode) a list of sequence
     * with Genbank. The result is the complete raw request: request line,
     * headers, a blank line and the form-encoded parameter body.
     *
     * @param url URL of the request; only its path and host are used.
     * @param list Set of sequence identifiers; every element must be a String.
     * @return The Post request.
     */
    protected String makeBatchRequest(URL url, Set list) {
        // Body: comma-separated ids followed by the Entrez contact parameters.
        StringBuffer params = new StringBuffer();
        params.append("db=nucleotide&rettype=gb&id=");

        for (Iterator i = list.iterator(); i.hasNext();) {
            String idSequence = (String) i.next();
            params.append(idSequence);
            if (i.hasNext()) {
                params.append(",");
            }
        }

        params.append("&email=" + getEmail() + "&tool=" + getTool());

        StringBuffer header = new StringBuffer();
        header.append("POST ");
        header.append(url.getPath());
        header.append(
                " HTTP/1.0\r\n"
                + "Connection: close\r\n"
                + "Accept: text/html, text/plain\r\n"
                + "Host: ");
        header.append(url.getHost());
        header.append(
                "\r\n"
                + "User-Agent: Biojava/GenbankSequenceDB\r\n"
                + "Content-Type: application/x-www-form-urlencoded\r\n"
                + "Content-Length: ");
        // Content-Length is the character count of the body built above.
        header.append(params.length());
        header.append("\r\n\r\n");

        StringBuffer request = new StringBuffer();
        request.append(header);
        request.append(params);

        return request.toString();
    }

    /**
     * Given the appropriate Genbank ID, return the matching RichSequence object.
     * The sequence is read into the namespace returned by {@link #getNamespace()}.
     *
     * @param id the Genbank ID to retrieve.
     * @return the first RichSequence parsed from the response.
     * @throws BioException if the URL could not be built, the response could
     * not be read, or the record could not be parsed.
     */
    public RichSequence getRichSequence(String id) throws BioException, IllegalIDException {
        return fetchRichSequence(id, this.getNamespace());
    }

    /**
     * Given the appropriate Genbank ID, return the matching RichSequence object. Additionally
     * define a new Namespace for the received RichSequence object.
     *
     * @param id the Genbank ID to retrieve.
     * @param nsp the Namespace to define.
     * @return the first RichSequence parsed from the response.
     * @throws BioException if the URL could not be built, the response could
     * not be read, or the record could not be parsed.
     */
    public RichSequence getRichSequence(String id, Namespace nsp) throws BioException, IllegalIDException {
        return fetchRichSequence(id, nsp);
    }

    /**
     * Shared implementation of both getRichSequence overloads: opens the URL
     * from {@link #getAddress(String)}, parses the first Genbank record into
     * the given namespace and always closes the stream afterwards.
     */
    private RichSequence fetchRichSequence(String id, Namespace ns) throws BioException {
        BufferedReader reader = null;
        try {
            URL queryURL = getAddress(id); //get URL based on ID

            SymbolTokenization rParser = DNATools.getDNA().getTokenization("token"); //get SymbolTokenization
            RichSequenceBuilderFactory seqFactory = this.getFactory();

            InputStream in = queryURL.openStream();
            reader = new BufferedReader(new InputStreamReader(in));
            RichSequenceIterator seqI = RichSequence.IOTools.readGenbank(reader, rParser, seqFactory, ns);

            return seqI.nextRichSequence();
        } catch (MalformedURLException e) {
            throw new BioException("Failed to create Genbank URL", e);
        } catch (BioException e) {
            throw new BioException("Failed to read Genbank sequence", e);
        } catch (IOException e) {
            throw new BioException("IO failure whilst reading from Genbank", e);
        } finally {
            // Release the connection whether or not parsing succeeded.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // best effort: a failure to close must not mask the real outcome
                }
            }
        }
    }


    /**
     * Retrieve rich sequences from a Genbank
     *
     * @param list List of NCBI sequence number (GI), accession, accession.version,
     * fasta or seqid.
     * @return The rich database object (a new HashRichSequenceDB) with downloaded rich sequences.
     * You will need to cast the sequences you get from this database object into
     * RichSequence objects if you want to access their full features.
     * @throws BioException if the request or the parsing of the response fails.
     */
    public RichSequenceDB getRichSequences(Set list) throws BioException, IllegalIDException {
        return getRichSequences(list, null);
    }

    /**
     * Retrieve rich sequences from a Genbank
     *
     * @param list List of NCBI sequence number (GI), accession, accession.version,
     * fasta or seqid.
     * @param database Where to store rich sequences. If database is null, use a
     * new HashRichSequenceDB object.
     * @return The database object with downloaded rich sequences.
     * You will need to cast the sequences you get from this database object into
     * RichSequence objects if you want to access their full features.
     * @throws BioException if the connection fails, the response cannot be
     * parsed, or a sequence cannot be added to the database.
     */
    public RichSequenceDB getRichSequences(Set list, RichSequenceDB database) throws BioException, IllegalIDException {
        Socket s = null;
        try {
            if (database == null) {
                database = new HashRichSequenceDB();
            }

            URL url = new URL(urlBatchSequences);
            String hostname = url.getHost();
            int port = url.getPort();
            if (port == -1) {
                // No explicit port in the URL: use the protocol's default one.
                port = url.getDefaultPort();
            }

            //Open the connection and the streams
            s = new Socket(hostname, port);

            InputStream sin = s.getInputStream();
            BufferedReader fromServer = new BufferedReader(new InputStreamReader(sin));
            OutputStream sout = s.getOutputStream();
            PrintWriter toServer = new PrintWriter(new OutputStreamWriter(sout));

            // Put the Post request to the server
            toServer.print(makeBatchRequest(url, list));
            toServer.flush();

            // Skip the response headers. Stop as soon as the blank separator
            // line has been read so that the first line of the body is left
            // in the reader for the Genbank parser.
            String headerLine;
            while ((headerLine = fromServer.readLine()) != null) {
                if (headerLine.equals("")) {
                    break;
                }
            }

            SymbolTokenization rParser = DNATools.getDNA().getTokenization("token"); //get SymbolTokenization
            RichSequenceBuilderFactory seqFactory = this.getFactory();
            Namespace ns = this.getNamespace();

            RichSequenceIterator seqI = RichSequence.IOTools.readGenbank(fromServer, rParser, seqFactory, ns);

            while (seqI.hasNext()) {
                try {
                    database.addSequence(seqI.nextRichSequence());
                } catch (ChangeVetoException ce) {
                    throw new BioException("Unexpectedly couldn't add to the supplied RichSequenceDB", ce);
                }
            }

            return database;
        } catch (MalformedURLException e) {
            throw new BioException("Failed to create Genbank URL", e);
        } catch (BioException e) {
            throw new BioException("Failed to read Genbank sequence", e);
        } catch (IOException e) {
            throw new BioException("IO failure whilst reading from Genbank", e);
        } finally {
            // Closing the socket also closes the streams obtained from it.
            if (s != null) {
                try {
                    s.close();
                } catch (IOException ignored) {
                    // best effort: a failure to close must not mask the real outcome
                }
            }
        }
    }

    /**
     * Returns the fixed name of this database.
     * @return the string "Genbank".
     */
    public String getName() {
        return "Genbank";
    }

    /**
     * Not supported: always throws, because the full set of Genbank
     * identifiers cannot be listed.
     * @return never returns normally.
     */
    public Set ids() {
        throw new RuntimeException("Complete set of Genbank ids is unavailable.");
    }

    /**
     * Holds value of property factory.
     */
    private RichSequenceBuilderFactory factory;

    /**
     * Getter for property factory.
     * @return Value of property factory.
     */
    public RichSequenceBuilderFactory getFactory() {
        return this.factory;
    }

    /**
     * Setter for property factory.
     * @param factory New value of property factory.
     */
    public void setFactory(RichSequenceBuilderFactory factory) {
        this.factory = factory;
    }

    /**
     * Holds value of property namespace.
     */
    private Namespace namespace;

    /**
     * Getter for property namespace.
     * @return Value of property namespace.
     */
    public Namespace getNamespace() {
        return this.namespace;
    }

    /**
     * Setter for property namespace.
     * @param namespace New value of property namespace.
     */
    public void setNamespace(Namespace namespace) {
        this.namespace = namespace;
    }

    /**
     * Set the tool identifier for Entrez. Defaults to 'biojavax'.
     * @param tool the new identifier.
     */
    public void setTool(String tool) {
        this.tool = tool;
    }

    /**
     * Get the tool identifier for Entrez. Defaults to 'biojavax'.
     * @return the identifier.
     */
    public String getTool() {
        return this.tool;
    }

    /**
     * Set the email for Entrez. Defaults to 'anonymous@biojava.org'.
     * @param email the new email.
     */
    public void setEmail(String email) {
        this.email = email;
    }

    /**
     * Get the email for Entrez. Defaults to 'anonymous@biojava.org'.
     * @return the email.
     */
    public String getEmail() {
        return this.email;
    }
}