/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojavax.bio.db.ncbi;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.Socket;
import java.net.URL;
import java.util.Iterator;
import java.util.Set;

import org.biojava.bio.BioException;
import org.biojava.bio.seq.ProteinTools;
import org.biojava.bio.seq.db.FetchURL;
import org.biojava.bio.seq.db.IllegalIDException;
import org.biojava.bio.seq.io.SymbolTokenization;
import org.biojava.utils.ChangeVetoException;
import org.biojavax.Namespace;
import org.biojavax.RichObjectFactory;
import org.biojavax.bio.db.AbstractRichSequenceDB;
import org.biojavax.bio.db.HashRichSequenceDB;
import org.biojavax.bio.db.RichSequenceDB;
import org.biojavax.bio.db.RichSequenceDBLite;
import org.biojavax.bio.seq.RichSequence;
import org.biojavax.bio.seq.RichSequenceIterator;
import org.biojavax.bio.seq.io.RichSequenceBuilderFactory;


/**
 * This class contains functions accessing Peptide sequences in Genpept format.
 * It adds methods to return RichSequences instead of plain Sequences.
 *
 * @author Lei Lai
 * @author Matthew Pocock
 * @author Laurent Jourdren
 * @author Shuvankar Mukherjee
 * @author Mark Schreiber
 * @author Richard Holland
 * @since 1.5
 */
public class GenpeptRichSequenceDB extends AbstractRichSequenceDB implements RichSequenceDBLite {

    /**
     * Endpoint used for batch retrieval. Only the host, port and path of this
     * URL are used: the batch request is written as plain HTTP/1.0 text over a
     * raw {@link Socket}.
     * NOTE(review): the scheme is "https" but the explicit port is 80 and no
     * TLS is negotiated by this class - confirm the server still accepts this.
     */
    protected static final String urlBatchSequences = "https://www.ncbi.nlm.nih.gov:80/entrez/eutils/efetch.fcgi";

    /**
     * Holds value of property factory.
     */
    private RichSequenceBuilderFactory factory;

    /**
     * Holds value of property namespace.
     */
    private Namespace namespace;

    /**
     * The default constructor delegates to the parent class. The constructor refers
     * to RichObjectFactory.getDefaultNamespace() so make sure your factory is initialised
     * before calling this constructor.
     * Sets the default factory to THRESHOLD.
     */
    public GenpeptRichSequenceDB() {
        super();
        this.setFactory(RichSequenceBuilderFactory.THRESHOLD); // threshold factory is efficient
        this.setNamespace(RichObjectFactory.getDefaultNamespace()); // default namespace
    }

    /**
     * Get the URL object for locating a sequence object using eutils.
     * The URL is assembled from a {@link FetchURL} created for "Genpept" with
     * the "text" return format, followed by the id, retrieval type and
     * retrieval mode query parameters.
     *
     * @param id the identifier to place in the query string (appended as-is).
     * @return the URL for fetching the given identifier.
     * @throws MalformedURLException if the assembled string is not a valid URL.
     **/
    protected URL getAddress(String id) throws MalformedURLException {
        FetchURL seqURL = new FetchURL("Genpept", "text");
        String baseurl = seqURL.getbaseURL();
        String db = seqURL.getDB();
        String type = seqURL.getRetrievalType();
        String mode = seqURL.getRetrievalMode();
        String url = baseurl + db + "&id=" + id + "&rettype=" + type + "&retmode=" + mode;
        return new URL(url);
    }

    /**
     * Create the HTTP POST request used to fetch (in batch mode) a list of
     * sequences. The returned text contains the request line, the headers and
     * the form-encoded body ("db=protein&amp;rettype=gb&amp;id=...").
     *
     * @param url URL of the request; its path and host are written into the
     * request line and the Host header.
     * @param list Set of sequence identifiers (each element is cast to String);
     * they are joined with commas in iteration order.
     * @return The POST request text.
     */
    protected String makeBatchRequest(URL url, Set list) {
        StringBuffer params = new StringBuffer();
        params.append("db=protein&rettype=gb&id=");

        for (Iterator i = list.iterator(); i.hasNext();) {
            String idSequence = (String) i.next();
            params.append(idSequence);
            if (i.hasNext()) {
                params.append(",");
            }
        }

        StringBuffer header = new StringBuffer();
        header.append("POST ");
        header.append(url.getPath());
        header.append(
                " HTTP/1.0\r\n"
                + "Connection: close\r\n"
                + "Accept: text/html, text/plain\r\n"
                + "Host: ");
        header.append(url.getHost());
        header.append(
                "\r\n"
                + "User-Agent: Biojava/GenpeptSequenceDB\r\n"
                + "Content-Type: application/x-www-form-urlencoded\r\n"
                + "Content-Length: ");
        // Length is counted in characters of the parameter string.
        header.append(params.length());
        header.append("\r\n\r\n");

        StringBuffer request = new StringBuffer();
        request.append(header);
        request.append(params);

        return request.toString();
    }

    /**
     * Given the appropriate ID, return the matching RichSequence object, parsed
     * into this database's current namespace (see {@link #getNamespace()}).
     *
     * @param id the ID to retrieve.
     * @return the first RichSequence parsed from the response.
     * @throws BioException if the URL could not be built, the response could not
     * be parsed, or an IO failure occurred while reading.
     */
    public RichSequence getRichSequence(String id) throws BioException, IllegalIDException {
        return fetchRichSequence(id, this.getNamespace());
    }

    /**
     * Given the appropriate ID, return the matching RichSequence object. Additionally
     * define a new Namespace for the received RichSequence object.
     *
     * @param id the ID to retrieve.
     * @param nsp the Namespace to define.
     * @return the first RichSequence parsed from the response.
     * @throws BioException if the URL could not be built, the response could not
     * be parsed, or an IO failure occurred while reading.
     */
    public RichSequence getRichSequence(String id, Namespace nsp) throws BioException, IllegalIDException {
        return fetchRichSequence(id, nsp);
    }

    /**
     * Shared implementation of both getRichSequence overloads: opens the URL
     * for the id, parses the first record into the given namespace and closes
     * the stream again.
     */
    private RichSequence fetchRichSequence(String id, Namespace ns) throws BioException {
        BufferedReader reader = null;
        try {
            URL queryURL = getAddress(id); //get URL based on ID

            SymbolTokenization rParser = ProteinTools.getTAlphabet().getTokenization("token"); //get SymbolTokenization
            RichSequenceBuilderFactory seqFactory = this.getFactory();

            InputStream in = queryURL.openStream();
            reader = new BufferedReader(new InputStreamReader(in));
            RichSequenceIterator seqI = RichSequence.IOTools.readGenbank(reader, rParser, seqFactory, ns);

            return seqI.nextRichSequence();
        } catch (MalformedURLException e) {
            throw new BioException("Failed to create Genbank URL", e);
        } catch (BioException e) {
            throw new BioException("Failed to read Genbank sequence", e);
        } catch (IOException e) {
            throw new BioException("IO failure whilst reading from Genbank", e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
    }


    /**
     * Retrieve rich sequences in batch mode.
     *
     * @param list List of NCBI sequence number (GI), accession, accession.version,
     * fasta or seqid.
     * @return The rich database object (a new HashRichSequenceDB) with downloaded rich sequences.
     * You will need to cast the sequences you get from this database object into
     * RichSequence objects if you want to access their full features.
     * @throws BioException if the request could not be made or the response
     * could not be parsed.
     */
    public RichSequenceDB getRichSequences(Set list) throws BioException, IllegalIDException {
        return getRichSequences(list, null);
    }

    /**
     * Retrieve rich sequences in batch mode.
     *
     * @param list List of NCBI sequence number (GI), accession, accession.version,
     * fasta or seqid.
     * @param database Where to store rich sequences. If database is null, a new
     * HashRichSequenceDB object is used.
     * @return The database object with downloaded rich sequences.
     * You will need to cast the sequences you get from this database object into
     * RichSequence objects if you want to access their full features.
     * @throws BioException if the request could not be made, the response could
     * not be parsed, or a sequence could not be added to the database.
     */
    public RichSequenceDB getRichSequences(Set list, RichSequenceDB database) throws BioException, IllegalIDException {
        Socket s = null;
        try {
            if (database == null) {
                database = new HashRichSequenceDB();
            }

            URL url = new URL(urlBatchSequences);
            int port = url.getPort();
            if (port == -1) {
                // No explicit port in the URL: fall back to the scheme's default.
                port = url.getDefaultPort();
            }
            String hostname = url.getHost();

            //Open the connection and the streams
            s = new Socket(hostname, port);

            InputStream sin = s.getInputStream();
            BufferedReader fromServer = new BufferedReader(new InputStreamReader(sin));
            OutputStream sout = s.getOutputStream();
            PrintWriter toServer = new PrintWriter(new OutputStreamWriter(sout));

            // Put the Post request to the server
            toServer.print(makeBatchRequest(url, list));
            toServer.flush();

            // Skip the response headers: stop right after the blank separator
            // line so that the first line of the body is left for the parser.
            String headerLine;
            while ((headerLine = fromServer.readLine()) != null) {
                if (headerLine.length() == 0) {
                    break;
                }
            }

            SymbolTokenization rParser = ProteinTools.getTAlphabet().getTokenization("token"); //get SymbolTokenization
            RichSequenceBuilderFactory seqFactory = this.getFactory();
            Namespace ns = this.getNamespace();

            RichSequenceIterator seqI = RichSequence.IOTools.readGenbank(fromServer, rParser, seqFactory, ns);

            while (seqI.hasNext()) {
                try {
                    database.addSequence(seqI.nextRichSequence());
                } catch (ChangeVetoException ce) {
                    throw new BioException("Unexpectedly couldn't add to the supplied RichSequenceDB", ce);
                }
            }

            return database;
        } catch (MalformedURLException e) {
            throw new BioException("Failed to create Genbank URL", e);
        } catch (BioException e) {
            throw new BioException("Failed to read Genbank sequence", e);
        } catch (IOException e) {
            throw new BioException("IO failure whilst reading from Genbank", e);
        } finally {
            // All sequences have been copied into the database by now, so the
            // connection (and the streams obtained from it) can be released.
            if (s != null) {
                try {
                    s.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing the socket fails.
                }
            }
        }
    }

    /**
     * Returns the name of this database.
     *
     * @return the fixed string "Genbank".
     */
    public String getName() {
        return "Genbank";
    }

    /**
     * Not supported: the complete set of ids cannot be enumerated.
     *
     * @return never returns normally.
     * @throws RuntimeException always.
     */
    public Set ids() {
        throw new RuntimeException("Complete set of Genbank ids is unavailable.");
    }

    /**
     * Getter for property factory.
     * @return Value of property factory.
     */
    public RichSequenceBuilderFactory getFactory() {
        return this.factory;
    }

    /**
     * Setter for property factory.
     * @param factory New value of property factory.
     */
    public void setFactory(RichSequenceBuilderFactory factory) {
        this.factory = factory;
    }

    /**
     * Getter for property namespace.
     * @return Value of property namespace.
     */
    public Namespace getNamespace() {
        return this.namespace;
    }

    /**
     * Setter for property namespace.
     * @param namespace New value of property namespace.
     */
    public void setNamespace(Namespace namespace) {
        this.namespace = namespace;
    }
}