/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.bio.seq.db;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.Socket;
import java.net.URL;
import java.util.Iterator;
import java.util.Set;

import org.biojava.bio.BioException;
import org.biojava.bio.seq.DNATools;
import org.biojava.bio.seq.Sequence;
import org.biojava.bio.seq.SequenceIterator;
import org.biojava.bio.seq.io.GenbankFormat;
import org.biojava.bio.seq.io.SeqIOTools;
import org.biojava.bio.seq.io.SequenceFormat;
import org.biojava.bio.symbol.Alphabet;
import org.biojava.utils.ChangeVetoException;

/**
 * This class contains functions accessing DNA sequences in Genbank format.
 *
 * <p>Single sequences are fetched with {@link #getSequence(String)}, which
 * reports failures through {@link #checkException()} and
 * {@link #checkIOException()} instead of throwing. Several sequences can be
 * fetched in one request with {@link #getSequences(Set)}.</p>
 *
 * @author Lei Lai
 * @author Matthew Pocock
 * @author Laurent Jourdren
 * @author Shuvankar Mukherjee
 * @author Mark Schreiber
 * @author Richard Holland
 * @author George Waldon
 */
public class GenbankSequenceDB
{
  /** Format object handed out by {@link #getSequenceFormat()}. */
  private static final SequenceFormat format = new GenbankFormat();

  /** Predefined database name -- Genbank. */
  private static final String DBName = "Genbank";

  /** Set by {@link #getSequence(String)} when its last call failed. */
  protected boolean IOExceptionFound = false;

  /** Set by {@link #getSequence(String)} when its last call failed. */
  protected boolean ExceptionFound = false;

  /**
   * Endpoint used by the batch download in
   * {@link #getSequences(Set, SequenceDB)}.
   *
   * NOTE(review): the scheme is https but the port is 80, and the batch
   * download talks to this host over a plain java.net.Socket, so no TLS is
   * negotiated -- confirm that the server still accepts this.
   */
  protected static final String urlBatchSequences =
    "https://www.ncbi.nlm.nih.gov:80/entrez/eutils/efetch.fcgi";

  public GenbankSequenceDB() {
  }

  /**
   * @return the shared Genbank sequence format instance.
   */
  protected SequenceFormat getSequenceFormat()
  {
    return format;
  }

  /**
   * @return the DNA alphabet, as supplied by {@link DNATools#getDNA()}.
   */
  protected Alphabet getAlphabet()
  {
    return DNATools.getDNA();
  }

  /**
   * Get the URL object for locating sequence object using eutils.
   * The default value of the return format of the sequence object is text.
   *
   * @param id identifier of the sequence to locate
   * @return the URL built by {@link #getAddress(String, String)} with the
   *         format "text"
   * @throws MalformedURLException if the assembled string is not a valid URL
   */
  protected URL getAddress(String id) throws MalformedURLException
  {
    String defaultReturnFormat = "text";
    return getAddress(id, defaultReturnFormat);
  }

  /**
   * Get the URL object for locating sequence object using eutils.
   * User could specify the return format of the sequence object.
   *
   * @param id identifier of the sequence to locate; it is appended as-is
   *        (no URL encoding is applied)
   * @param format return format passed on to FetchURL
   * @return the URL assembled from the FetchURL base URL, database,
   *         retrieval type and retrieval mode, plus the given id
   * @throws MalformedURLException if the assembled string is not a valid URL
   */
  protected URL getAddress(String id, String format) throws MalformedURLException
  {
    FetchURL seqURL = new FetchURL(DBName, format);
    String baseurl = seqURL.getbaseURL();
    String db = seqURL.getDB();
    String type = seqURL.getRetrievalType();
    String mode = seqURL.getRetrievalMode();
    String url = baseurl + db + "&id=" + id + "&rettype=" + type + "&retmode=" + mode;
    return new URL(url);
  }

  /**
   * @return the name of this database ("Genbank").
   */
  public String getName()
  {
    return DBName;
  }

  /**
   * Fetch a single sequence by identifier.
   *
   * <p>This method is best-effort: any exception raised while building the
   * URL, opening the stream or parsing is reported on standard output,
   * recorded in both failure flags, and turned into a <code>null</code>
   * return value. Both flags are reset at the start of every call.</p>
   *
   * @param id identifier of the sequence to fetch
   * @return the first sequence read from the response, or <code>null</code>
   *         if anything went wrong
   * @throws Exception declared for compatibility; failures inside the fetch
   *         are caught and reported through the flags instead
   * @see #checkException()
   * @see #checkIOException()
   */
  public Sequence getSequence(String id) throws Exception {
    BufferedReader reader = null;
    try {
      IOExceptionFound = false;
      ExceptionFound = false;
      URL queryURL = getAddress(id); //get URL based on ID

      DataInputStream in = new DataInputStream(queryURL.openStream());
      reader = new BufferedReader(new InputStreamReader(in));
      SequenceIterator seqI = SeqIOTools.readGenbank(reader);

      // The sequence is read here, before the finally block closes the reader.
      return seqI.nextSequence();

    } catch (Exception e) {
      System.out.println("Exception found in GenbankSequenceDB -- getSequence");
      System.out.println(e.toString());
      ExceptionFound = true;
      IOExceptionFound = true;
      return null;
    } finally {
      // Release the network stream whether or not the fetch succeeded.
      if (reader != null) {
        try {
          reader.close();
        } catch (IOException ignored) {
          // Nothing useful can be done if closing fails.
        }
      }
    }
  }

  /**
   * @return <code>true</code> if the last call to
   *         {@link #getSequence(String)} failed.
   */
  public boolean checkIOException()
  {
    return IOExceptionFound;
  }

  /**
   * @return <code>true</code> if the last call to
   *         {@link #getSequence(String)} failed.
   */
  public boolean checkException()
  {
    return ExceptionFound;
  }

  /**
   * Create the Http Post Request to fetch (in batch mode) a list of sequence
   * with Genbank.
   *
   * <p>The identifiers are joined with commas into the <code>id</code> form
   * parameter. An empty set yields a request with an empty <code>id</code>
   * value.</p>
   *
   * @param url URL of the request; only its path and host are used
   * @param list List of sequence identifier (each element must be a String)
   * @return The Post request.
   */
  protected String makeBatchRequest(URL url, Set list) {

    StringBuffer params = new StringBuffer();
    params.append("db=nucleotide&rettype=gb&id=");

    // hasNext() is checked before every next(), so an empty set is safe.
    for (Iterator i = list.iterator(); i.hasNext();) {
      String idSequence = (String) i.next();
      params.append(idSequence);
      if (i.hasNext()) {
        params.append(",");
      }
    }

    StringBuffer header = new StringBuffer();
    header.append("POST ");
    header.append(url.getPath());
    header.append(
      " HTTP/1.0\r\n"
        + "Connection: close\r\n"
        + "Accept: text/html, text/plain\r\n"
        + "Host: ");

    header.append(url.getHost());
    header.append(
      "\r\n"
        + "User-Agent: Biojava/GenbankSequenceDB\r\n"
        + "Content-Type: application/x-www-form-urlencoded\r\n"
        + "Content-Length: ");
    // Character count of the form body.
    header.append(params.length());
    header.append("\r\n\r\n");

    StringBuffer request = new StringBuffer();
    request.append(header);
    request.append(params);

    return request.toString();
  }

  /**
   * Retrieve sequences from a Genbank
   *
   * @param list List of NCBI sequence number (GI), accession, accession.version,
   * fasta or seqid.
   * @return The database object (HashSequenceDB) with downloaded sequences.
   * @throws BioException if the download or the parsing fails
   */
  public SequenceDB getSequences(Set list) throws BioException {

    return getSequences(list, null);
  }

  /**
   * Retrieve sequences from a Genbank
   *
   * @param list List of NCBI sequence number (GI), accession, accession.version,
   * fasta or seqid.
   * @param database Where to store sequences. if database is null, use an
   * HashSequenceDB Object.
   * @return The database object with downloaded sequences.
   * @throws BioException if the URL is malformed, an I/O error occurs, the
   *         response cannot be parsed, or the database vetoes an addition
   */
  public SequenceDB getSequences(Set list, SequenceDB database)
    throws BioException {

    if (database == null) {
      database = new HashSequenceDB();
    }

    Socket s = null;
    try {

      URL url = new URL(urlBatchSequences);
      int port = url.getPort();
      if (port == -1) {
        // No explicit port in the URL: fall back to the protocol default.
        port = url.getDefaultPort();
      }
      String hostname = url.getHost();

      //Open the connection and the streams
      s = new Socket(hostname, port);

      InputStream sin = s.getInputStream();
      BufferedReader fromServer =
        new BufferedReader(new InputStreamReader(sin));
      OutputStream sout = s.getOutputStream();
      PrintWriter toServer = new PrintWriter(new OutputStreamWriter(sout));

      // Put the Post request to the server
      toServer.print(makeBatchRequest(url, list));
      toServer.flush();

      // Skip the response headers: stop right after the blank separator
      // line so that the first line of the body is left for the parser.
      String headerLine;
      while ((headerLine = fromServer.readLine()) != null) {
        if (headerLine.length() == 0) {
          break;
        }
      }

      SequenceIterator seqI = SeqIOTools.readGenbank(fromServer);

      while (seqI.hasNext()) {
        database.addSequence(seqI.nextSequence());
      }

    } catch (MalformedURLException e) {
      throw new BioException(e,"Exception found in GenbankSequenceDB -- getSequences");
    } catch (IOException e) {
      throw new BioException(e,"Exception found in GenbankSequenceDB -- getSequences");
    } catch (BioException e) {
      throw new BioException(e,"Exception found in GenbankSequenceDB -- getSequences");
    } catch (ChangeVetoException e) {
      throw new BioException(e,"Exception found in GenbankSequenceDB -- getSequences");
    } finally {
      // Closing the socket also closes its input and output streams.
      if (s != null) {
        try {
          s.close();
        } catch (IOException ignored) {
          // Nothing useful can be done if closing fails.
        }
      }
    }

    return database;
  }
}