001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.bio.seq.db;
022
023import java.net.MalformedURLException;
024import java.net.URL;
025
026import org.biojava.bio.BioException;
027import org.biojava.bio.BioRuntimeException;
028import org.biojava.bio.seq.DNATools;
029import org.biojava.bio.seq.ProteinTools;
030import org.biojava.bio.seq.io.FastaFormat;
031import org.biojava.bio.seq.io.GenbankFormat;
032import org.biojava.bio.seq.io.SequenceFormat;
033import org.biojava.bio.symbol.Alphabet;
034
035/** 
036 * @author Matthew Pocock
037 * @author Mark Schreiber
038 */
039public class NCBISequenceDB
040extends WebSequenceDB {
041  private String server;
042  private String CGI;
043  private SequenceFormat format;
044  private String dataBase;
045  private Alphabet alpha;
046  private String formatName;
047
048  public static final String DB_NUCLEOTIDE = "nucleotide";
049  public static final String DB_PROTEIN = "protein";
050
051
052  /**
053   * Default constructor, querys the Genbank nucleotide database on "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
054   * and retrieves sequences in FastaFormat.
055   */
056  public NCBISequenceDB(){
057    this("http://www.ncbi.nlm.nih.gov/","entrez/query.fcgi",DB_NUCLEOTIDE,new FastaFormat());
058  }
059
060  /**
061   * Parameterized constructor
062   * @param database must be one of "nucleotide" or "protein" (use the static DB fields)
063   * @param format must be one of <code>GenbankFormat</code> or <code>FastaFormat</code>
064   * @throws BioRuntimeException if the database or format is invalid
065   */
066  public NCBISequenceDB(String database, SequenceFormat format){
067    this("http://www.ncbi.nlm.nih.gov/","entrez/query.fcgi",database,format);
068  }
069
070  /**
071   * Parameterized constructor
072   * @param server eg "http://www.ncbi.nlm.nih.gov/"
073   * @param CGI eg "entrez/query.fcgi"
074   * @param database must be one of "nucleotide" or "protein" (use the static DB fields)
075   * @param format must be one of <code>GenbankFormat</code> or <code>FastaFormat</code>
076   * @throws BioRuntimeException if the database or format is invalid
077   */
078  public NCBISequenceDB(String server, String CGI, String database, SequenceFormat format)
079        throws BioRuntimeException{
080
081    this.server = server;
082    this.CGI = CGI;
083    try {
084      setDatabase(database);
085    }
086    catch (BioException ex) {
087      throw new BioRuntimeException(
088          "Database format must be one of {nucleotide, protein}");
089    }
090    try {
091      setSequenceFormat(format);
092    }
093    catch (BioException ex) {
094      throw new BioRuntimeException(
095          "SequenceFormat object must be one of {FastaFormat, GenbankFormat}");
096    }
097
098  }
099
100
101  public String getDataBase(){
102    return dataBase;
103  }
104
105  /**
106   *
107   * @param dataBase must be one of "nucleotide" or "protein" (use the static DB fields)
108   * @throws BioException if an unknown database name is used.
109   */
110  public void setDatabase(String dataBase) throws BioException{
111    if (dataBase == DB_NUCLEOTIDE) {
112
113      this.dataBase = DB_NUCLEOTIDE;
114      this.alpha = DNATools.getDNA();
115
116    }
117    else if (dataBase == DB_PROTEIN) {
118
119      this.dataBase = DB_PROTEIN;
120      this.alpha = ProteinTools.getAlphabet();
121
122    }
123    else {
124
125      throw new BioException(
126          "Database format must be one of {nucleotide, protein}");
127
128    }
129
130  }
131
132  public SequenceFormat getSequenceFormat() {
133    return format;
134  }
135
136  /**
137   *
138   * @param format must be one of <code>FastaFormat</code> or <code>GenbankFormat</code>
139   * @throws BioException if an unknown <code>SequenceFormat</code> is used
140   */
141  public void setSequenceFormat(SequenceFormat format) throws BioException{
142
143    if (format instanceof FastaFormat ) {
144      this.format = format;
145      this.formatName = "FASTA";
146    }
147    else if(format instanceof GenbankFormat){
148      this.format = format;
149      if (alpha == DNATools.getDNA()) {
150        this.formatName = "GenBank";
151      }
152      else {
153        this.formatName = "GenPept";
154      }
155
156    }
157    else {
158      throw new BioException("Only Genbank and FASTA formats currently supported");
159    }
160  }
161
162  protected Alphabet getAlphabet() {
163    return alpha;
164  }
165
166  protected URL getAddress(String uid)
167  throws MalformedURLException {
168    String query = "cmd=text&db="+dataBase+"&uid="+uid+"&dopt="+formatName;
169
170    return new URL(server + CGI + "?" + query);
171  }
172
173  public String getName() {
174    return "NCBI-Genbank";
175  }
176}