/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
021
022package org.biojava.bio.molbio;
023
024import java.io.Serializable;
025
026import org.biojava.bio.BioError;
027import org.biojava.bio.BioException;
028import org.biojava.bio.seq.DNATools;
029import org.biojava.bio.symbol.FiniteAlphabet;
030import org.biojava.bio.symbol.IllegalAlphabetException;
031import org.biojava.bio.symbol.IllegalSymbolException;
032import org.biojava.bio.symbol.MotifTools;
033import org.biojava.bio.symbol.Symbol;
034import org.biojava.bio.symbol.SymbolList;
035
036/**
037 * <code>RestrictionEnzyme</code> represents a restriction enzyme
038 * according to the REBASE standard. The cut positions are indicated
039 * relative to the 5' end of the recognition site and occur downstream
040 * of the given residue. Note that some enzymes cut in more than one
041 * position and that cut positions may occur outside the recognition
042 * site.
043 *
044 * @author Keith James
045 * @author George Waldon
046 * @since 1.3
047 */
048public class RestrictionEnzyme implements Serializable
049{
050    /**
051     * <code>CUT_SIMPLE</code> a cut type where the enzyme cuts in one
052     * position relative to the recognition site. This covers the vast
053     * majority of cases.
054     */
055    public static final int CUT_SIMPLE = 0;
056
057    /**
058     * <code>CUT_COMPOUND</code> a cut type where the enzyme cuts in
059     * two positions relative to the recognition site.
060     */
061    public static final int CUT_COMPOUND = 1;
062
063    /**
064     * <code>OVERHANG_5PRIME</code> the sticky end type created by
065     * enzymes which leave a 5' overhang (e.g. a stretch of single-stranded
066     * DNA with a free 5' end).
067     */
068    public static final int OVERHANG_5PRIME = 0;
069
070    /**
071     * <code>OVERHANG_3PRIME</code> the sticky end type created by
072     * enzymes which leave a 3' overhang (e.g. a stretch of single-stranded
073     * DNA with a free 3' end).
074     */
075    public static final int OVERHANG_3PRIME = 1;
076
077    /**
078     * <code>BLUNT</code> the end type created by enzymes which leave
079     * a blunt end.
080     */
081    public static final int BLUNT = 2;
082
083    protected String name;
084    protected SymbolList site;
085    protected int cutType;
086    protected int [] dsCutPositions;
087    protected int [] usCutPositions;
088    private double size = 0.0;
089
090    protected String forwardRegex;
091    protected String reverseRegex;
092
093    private String summary;
094
095    private RestrictionEnzyme prototype;
096
097    /**
098     * Creates a new <code>RestrictionEnzyme</code> which cuts within
099     * or downstream of the recognition site. The cut position indices
100     * are <strong>always</strong> in the same coordinate space as the
101     * recognition site. <code>RestrictionEnzyme</code>s are
102     * immutable.
103     *
104     * @param name a <code>String</code> such as EcoRI.
105     * @param site a <code>SymbolList</code> recognition site.
106     * @param dsForward an <code>int</code> index in the forward
107     * strand (the strand conventionally written
108     * <strong>5'</strong>-3') of the recognition site at which the
109     * cut occurs. The cut occurs between this base and the following
110     * one.
111     * @param dsReverse an <code>int</code> index in the reverse
112     * strand (the strand conventionally written
113     * <strong>3'</strong>-5') of the recognition site at which the
114     * cut occurs. The cut occurs between this base and the following
115     * one.
116     *
117     * @exception IllegalAlphabetException if an error occurs.
118     */
119    public RestrictionEnzyme(String name, SymbolList site,
120                             int dsForward, int dsReverse)
121        throws IllegalAlphabetException
122    {
123        this(name, site,
124             null,
125             new int [] { dsForward, dsReverse });
126        cutType = CUT_SIMPLE;
127    }
128
129    /**
130     * Creates a new <code>RestrictionEnzyme</code> of the unusual
131     * type which cuts both upstream and downstream of its recognition
132     * site. The cut position indices are <strong>always</strong> in
133     * the same coordinate space as the recognition site.
134     *
135     * @param name a <code>String</code> such as Bsp24I.
136     * @param site a <code>SymbolList</code> recognition site.
137     * @param usForward an <code>int</code> index in the forward
138     * strand (the strand conventionally written
139     * <strong>5'</strong>-3' upstream of the recognition site at
140     * which the cut occurs. The cut occurs between this base and the
141     * following one.
142     * @param usReverse an <code>int</code> index in the reverse
143     * strand (the strand conventionally written
144     * <strong>3'</strong>-5) upstream of the recognition site at
145     * which the cut occurs. The cut occurs between this base and the
146     * following one.
147     * @param dsForward an <code>int</code> index in the forward
148     * strand (the strand conventionally written
149     * <strong>5'</strong>-3') downstream of the recognition site at
150     * which the cut occurs. The cut occurs between this base and the
151     * following one.
152     * @param dsReverse an <code>int</code> index in the reverse
153     * strand (the strand conventionally written
154     * <strong>3'</strong>-5') downstream of the recognition site at
155     * which the cut occurs. The cut occurs between this base and the
156     * following one.
157     *
158     * @exception IllegalAlphabetException if an error occurs.
159     */
160    public RestrictionEnzyme(String name, SymbolList site,
161                             int usForward, int usReverse,
162                             int dsForward, int dsReverse)
163        throws IllegalAlphabetException
164    {
165        this(name, site,
166             new int [] { usForward, usReverse },
167             new int [] { dsForward, dsReverse });
168        cutType = CUT_COMPOUND;
169    }
170
171    /**
172     * Creates a new <code>RestrictionEnzyme</code>.
173     *
174     * @param name a <code>String</code> name.
175     * @param site a <code>SymbolList</code> site.
176     * @param usCutPositions an <code>int []</code> array of optional
177     * upstream indices.
178     * @param dsCutPositions an <code>int []</code> array of
179     * downstream indices.
180     *
181     * @exception IllegalAlphabetException if an error occurs.
182     */
183    private RestrictionEnzyme(String name, SymbolList site,
184                              int [] usCutPositions,
185                              int [] dsCutPositions)
186        throws IllegalAlphabetException
187    {
188        if (site.getAlphabet() != DNATools.getDNA())
189            throw new IllegalAlphabetException("RestrictionEnzyme site can only be a DNA SymbolList."
190                                               + " A SymbolList using the "
191                                               + site.getAlphabet().getName()
192                                               + " was supplied" );
193        this.name = name;
194        this.site = site;
195        this.usCutPositions = usCutPositions;
196        this.dsCutPositions = dsCutPositions;
197
198        forwardRegex = MotifTools.createRegex(site);
199
200        try
201        {
202            reverseRegex =
203                MotifTools.createRegex(DNATools.reverseComplement(site));
204        }
205        catch (IllegalAlphabetException iae)
206        {
207            throw new BioError("RestrictionEnzyme site was not composed of a complementable Alphabet", iae);
208        }
209
210        StringBuffer sb = new StringBuffer();
211        sb.append(name);
212        sb.append(" ");
213
214        if (usCutPositions != null)
215        {
216            sb.append("(");
217            sb.append(usCutPositions[0]);
218            sb.append("/");
219            sb.append(usCutPositions[1]);
220            sb.append(") ");
221        }
222
223        try
224        {
225            for (int i = 1; i <= site.length(); i++)
226                sb.append(Character.toUpperCase(DNATools.dnaToken(site.symbolAt(i))));
227        }
228        catch (IllegalSymbolException ise)
229        {
230            throw new BioError("RestrictionEnzyme site contained non-DNA Symbol", ise);
231        }
232
233        sb.append(" (");
234        sb.append(dsCutPositions[0]);
235        sb.append("/");
236        sb.append(dsCutPositions[1]);
237        sb.append(")");
238
239        summary = sb.substring(0);
240    }
241
242    /**
243     * <code>getName</code> returns the enzyme name.
244     *
245     * @return a <code>String</code>.
246     */
247    public String getName()
248    {
249        return name;
250    }
251
252    /**
253     * <code>getRecognitionSite</code> returns the forward strand of
254     * the recognition site.
255     *
256     * @return a <code>SymbolList</code>.
257     */
258    public SymbolList getRecognitionSite()
259    {
260        return site;
261    }
262
263    /**
264     * <code>getForwardRegex</code> returns a regular expression which
265     * matches the forward strand of the recognition site.
266     *
267     * @return a <code>String</code>.
268     */
269    public String getForwardRegex()
270    {
271        return forwardRegex;
272    }
273
274    /**
275     * <code>getReverseRegex</code> returns a regular expression which
276     * matches the reverse strand of the recognition site.
277     *
278     * @return a <code>String</code>.
279     */
280    public String getReverseRegex()
281    {
282        return reverseRegex;
283    }
284
285    /**
286     * <code>isPalindromic</code> returns true if the recognition site
287     * is palindromic.
288     *
289     * @return a <code>boolean</code>.
290     */
291    public boolean isPalindromic()
292    {
293        return forwardRegex.equals(reverseRegex);
294    }
295
296    /**
297     * <code>getCutType</code> returns the type of cut produced by the
298     * enzyme. This will be one of either RestrictionEnzyme.CUT_SIMPLE
299     * (where it cuts in one position relative to the recognition site
300     * i.e. the vast majority of cases) or
301     * RestrictionEnzyme.CUT_COMPOUND (where it cuts in two positions).
302     *
303     * @return an <code>int</code>.
304     */
305    public int getCutType()
306    {
307        return cutType;
308    }
309
310    /**
311     * <code>getDownstreamCut</code> returns the cut site within or
312     * downstream of the recognition site.
313     *
314     * @return an <code>int []</code> array with the position in the
315     * 5'-strand at index 0 and the 3'-strand at index 1.
316     */
317    public int [] getDownstreamCut()
318    {
319        return dsCutPositions;
320    }
321
322    /**
323     * <code>getUpstreamCut</code> returns the cut site upstream of
324     * the recognition site.
325     *
326     * @return an <code>int []</code> array with the position in the
327     * 5'-strand at index 0 and the 3'-strand at index 1. For example,
328     * Bsp24I will return -8 and -13:
329     *
330     *          5'      ^NNNNNNNNGACNNNNNNTGGNNNNNNNNNNNN^   3'
331     *          3' ^NNNNNNNNNNNNNCTGNNNNNNACCNNNNNNN^        5'
332     *
333     * @exception BioException if the enzyme does not cleave on both
334     * sides of its recognition site.
335     */
336    public int [] getUpstreamCut() throws BioException
337    {
338        if (cutType == CUT_SIMPLE)
339            throw new BioException(name + " does not cut upstream of the recognition site");
340
341        return usCutPositions;
342    }
343
344    /**
345     * <code>getDownstreamEndType</code> returns the double-stranded
346     * end type produced by the primary (intra-site or downstream)
347     * cut.
348     *
349     * @return an <code>int</code> equal to one of the constant fields
350     * OVERHANG_5PRIME, OVERHANG_3PRIME or BLUNT.
351     */
352    public int getDownstreamEndType()
353    {
354        if (dsCutPositions[0] > dsCutPositions[1])
355            return OVERHANG_3PRIME;
356        else if (dsCutPositions[0] < dsCutPositions[1])
357            return OVERHANG_5PRIME;
358        else
359            return BLUNT;
360    }
361
362    /**
363     * <code>getUpstreamEndType</code> returns the double-stranded end
364     * type produced by the secondary (upstream) cut.
365     *
366     * @return an <code>int</code> equal to one of the constant fields
367     * OVERHANG_5PRIME, OVERHANG_3PRIME or BLUNT.
368     *
369     * @exception BioException if the enzyme does not cleave on both
370     * sides of its recognition site.
371     */
372    public int getUpstreamEndType() throws BioException
373    {
374        if (cutType == CUT_SIMPLE)
375            throw new BioException(name + " does not cut upstream of the recognition site");
376
377        if (usCutPositions[0] > usCutPositions[1])
378            return OVERHANG_3PRIME;
379        else if (usCutPositions[0] < usCutPositions[1])
380            return OVERHANG_5PRIME;
381        else
382            return BLUNT;
383    }
384
385    /** Set the prototype of this <code>RestrictionEnzyme</code>.
386     *
387     * @param proto an isoschizomer of this enzyme.
388     */
389    public void setProtype(RestrictionEnzyme proto) {
390        prototype = proto;
391    }
392
393    /** The prototype is a <code>RestrictionEnzyme</code> that represents a set
394     * of isoshizomers. The choice of the representative/prototype is arbitrary;
395     * there is one and only one prototype per set of
396     * isoschizomers.
397     *
398     * @return A representative isoschisomer or null if prototypes are not defined.
399     */
400    public RestrictionEnzyme getPrototype() {
401        return prototype;
402    }
403
404    public boolean isPrototype() {
405        if(prototype==null)
406            return false;
407        return this==prototype;
408    }
409
410    /** The cutting size of a restriction enzyme is defined has the number
411     * of nucleotides that are directly involved in the recognition sequence.
412     * The size is ponderated as follow: 1 for a single nucleotide, 1/2
413     * for a degeneracy of 2, 1/4 for a degeneracy of 3, and 0 for any N nucleotides.
414     */
415    public synchronized double getCuttingSize() {
416        if(size == 0) {
417            SymbolList symbols = getRecognitionSite();
418            double tempsize = 0;
419            for (int i = 1; i <= symbols.length(); i++) {
420                Symbol s = symbols.symbolAt(i);
421                FiniteAlphabet a = (FiniteAlphabet) s.getMatches();
422                int cs = a.size();
423                if(cs==1)
424                    tempsize++;
425                else if(cs==2)
426                    tempsize += 0.5;
427                else if(cs==3)
428                    tempsize += 0.25;
429            }
430            size = tempsize;
431        }
432      return size;
433    }
434
435    public int hashCode()
436    {
437        return name.hashCode() ^ forwardRegex.hashCode();
438    }
439
440    public boolean equals(Object o)
441    {
442        return (o instanceof RestrictionEnzyme)
443            && name.equals(((RestrictionEnzyme) o).getName());
444    }
445
446    public String toString()
447    {
448        return summary;
449    }
450}