001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.molbio; 023 024import java.io.Serializable; 025 026import org.biojava.bio.BioError; 027import org.biojava.bio.BioException; 028import org.biojava.bio.seq.DNATools; 029import org.biojava.bio.symbol.FiniteAlphabet; 030import org.biojava.bio.symbol.IllegalAlphabetException; 031import org.biojava.bio.symbol.IllegalSymbolException; 032import org.biojava.bio.symbol.MotifTools; 033import org.biojava.bio.symbol.Symbol; 034import org.biojava.bio.symbol.SymbolList; 035 036/** 037 * <code>RestrictionEnzyme</code> represents a restriction enzyme 038 * according to the REBASE standard. The cut positions are indicated 039 * relative to the 5' end of the recognition site and occur downstream 040 * of the given residue. Note that some enzymes cut in more than one 041 * position and that cut positions may occur outside the recognition 042 * site. 043 * 044 * @author Keith James 045 * @author George Waldon 046 * @since 1.3 047 */ 048public class RestrictionEnzyme implements Serializable 049{ 050 /** 051 * <code>CUT_SIMPLE</code> a cut type where the enzyme cuts in one 052 * position relative to the recognition site. This covers the vast 053 * majority of cases. 054 */ 055 public static final int CUT_SIMPLE = 0; 056 057 /** 058 * <code>CUT_COMPOUND</code> a cut type where the enzyme cuts in 059 * two positions relative to the recognition site. 060 */ 061 public static final int CUT_COMPOUND = 1; 062 063 /** 064 * <code>OVERHANG_5PRIME</code> the sticky end type created by 065 * enzymes which leave a 5' overhang (e.g. a stretch of single-stranded 066 * DNA with a free 5' end). 067 */ 068 public static final int OVERHANG_5PRIME = 0; 069 070 /** 071 * <code>OVERHANG_3PRIME</code> the sticky end type created by 072 * enzymes which leave a 3' overhang (e.g. a stretch of single-stranded 073 * DNA with a free 3' end). 074 */ 075 public static final int OVERHANG_3PRIME = 1; 076 077 /** 078 * <code>BLUNT</code> the end type created by enzymes which leave 079 * a blunt end. 080 */ 081 public static final int BLUNT = 2; 082 083 protected String name; 084 protected SymbolList site; 085 protected int cutType; 086 protected int [] dsCutPositions; 087 protected int [] usCutPositions; 088 private double size = 0.0; 089 090 protected String forwardRegex; 091 protected String reverseRegex; 092 093 private String summary; 094 095 private RestrictionEnzyme prototype; 096 097 /** 098 * Creates a new <code>RestrictionEnzyme</code> which cuts within 099 * or downstream of the recognition site. The cut position indices 100 * are <strong>always</strong> in the same coordinate space as the 101 * recognition site. <code>RestrictionEnzyme</code>s are 102 * immutable. 103 * 104 * @param name a <code>String</code> such as EcoRI. 105 * @param site a <code>SymbolList</code> recognition site. 106 * @param dsForward an <code>int</code> index in the forward 107 * strand (the strand conventionally written 108 * <strong>5'</strong>-3') of the recognition site at which the 109 * cut occurs. The cut occurs between this base and the following 110 * one. 111 * @param dsReverse an <code>int</code> index in the reverse 112 * strand (the strand conventionally written 113 * <strong>3'</strong>-5') of the recognition site at which the 114 * cut occurs. The cut occurs between this base and the following 115 * one. 116 * 117 * @exception IllegalAlphabetException if an error occurs. 118 */ 119 public RestrictionEnzyme(String name, SymbolList site, 120 int dsForward, int dsReverse) 121 throws IllegalAlphabetException 122 { 123 this(name, site, 124 null, 125 new int [] { dsForward, dsReverse }); 126 cutType = CUT_SIMPLE; 127 } 128 129 /** 130 * Creates a new <code>RestrictionEnzyme</code> of the unusual 131 * type which cuts both upstream and downstream of its recognition 132 * site. The cut position indices are <strong>always</strong> in 133 * the same coordinate space as the recognition site. 134 * 135 * @param name a <code>String</code> such as Bsp24I. 136 * @param site a <code>SymbolList</code> recognition site. 137 * @param usForward an <code>int</code> index in the forward 138 * strand (the strand conventionally written 139 * <strong>5'</strong>-3' upstream of the recognition site at 140 * which the cut occurs. The cut occurs between this base and the 141 * following one. 142 * @param usReverse an <code>int</code> index in the reverse 143 * strand (the strand conventionally written 144 * <strong>3'</strong>-5) upstream of the recognition site at 145 * which the cut occurs. The cut occurs between this base and the 146 * following one. 147 * @param dsForward an <code>int</code> index in the forward 148 * strand (the strand conventionally written 149 * <strong>5'</strong>-3') downstream of the recognition site at 150 * which the cut occurs. The cut occurs between this base and the 151 * following one. 152 * @param dsReverse an <code>int</code> index in the reverse 153 * strand (the strand conventionally written 154 * <strong>3'</strong>-5') downstream of the recognition site at 155 * which the cut occurs. The cut occurs between this base and the 156 * following one. 157 * 158 * @exception IllegalAlphabetException if an error occurs. 159 */ 160 public RestrictionEnzyme(String name, SymbolList site, 161 int usForward, int usReverse, 162 int dsForward, int dsReverse) 163 throws IllegalAlphabetException 164 { 165 this(name, site, 166 new int [] { usForward, usReverse }, 167 new int [] { dsForward, dsReverse }); 168 cutType = CUT_COMPOUND; 169 } 170 171 /** 172 * Creates a new <code>RestrictionEnzyme</code>. 173 * 174 * @param name a <code>String</code> name. 175 * @param site a <code>SymbolList</code> site. 176 * @param usCutPositions an <code>int []</code> array of optional 177 * upstream indices. 178 * @param dsCutPositions an <code>int []</code> array of 179 * downstream indices. 180 * 181 * @exception IllegalAlphabetException if an error occurs. 182 */ 183 private RestrictionEnzyme(String name, SymbolList site, 184 int [] usCutPositions, 185 int [] dsCutPositions) 186 throws IllegalAlphabetException 187 { 188 if (site.getAlphabet() != DNATools.getDNA()) 189 throw new IllegalAlphabetException("RestrictionEnzyme site can only be a DNA SymbolList." 190 + " A SymbolList using the " 191 + site.getAlphabet().getName() 192 + " was supplied" ); 193 this.name = name; 194 this.site = site; 195 this.usCutPositions = usCutPositions; 196 this.dsCutPositions = dsCutPositions; 197 198 forwardRegex = MotifTools.createRegex(site); 199 200 try 201 { 202 reverseRegex = 203 MotifTools.createRegex(DNATools.reverseComplement(site)); 204 } 205 catch (IllegalAlphabetException iae) 206 { 207 throw new BioError("RestrictionEnzyme site was not composed of a complementable Alphabet", iae); 208 } 209 210 StringBuffer sb = new StringBuffer(); 211 sb.append(name); 212 sb.append(" "); 213 214 if (usCutPositions != null) 215 { 216 sb.append("("); 217 sb.append(usCutPositions[0]); 218 sb.append("/"); 219 sb.append(usCutPositions[1]); 220 sb.append(") "); 221 } 222 223 try 224 { 225 for (int i = 1; i <= site.length(); i++) 226 sb.append(Character.toUpperCase(DNATools.dnaToken(site.symbolAt(i)))); 227 } 228 catch (IllegalSymbolException ise) 229 { 230 throw new BioError("RestrictionEnzyme site contained non-DNA Symbol", ise); 231 } 232 233 sb.append(" ("); 234 sb.append(dsCutPositions[0]); 235 sb.append("/"); 236 sb.append(dsCutPositions[1]); 237 sb.append(")"); 238 239 summary = sb.substring(0); 240 } 241 242 /** 243 * <code>getName</code> returns the enzyme name. 244 * 245 * @return a <code>String</code>. 246 */ 247 public String getName() 248 { 249 return name; 250 } 251 252 /** 253 * <code>getRecognitionSite</code> returns the forward strand of 254 * the recognition site. 255 * 256 * @return a <code>SymbolList</code>. 257 */ 258 public SymbolList getRecognitionSite() 259 { 260 return site; 261 } 262 263 /** 264 * <code>getForwardRegex</code> returns a regular expression which 265 * matches the forward strand of the recognition site. 266 * 267 * @return a <code>String</code>. 268 */ 269 public String getForwardRegex() 270 { 271 return forwardRegex; 272 } 273 274 /** 275 * <code>getReverseRegex</code> returns a regular expression which 276 * matches the reverse strand of the recognition site. 277 * 278 * @return a <code>String</code>. 279 */ 280 public String getReverseRegex() 281 { 282 return reverseRegex; 283 } 284 285 /** 286 * <code>isPalindromic</code> returns true if the recognition site 287 * is palindromic. 288 * 289 * @return a <code>boolean</code>. 290 */ 291 public boolean isPalindromic() 292 { 293 return forwardRegex.equals(reverseRegex); 294 } 295 296 /** 297 * <code>getCutType</code> returns the type of cut produced by the 298 * enzyme. This will be one of either RestrictionEnzyme.CUT_SIMPLE 299 * (where it cuts in one position relative to the recognition site 300 * i.e. the vast majority of cases) or 301 * RestrictionEnzyme.CUT_COMPOUND (where it cuts in two positions). 302 * 303 * @return an <code>int</code>. 304 */ 305 public int getCutType() 306 { 307 return cutType; 308 } 309 310 /** 311 * <code>getDownstreamCut</code> returns the cut site within or 312 * downstream of the recognition site. 313 * 314 * @return an <code>int []</code> array with the position in the 315 * 5'-strand at index 0 and the 3'-strand at index 1. 316 */ 317 public int [] getDownstreamCut() 318 { 319 return dsCutPositions; 320 } 321 322 /** 323 * <code>getUpstreamCut</code> returns the cut site upstream of 324 * the recognition site. 325 * 326 * @return an <code>int []</code> array with the position in the 327 * 5'-strand at index 0 and the 3'-strand at index 1. For example, 328 * Bsp24I will return -8 and -13: 329 * 330 * 5' ^NNNNNNNNGACNNNNNNTGGNNNNNNNNNNNN^ 3' 331 * 3' ^NNNNNNNNNNNNNCTGNNNNNNACCNNNNNNN^ 5' 332 * 333 * @exception BioException if the enzyme does not cleave on both 334 * sides of its recognition site. 335 */ 336 public int [] getUpstreamCut() throws BioException 337 { 338 if (cutType == CUT_SIMPLE) 339 throw new BioException(name + " does not cut upstream of the recognition site"); 340 341 return usCutPositions; 342 } 343 344 /** 345 * <code>getDownstreamEndType</code> returns the double-stranded 346 * end type produced by the primary (intra-site or downstream) 347 * cut. 348 * 349 * @return an <code>int</code> equal to one of the constant fields 350 * OVERHANG_5PRIME, OVERHANG_3PRIME or BLUNT. 351 */ 352 public int getDownstreamEndType() 353 { 354 if (dsCutPositions[0] > dsCutPositions[1]) 355 return OVERHANG_3PRIME; 356 else if (dsCutPositions[0] < dsCutPositions[1]) 357 return OVERHANG_5PRIME; 358 else 359 return BLUNT; 360 } 361 362 /** 363 * <code>getUpstreamEndType</code> returns the double-stranded end 364 * type produced by the secondary (upstream) cut. 365 * 366 * @return an <code>int</code> equal to one of the constant fields 367 * OVERHANG_5PRIME, OVERHANG_3PRIME or BLUNT. 368 * 369 * @exception BioException if the enzyme does not cleave on both 370 * sides of its recognition site. 371 */ 372 public int getUpstreamEndType() throws BioException 373 { 374 if (cutType == CUT_SIMPLE) 375 throw new BioException(name + " does not cut upstream of the recognition site"); 376 377 if (usCutPositions[0] > usCutPositions[1]) 378 return OVERHANG_3PRIME; 379 else if (usCutPositions[0] < usCutPositions[1]) 380 return OVERHANG_5PRIME; 381 else 382 return BLUNT; 383 } 384 385 /** Set the prototype of this <code>RestrictionEnzyme</code>. 386 * 387 * @param proto an isoschizomer of this enzyme. 388 */ 389 public void setProtype(RestrictionEnzyme proto) { 390 prototype = proto; 391 } 392 393 /** The prototype is a <code>RestrictionEnzyme</code> that represents a set 394 * of isoshizomers. The choice of the representative/prototype is arbitrary; 395 * there is one and only one prototype per set of 396 * isoschizomers. 397 * 398 * @return A representative isoschisomer or null if prototypes are not defined. 399 */ 400 public RestrictionEnzyme getPrototype() { 401 return prototype; 402 } 403 404 public boolean isPrototype() { 405 if(prototype==null) 406 return false; 407 return this==prototype; 408 } 409 410 /** The cutting size of a restriction enzyme is defined has the number 411 * of nucleotides that are directly involved in the recognition sequence. 412 * The size is ponderated as follow: 1 for a single nucleotide, 1/2 413 * for a degeneracy of 2, 1/4 for a degeneracy of 3, and 0 for any N nucleotides. 414 */ 415 public synchronized double getCuttingSize() { 416 if(size == 0) { 417 SymbolList symbols = getRecognitionSite(); 418 double tempsize = 0; 419 for (int i = 1; i <= symbols.length(); i++) { 420 Symbol s = symbols.symbolAt(i); 421 FiniteAlphabet a = (FiniteAlphabet) s.getMatches(); 422 int cs = a.size(); 423 if(cs==1) 424 tempsize++; 425 else if(cs==2) 426 tempsize += 0.5; 427 else if(cs==3) 428 tempsize += 0.25; 429 } 430 size = tempsize; 431 } 432 return size; 433 } 434 435 public int hashCode() 436 { 437 return name.hashCode() ^ forwardRegex.hashCode(); 438 } 439 440 public boolean equals(Object o) 441 { 442 return (o instanceof RestrictionEnzyme) 443 && name.equals(((RestrictionEnzyme) o).getName()); 444 } 445 446 public String toString() 447 { 448 return summary; 449 } 450}