001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.molbio; 023 024import java.io.BufferedReader; 025import java.io.InputStream; 026import java.io.InputStreamReader; 027import java.util.ArrayList; 028import java.util.Collections; 029import java.util.HashMap; 030import java.util.HashSet; 031import java.util.Iterator; 032import java.util.List; 033import java.util.Map; 034import java.util.ResourceBundle; 035import java.util.Set; 036import java.util.regex.Pattern; 037 038import org.biojava.bio.Annotation; 039import org.biojava.bio.BioError; 040import org.biojava.bio.SmallAnnotation; 041import org.biojava.bio.program.tagvalue.ChangeTable; 042import org.biojava.bio.program.tagvalue.LineSplitParser; 043import org.biojava.bio.program.tagvalue.Parser; 044import org.biojava.bio.program.tagvalue.RegexSplitter; 045import org.biojava.bio.program.tagvalue.TagDropper; 046import org.biojava.bio.program.tagvalue.TagValueContext; 047import org.biojava.bio.program.tagvalue.TagValueListener; 048import org.biojava.bio.program.tagvalue.TagValueParser; 049import org.biojava.bio.program.tagvalue.ValueChanger; 050import org.biojava.bio.seq.DNATools; 051import org.biojava.bio.symbol.IllegalAlphabetException; 052import org.biojava.bio.symbol.IllegalSymbolException; 053import org.biojava.bio.symbol.SymbolList; 054import org.biojava.utils.ChangeListener; 055import org.biojava.utils.ChangeType; 056import org.biojava.utils.ChangeVetoException; 057import org.biojava.utils.ParserException; 058import org.biojava.utils.SmallSet; 059 060/** 061 * <p><code>RestrictionEnzymeManager</code> manages collections of 062 * static <code>RestrictionEnzyme</code> instances. A properties file 063 * should be placed in the CLASSPATH containing a key 064 * "rebase.data.file" and a corresponding value of a REBASE file 065 * (standard REBASE format #31 conventionally named withrefm.### where 066 * ### is the version number). This file will be loaded by the 067 * <code>RestrictionEnzymeManager</code> <code>ClassLoader</code>. The 068 * properties are loaded as a <code>ResourceBundle</code>, so the file 069 * should be named "RestrictionEnzymeManager.properties".</p> 070 * <p>Since 1.5, a format #31 REBASE file can be loaded at anytime 071 * using the method <code>loadEnzymeFile</code> and optionally filtered 072 * for commercially available enzymes.</p> 073 * 074 * @author Keith James 075 * @author George Waldon 076 * @since 1.3 077 078 */ 079public final class RestrictionEnzymeManager 080{ 081 /** 082 * <code>REBASE_DATA_KEY</code> the ResourceBundle key which 083 * specifies the location of the REBASE flat file. 084 */ 085 public static final String REBASE_DATA_KEY = "rebase.data.file"; 086 087 /** 088 * <code>REBASE_TAG_NAME</code> the REBASE tag containing the 089 * enzyme name. 090 */ 091 public static final String REBASE_TAG_NAME = "<1>"; 092 093 /** 094 * <code>REBASE_TAG_ISZR</code> the REBASE tag containing the 095 * enzyme isoschizomers. 096 */ 097 public static final String REBASE_TAG_ISZR = "<2>"; 098 099 /** 100 * <code>REBASE_TAG_SITE</code> the REBASE tag containing the 101 * enzyme site. 102 */ 103 public static final String REBASE_TAG_SITE = "<3>"; 104 105 /** 106 * <code>REBASE_TAG_METH</code> the REBASE tag containing the 107 * methylation site. 108 */ 109 public static final String REBASE_TAG_METH = "<4>"; 110 111 /** 112 * <code>REBASE_TAG_ORGN</code> the REBASE tag containing the 113 * organism. 114 */ 115 public static final String REBASE_TAG_ORGN = "<5>"; 116 117 /** 118 * <code>REBASE_TAG_SRCE</code> the REBASE tag containing the 119 * source. 120 */ 121 public static final String REBASE_TAG_SRCE = "<6>"; 122 123 /** 124 * <code>REBASE_TAG_COMM</code> the REBASE tag containing the 125 * commercial suppliers. 126 */ 127 public static final String REBASE_TAG_COMM = "<7>"; 128 129 /** 130 * <code>REBASE_TAG_REFS</code> the REBASE tag containing the 131 * references. 132 */ 133 public static final String REBASE_TAG_REFS = "<8>"; 134 135 136 private static boolean loadCommercialOnly = false; 137 138 private static ResourceBundle bundle = 139 ResourceBundle.getBundle(RestrictionEnzymeManager.class.getName()); 140 141 static 142 { 143 String rebaseDataFileName = bundle.getString(REBASE_DATA_KEY); 144 InputStream is = RestrictionEnzymeManager.class.getResourceAsStream(rebaseDataFileName); 145 loadData(is); 146 } 147 148 private static Map nameToSite; 149 private static Map nameToEnzyme; 150 private static Map nameToIsoschizomers; 151 private static Map sizeToCutters; 152 private static Map enzymeToPattern; 153 private static Map enzymeToAnnotation; 154 private static Map enzymeToSuppliers; 155 156 /** 157 * <code>RestrictionEnzymeManager</code> is a static utility 158 * method class and no instances should be created. 159 */ 160 private RestrictionEnzymeManager() { } 161 162 /** 163 * <code>loadEnzymeFile</code> loads a new REBASE file (or any file using 164 * REBASE format #31). 165 * 166 * @param is an InputStream over the file to load. 167 * @param commercialOnly indicates whether or not only commercially available 168 * enzymes are loaded. 169 * 170 * @since 1.5 171 */ 172 public static synchronized void loadEnzymeFile(InputStream is, boolean commercialOnly) { 173 loadCommercialOnly = commercialOnly; 174 loadData(is); 175 } 176 177 /** 178 * <code>getAllEnzymes</code> returns an unmodifable set of all 179 * available enzymes. 180 * 181 * @return a <code>Set</code> of <code>RestrictionEnzyme</code>s. 182 */ 183 public static Set getAllEnzymes() 184 { 185 return Collections.unmodifiableSet(enzymeToPattern.keySet()); 186 } 187 188 /** 189 * <code>getEnzyme</code> returns an enzyme by name. 190 * 191 * @param name a <code>String</code> such as EcoRI, case 192 * sensitive. 193 * 194 * @return a <code>RestrictionEnzyme</code>. 195 */ 196 public static RestrictionEnzyme getEnzyme(String name) 197 { 198 if (! nameToEnzyme.containsKey(name)) 199 throw new IllegalArgumentException("Unknown RestrictionEnzyme name '" 200 + name 201 + "'"); 202 203 return (RestrictionEnzyme) nameToEnzyme.get(name); 204 } 205 206 /** 207 * <code>getIsoschizomers</code> returns an unmodifable set of the 208 * isoschizomers of this enzyme. 209 * 210 * @param name a <code>String</code> such as EcoRI, case 211 * sensitive. 212 * 213 * @return a <code>Set</code> of <code>RestrictionEnzyme</code>s. 214 */ 215 public static Set getIsoschizomers(String name) 216 { 217 if (! nameToIsoschizomers.containsKey(name)) 218 throw new IllegalArgumentException("Unknown RestrictionEnzyme name '" 219 + name 220 + "'"); 221 Set result = (Set) nameToIsoschizomers.get(name); 222 if(result.contains(null)) 223 return Collections.EMPTY_SET; 224 return Collections.unmodifiableSet(result); 225 } 226 227 /** 228 * <code>getRecognitionSequence</code> returns a string that describes 229 * the recognition site of this enzyme. It corresponds to the field <3> 230 * of the REBASE file. 231 * 232 * @param name a <code>String</code> such as EcoRI, case 233 * sensitive. 234 * @return a <code>String</code> describing the recognition sequence, 235 * e.g. "G^AATTC" for EcoRI. 236 * @since 1.5 237 */ 238 public static String getRecognitionSequence(String name) 239 { 240 if (! nameToSite.containsKey(name)) 241 throw new IllegalArgumentException("Unknown RestrictionEnzyme name '" 242 + name 243 + "'"); 244 return (String) nameToSite.get(name); 245 } 246 247 /** 248 * <code>getNCutters</code> returns an unmodifable set of all 249 * enzymes with a cut site of size n. 250 * 251 * @param n an <code>int</code> cut site size. 252 * 253 * @return a <code>Set</code> of <code>RestrictionEnzyme</code>s. 254 */ 255 public static Set getNCutters(int n) 256 { 257 Integer size = new Integer(n); 258 if (! sizeToCutters.containsKey(size)) 259 return Collections.EMPTY_SET; 260 261 return Collections.unmodifiableSet((Set) sizeToCutters.get(size)); 262 } 263 264 /** 265 * <code>getPatterns</code> returns two <code>Pattern</code> 266 * objects for an enzyme, one matches the forward strand and one 267 * the reverse. This enables searching of both strands of a 268 * sequence without reverse-complementing it. As 269 * <code>Pattern</code> objects are thread-safe these may be used 270 * for all searches. 271 * 272 * @param enzyme a <code>RestrictionEnzyme</code>. 273 * 274 * @return a <code>Pattern []</code> array with the forward strand 275 * <code>Pattern</code> at index 0 and the reverse at index 1. 276 */ 277 public static Pattern [] getPatterns(RestrictionEnzyme enzyme) 278 { 279 if (! enzymeToPattern.containsKey(enzyme)) 280 throw new IllegalArgumentException("RestrictionEnzyme '" 281 + enzyme.getName() 282 + "' is not registered. No precompiled Pattern is available"); 283 284 return (Pattern []) enzymeToPattern.get(enzyme); 285 } 286 287 /** 288 * <code>getAnnotation</code> returns an immutable, static 289 * annotation describing the enzyme. This is suitable for adding 290 * to <code>Feature</code>s which represent restriction sites. The 291 * annotation produced currently contains one key "dbxref" in line 292 * with the GenBank/EMBL qualifier for the "misc_binding" feature 293 * key. The key has a corresponding value "REBASE:<enzyme 294 * name>". 295 * 296 * @param enzyme a <code>RestrictionEnzyme</code>. 297 * 298 * @return an <code>Annotation</code>. 299 */ 300 public static Annotation getAnnotation(RestrictionEnzyme enzyme) 301 { 302 if (! enzymeToAnnotation.containsKey(enzyme)) 303 throw new IllegalArgumentException("RestrictionEnzyme '" 304 + enzyme.getName() 305 + "' is not registered. No Annotation is available"); 306 307 return (Annotation) enzymeToAnnotation.get(enzyme); 308 } 309 310 311 /** 312 * <code>getSuppliers</code> returns a string describing the suppliers 313 * of this enzyme according to REBASE encoding for commercial sources 314 * or an empty String if the enzyme is not commecially available. 315 * 316 * <P>REBASE #31 version 604 code: </P> 317 * <P>A GE Healthcare (8/05) <BR> 318 * B Invitrogen Corporation(8/05)<BR> 319 * C Minotech Biotechnology (9/05)<BR> 320 * E Stratagene (9/05)<BR> 321 * F Fermentas International Inc. (2/06)<BR> 322 * G Qbiogene (9/05)<BR> 323 * H American Allied Biochemical, Inc. (9/05)<BR> 324 * I SibEnzyme Ltd. (2/06)<BR> 325 * J Nippon Gene Co., Ltd. (8/05)<BR> 326 * K Takara Bio Inc. (9/05)<BR> 327 * M Roche Applied Science (8/05)<BR> 328 * N New England Biolabs (2/06)<BR> 329 * O Toyobo Biochemicals (9/05)<BR> 330 * Q Molecular Biology Resources (8/05)<BR> 331 * R Promega Corporation (9/05)<BR> 332 * S Sigma Chemical Corporation (9/05)<BR> 333 * U Bangalore Genei (9/05)<BR> 334 * V Vivantis Technologies (1/06)<BR> 335 * X EURx Ltd. (9/05)<BR> 336 * Y CinnaGen Inc. (9/05) 337 * </P> 338 * 339 * @param enzyme a <code>RestrictionEnzyme</code>. 340 * 341 * @return a <code>String</code>. 342 * @since 1.5 343 */ 344 public static String getSuppliers(RestrictionEnzyme enzyme) 345 { 346 if (! enzymeToSuppliers.containsKey(enzyme)) 347 return ""; 348 return (String) enzymeToSuppliers.get(enzyme); 349 } 350 351 /** 352 * <code>register</code> regisiters a new 353 * <code>RestrictionEnzyme</code> with the manager. It does not 354 * check that the isoschizomers are known to the manager. If there 355 * are custom isoschizomers in the <code>Set</code>, they should 356 * be also be registered. 357 * 358 * @param enzyme a <code>RestrictionEnzyme</code> to register. 359 * 360 * @param isoschizomers a <code>Set</code> of 361 * <code>RestrictionEnzyme</code>s which are isoschizomers. 362 */ 363 public synchronized static void register(RestrictionEnzyme enzyme, 364 Set isoschizomers) 365 { 366 for (Iterator ii = isoschizomers.iterator(); ii.hasNext();) 367 { 368 Object o = ii.next(); 369 370 if (! (o instanceof RestrictionEnzyme)) 371 { 372 throw new IllegalArgumentException("Isoschizomers set may contain only RestrictionEnzymes. Found '" 373 + o 374 + "'"); 375 } 376 } 377 378 registerEnzyme(enzyme); 379 380 String name = enzyme.getName(); 381 nameToIsoschizomers.put(name, isoschizomers); 382 } 383 384 /** 385 * <code>registerEnzyme</code> registers an enzyme, but does not 386 * populate its isoschizomers. This is because registering the 387 * contents of a REBASE file and registering a custom enzyme 388 * handle addition of isoschizomers differently, but both use this 389 * method for all other registration functions. 390 * 391 * @param enzyme a <code>RestrictionEnzyme</code>. 392 */ 393 private static void registerEnzyme(RestrictionEnzyme enzyme) 394 { 395 String name = enzyme.getName(); 396 nameToEnzyme.put(name, enzyme); 397 398 Integer sizeKey = new Integer(enzyme.getRecognitionSite().length()); 399 if (sizeToCutters.containsKey(sizeKey)) 400 { 401 Set s = (Set) sizeToCutters.get(sizeKey); 402 s.add(enzyme); 403 } 404 else 405 { 406 Set s = new HashSet(); 407 s.add(enzyme); 408 sizeToCutters.put(sizeKey, s); 409 } 410 411 Pattern forward = Pattern.compile(enzyme.getForwardRegex()); 412 Pattern reverse = Pattern.compile(enzyme.getReverseRegex()); 413 enzymeToPattern.put(enzyme, new Pattern [] { forward, reverse }); 414 415 Annotation annotation = new SmallAnnotation(); 416 try 417 { 418 annotation.setProperty("dbxref", "REBASE:" + name); 419 } 420 catch (ChangeVetoException cve) 421 { 422 throw new BioError("Assertion Failure: failed to modify Annotation", cve); 423 } 424 425 annotation.addChangeListener(ChangeListener.ALWAYS_VETO,ChangeType.UNKNOWN); 426 enzymeToAnnotation.put(enzyme, annotation); 427 } 428 429 private static void loadData(InputStream is) { 430 nameToSite = new HashMap(); 431 nameToEnzyme = new HashMap(); 432 nameToIsoschizomers = new HashMap(); 433 sizeToCutters = new HashMap(); 434 enzymeToPattern = new HashMap(); 435 enzymeToAnnotation = new HashMap(); 436 enzymeToSuppliers = new HashMap(); 437 try { 438 BufferedReader br = new BufferedReader(new InputStreamReader(is)); 439 440 // Basic linesplit parser 441 LineSplitParser lsParser = new LineSplitParser(); 442 lsParser.setEndOfRecord(TagValueParser.EMPTY_LINE_EOR); 443 lsParser.setSplitOffset(3); 444 lsParser.setContinueOnEmptyTag(true); 445 lsParser.setMergeSameTag(true); 446 447 // The end of the chain 448 RebaseEnzymeBuilder builder = new RebaseEnzymeBuilder(); 449 450 // Create isoschizomer value splitter 451 RegexSplitter iso = 452 new RegexSplitter(Pattern.compile("([^,]+)"), 1); 453 // Create site value splitter 454 RegexSplitter site = 455 new RegexSplitter(Pattern.compile("(\\(-?\\d+/-?\\d+\\)|[A-Za-z^]+)"), 1); 456 457 ChangeTable table = new ChangeTable(); 458 table.setSplitter(REBASE_TAG_ISZR, iso); 459 table.setSplitter(REBASE_TAG_SITE, site); 460 ValueChanger changer = new ValueChanger(builder, table); 461 462 // Filter tags 463 TagDropper rebaseTags = new TagDropper(changer); 464 // Retain the enzyme name 465 rebaseTags.addTag(REBASE_TAG_NAME); 466 // Retain isoschizomers 467 rebaseTags.addTag(REBASE_TAG_ISZR); 468 // Retain recognition sequence 469 rebaseTags.addTag(REBASE_TAG_SITE); 470 // Retain commercial supplier 471 rebaseTags.addTag(REBASE_TAG_COMM); 472 473 474 475 Parser parser = new Parser(); 476 while (parser.read(br, lsParser, rebaseTags)) 477 { 478 continue; 479 } 480 481 // Replace isoschizomer names with RestrictionEnzymes 482 Map tempMap = new HashMap(); 483 Set tempSet = null; 484 for (Iterator ni = nameToIsoschizomers.keySet().iterator(); ni.hasNext();) 485 { 486 Object name = ni.next(); 487 Set isoschizomers = (Set) nameToIsoschizomers.get(name); 488 489 if (isoschizomers.size() == 0) 490 tempSet = Collections.EMPTY_SET; 491 else 492 tempSet = (Set) isoschizomers.getClass().newInstance(); 493 494 tempMap.put(name, tempSet); 495 496 for (Iterator ii = isoschizomers.iterator(); ii.hasNext();) { 497 String isoName = (String) ii.next(); 498 Object re = nameToEnzyme.get(isoName); 499 //bug fix suggested by George Waldon 500 if(re!=null) 501 tempSet.add(re); 502 } 503 } 504 505 nameToIsoschizomers = tempMap; 506 } 507 catch (Exception e) 508 { 509 throw new BioError("Failed to read REBASE data file",e); 510 } 511 } 512 513 /** 514 * <code>RebaseEnzymeBuilder</code> creates enzyme instances and 515 * populates the maps. 516 */ 517 private static class RebaseEnzymeBuilder implements TagValueListener 518 { 519 private String recseq; 520 private String name; 521 private Set isoschizomers; 522 private List isoBuffer; 523 private SymbolList site; 524 private int [] usCutPositions; 525 private int [] dsCutPositions; 526 private boolean isCommerciallyAvailable; 527 528 private String tagState; 529 private String suppliers; 530 private boolean unknownSite; 531 532 RebaseEnzymeBuilder() { } 533 534 public void startRecord() throws ParserException 535 { 536 isoBuffer = new ArrayList(30); 537 recseq = ""; 538 site = null; 539 dsCutPositions = null; 540 usCutPositions = null; 541 unknownSite = false; 542 isCommerciallyAvailable = false; 543 } 544 545 public void endRecord() throws ParserException 546 { 547 if (! getRecordState()) 548 return; 549 if (unknownSite || site == null) 550 return; 551 552 int isoCount = isoBuffer.size(); 553 if (isoCount < 30) 554 { 555 isoschizomers = new SmallSet(isoCount); 556 for (int i = 0; i < isoCount; i++) 557 isoschizomers.add(isoBuffer.get(i)); 558 } 559 else 560 { 561 isoschizomers = new HashSet(isoBuffer); 562 } 563 564 if(!loadCommercialOnly || isCommerciallyAvailable) { 565 RestrictionEnzyme re = createEnzyme(); 566 registerEnzyme(re); 567 nameToIsoschizomers.put(name, isoschizomers); 568 enzymeToSuppliers.put(re,suppliers); 569 nameToSite.put(name,recseq); 570 } 571 } 572 573 public void startTag(Object tag) throws ParserException 574 { 575 tagState = (String) tag; 576 } 577 578 public void endTag() throws ParserException { } 579 580 public void value(TagValueContext context, Object value) 581 throws ParserException 582 { 583 if (tagState.equals(REBASE_TAG_NAME)) 584 name = (String) value; 585 else if (tagState.equals(REBASE_TAG_ISZR)) 586 isoBuffer.add(value); 587 else if (tagState.equals(REBASE_TAG_SITE)) { 588 recseq += (String) value; 589 processSite(value); 590 } else if (tagState.equals(REBASE_TAG_COMM)) 591 processSuppliers(value); 592 else 593 throw new ParserException("Unable to handle value for tag '" 594 + tagState 595 + "'"); 596 } 597 598 boolean getRecordState() 599 { 600 return tagState != null; 601 } 602 603 RestrictionEnzyme createEnzyme() 604 { 605 RestrictionEnzyme enzyme = null; 606 607 try 608 { 609 if (usCutPositions != null) 610 { 611 enzyme = new RestrictionEnzyme(name, site, 612 usCutPositions[0], 613 usCutPositions[1], 614 dsCutPositions[0], 615 dsCutPositions[1]); 616 } 617 else 618 { 619 enzyme = new RestrictionEnzyme(name, site, 620 dsCutPositions[0], 621 dsCutPositions[1]); 622 } 623 } 624 catch (IllegalAlphabetException iae) 625 { 626 throw new BioError("New DNA SymbolList no longer consists on DNA Alphabet",iae); 627 } 628 629 return enzyme; 630 } 631 632 private void processSuppliers(Object value) throws ParserException { 633 suppliers = (String) value; 634 if(suppliers.length()!=0) 635 isCommerciallyAvailable = true; 636 } 637 638 private void processSite(Object value) throws ParserException 639 { 640 StringBuffer sb = new StringBuffer((String) value); 641 int div, forIdx, revIdx; 642 643 // REBASE marks enzymes whose site is not known with '?' 644 if (sb.charAt(0) == '?') 645 { 646 unknownSite = true; 647 return; 648 } 649 650 if (sb.charAt(0) == '(') 651 { 652 // Index separator 653 div = sb.indexOf("/"); 654 655 try 656 { 657 forIdx = Integer.parseInt(sb.substring(1, div)); 658 revIdx = Integer.parseInt(sb.substring(div + 1, 659 sb.length() - 1)); 660 } 661 catch (NumberFormatException nfe) 662 { 663 throw new ParserException("Failed to parse cut site index",nfe); 664 } 665 666 // Indices before the site indicate a double cutter 667 if (site == null) 668 { 669 usCutPositions = new int [2]; 670 usCutPositions[0] = -forIdx; 671 usCutPositions[1] = -revIdx; 672 } 673 else 674 { 675 dsCutPositions = new int [2]; 676 dsCutPositions[0] = forIdx + site.length(); 677 dsCutPositions[1] = revIdx + site.length(); 678 } 679 } 680 else 681 { 682 // Explicit cut site marker 683 int cut = sb.indexOf("^"); 684 dsCutPositions = new int [2]; 685 686 try 687 { 688 if (cut == -1) 689 { 690 site = DNATools.createDNA(sb.substring(0)); 691 dsCutPositions[0] = 1; 692 dsCutPositions[1] = 1; 693 } 694 else 695 { 696 sb.deleteCharAt(cut); 697 site = DNATools.createDNA(sb.substring(0)); 698 dsCutPositions[0] = cut; 699 dsCutPositions[1] = site.length() - cut; 700 } 701 } 702 catch (IllegalSymbolException iae) 703 { 704 throw new ParserException("Illegal DNA symbol in recognition site",iae); 705 } 706 } 707 } 708 } 709}