001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.molbio;
023
024import java.io.BufferedReader;
025import java.io.InputStream;
026import java.io.InputStreamReader;
027import java.util.ArrayList;
028import java.util.Collections;
029import java.util.HashMap;
030import java.util.HashSet;
031import java.util.Iterator;
032import java.util.List;
033import java.util.Map;
034import java.util.ResourceBundle;
035import java.util.Set;
036import java.util.regex.Pattern;
037
038import org.biojava.bio.Annotation;
039import org.biojava.bio.BioError;
040import org.biojava.bio.SmallAnnotation;
041import org.biojava.bio.program.tagvalue.ChangeTable;
042import org.biojava.bio.program.tagvalue.LineSplitParser;
043import org.biojava.bio.program.tagvalue.Parser;
044import org.biojava.bio.program.tagvalue.RegexSplitter;
045import org.biojava.bio.program.tagvalue.TagDropper;
046import org.biojava.bio.program.tagvalue.TagValueContext;
047import org.biojava.bio.program.tagvalue.TagValueListener;
048import org.biojava.bio.program.tagvalue.TagValueParser;
049import org.biojava.bio.program.tagvalue.ValueChanger;
050import org.biojava.bio.seq.DNATools;
051import org.biojava.bio.symbol.IllegalAlphabetException;
052import org.biojava.bio.symbol.IllegalSymbolException;
053import org.biojava.bio.symbol.SymbolList;
054import org.biojava.utils.ChangeListener;
055import org.biojava.utils.ChangeType;
056import org.biojava.utils.ChangeVetoException;
057import org.biojava.utils.ParserException;
058import org.biojava.utils.SmallSet;
059
060/**
061 * <p><code>RestrictionEnzymeManager</code> manages collections of
062 * static <code>RestrictionEnzyme</code> instances. A properties file
063 * should be placed in the CLASSPATH containing a key
064 * "rebase.data.file" and a corresponding value of a REBASE file
065 * (standard REBASE format #31 conventionally named withrefm.### where
066 * ### is the version number). This file will be loaded by the
067 * <code>RestrictionEnzymeManager</code> <code>ClassLoader</code>. The
068 * properties are loaded as a <code>ResourceBundle</code>, so the file
069 * should be named "RestrictionEnzymeManager.properties".</p>
 * <p>Since 1.5, a format #31 REBASE file can be loaded at any time
 * using the method <code>loadEnzymeFile</code> and optionally filtered
 * for commercially available enzymes.</p>
073 *  
074 * @author Keith James
075 * @author George Waldon
076 * @since 1.3
077
078 */
079public final class RestrictionEnzymeManager
080{
081     /**
082     * <code>REBASE_DATA_KEY</code> the ResourceBundle key which
083     * specifies the location of the REBASE flat file.
084     */
085    public static final String REBASE_DATA_KEY = "rebase.data.file";
086
087    /**
088     * <code>REBASE_TAG_NAME</code> the REBASE tag containing the
089     * enzyme name.
090     */
091    public static final String REBASE_TAG_NAME = "<1>";
092
093    /**
094     * <code>REBASE_TAG_ISZR</code> the REBASE tag containing the
095     * enzyme isoschizomers.
096     */
097    public static final String REBASE_TAG_ISZR = "<2>";
098
099    /**
100     * <code>REBASE_TAG_SITE</code> the REBASE tag containing the
101     * enzyme site.
102     */
103    public static final String REBASE_TAG_SITE = "<3>";
104
105    /**
106     * <code>REBASE_TAG_METH</code> the REBASE tag containing the
107     * methylation site.
108     */
109    public static final String REBASE_TAG_METH = "<4>";
110
111    /**
112     * <code>REBASE_TAG_ORGN</code> the REBASE tag containing the
113     * organism.
114     */
115    public static final String REBASE_TAG_ORGN = "<5>";
116
117    /**
118     * <code>REBASE_TAG_SRCE</code> the REBASE tag containing the
119     * source.
120     */
121    public static final String REBASE_TAG_SRCE = "<6>";
122
123    /**
124     * <code>REBASE_TAG_COMM</code> the REBASE tag containing the
125     * commercial suppliers.
126     */
127    public static final String REBASE_TAG_COMM = "<7>";
128
129    /**
130     * <code>REBASE_TAG_REFS</code> the REBASE tag containing the
131     * references.
132     */
133    public static final String REBASE_TAG_REFS = "<8>";
134    
135    
136    private static boolean loadCommercialOnly = false;
137
138    private static ResourceBundle bundle =
139        ResourceBundle.getBundle(RestrictionEnzymeManager.class.getName());
140
141    static
142    {
143        String rebaseDataFileName = bundle.getString(REBASE_DATA_KEY);
144        InputStream is = RestrictionEnzymeManager.class.getResourceAsStream(rebaseDataFileName);
145        loadData(is);
146    }
147
148    private static Map nameToSite;
149    private static Map nameToEnzyme;
150    private static Map nameToIsoschizomers;
151    private static Map sizeToCutters;
152    private static Map enzymeToPattern;
153    private static Map enzymeToAnnotation;
154    private static Map enzymeToSuppliers;
155
156    /**
157     * <code>RestrictionEnzymeManager</code> is a static utility
158     * method class and no instances should be created.
159     */
160    private RestrictionEnzymeManager() { }
161    
162    /**
163     * <code>loadEnzymeFile</code> loads a new REBASE file (or any file using
164     * REBASE format #31).
165     *
166     * @param is an InputStream over the file to load.
167     * @param commercialOnly indicates whether or not only commercially available 
168     * enzymes are loaded.
169     *
170     * @since 1.5
171     */
172    public static synchronized void loadEnzymeFile(InputStream is, boolean commercialOnly) {
173        loadCommercialOnly = commercialOnly;
174        loadData(is);
175    }
176
177    /**
178     * <code>getAllEnzymes</code> returns an unmodifable set of all
179     * available enzymes.
180     *
181     * @return a <code>Set</code> of <code>RestrictionEnzyme</code>s.
182     */
183    public static Set getAllEnzymes()
184    {
185        return Collections.unmodifiableSet(enzymeToPattern.keySet());
186    }
187
188    /**
189     * <code>getEnzyme</code> returns an enzyme by name.
190     *
191     * @param name a <code>String</code> such as EcoRI, case
192     * sensitive.
193     *
194     * @return a <code>RestrictionEnzyme</code>.
195     */
196    public static RestrictionEnzyme getEnzyme(String name)
197    {
198        if (! nameToEnzyme.containsKey(name))
199            throw new IllegalArgumentException("Unknown RestrictionEnzyme name '"
200                                               + name
201                                               + "'");
202
203        return (RestrictionEnzyme) nameToEnzyme.get(name);
204    }
205
206    /**
207     * <code>getIsoschizomers</code> returns an unmodifable set of the
208     * isoschizomers of this enzyme.
209     *
210     * @param name a <code>String</code> such as EcoRI, case
211     * sensitive.
212     *
213     * @return a <code>Set</code> of <code>RestrictionEnzyme</code>s.
214     */
215    public static Set getIsoschizomers(String name)
216    {
217        if (! nameToIsoschizomers.containsKey(name))
218            throw new IllegalArgumentException("Unknown RestrictionEnzyme name '"
219                                               + name
220                                               + "'");
221        Set result = (Set) nameToIsoschizomers.get(name);
222        if(result.contains(null))
223            return Collections.EMPTY_SET;
224        return Collections.unmodifiableSet(result);
225    }
226
227    /**
228     * <code>getRecognitionSequence</code> returns a string that describes
229     * the recognition site of this enzyme. It corresponds to the field <3>
230     * of the REBASE file.
231     *
232     * @param name a <code>String</code> such as EcoRI, case
233     * sensitive.
234     * @return a <code>String</code> describing the recognition sequence, 
235     * e.g. "G^AATTC" for EcoRI.
236     * @since 1.5
237     */
238    public static String getRecognitionSequence(String name)
239    {
240        if (! nameToSite.containsKey(name))
241            throw new IllegalArgumentException("Unknown RestrictionEnzyme name '"
242                                               + name
243                                               + "'");
244        return (String) nameToSite.get(name);
245    }
246
247    /**
248     * <code>getNCutters</code> returns an unmodifable set of all
249     * enzymes with a cut site of size n.
250     *
251     * @param n an <code>int</code> cut site size.
252     *
253     * @return a <code>Set</code> of <code>RestrictionEnzyme</code>s.
254     */
255    public static Set getNCutters(int n)
256    {
257        Integer size = new Integer(n);
258        if (! sizeToCutters.containsKey(size))
259            return Collections.EMPTY_SET;
260
261        return Collections.unmodifiableSet((Set) sizeToCutters.get(size));
262    }
263
264    /**
265     * <code>getPatterns</code> returns two <code>Pattern</code>
266     * objects for an enzyme, one matches the forward strand and one
267     * the reverse. This enables searching of both strands of a
268     * sequence without reverse-complementing it. As
269     * <code>Pattern</code> objects are thread-safe these may be used
270     * for all searches.
271     *
272     * @param enzyme a <code>RestrictionEnzyme</code>.
273     *
274     * @return a <code>Pattern []</code> array with the forward strand
275     * <code>Pattern</code> at index 0 and the reverse at index 1.
276     */
277    public static Pattern [] getPatterns(RestrictionEnzyme enzyme)
278    {
279        if (! enzymeToPattern.containsKey(enzyme))
280            throw new IllegalArgumentException("RestrictionEnzyme '"
281                                               + enzyme.getName()
282                                               + "' is not registered. No precompiled Pattern is available");
283
284        return (Pattern []) enzymeToPattern.get(enzyme);
285    }
286
287    /**
288     * <code>getAnnotation</code> returns an immutable, static
289     * annotation describing the enzyme. This is suitable for adding
290     * to <code>Feature</code>s which represent restriction sites. The
291     * annotation produced currently contains one key "dbxref" in line
292     * with the GenBank/EMBL qualifier for the "misc_binding" feature
293     * key. The key has a corresponding value "REBASE:&lt;enzyme
294     * name&gt;".
295     *
296     * @param enzyme a <code>RestrictionEnzyme</code>.
297     *
298     * @return an <code>Annotation</code>.
299     */
300    public static Annotation getAnnotation(RestrictionEnzyme enzyme)
301    {
302        if (! enzymeToAnnotation.containsKey(enzyme))
303            throw new IllegalArgumentException("RestrictionEnzyme '"
304                                               + enzyme.getName()
305                                               + "' is not registered. No Annotation is available");
306
307        return (Annotation) enzymeToAnnotation.get(enzyme);
308    }
309
310
311    /**
312     * <code>getSuppliers</code> returns a string describing the suppliers
313     * of this enzyme according to REBASE encoding for commercial sources 
314     * or an empty String if the enzyme is not commecially available.
315     *
316     * <P>REBASE #31 version 604 code: </P>
317     * <P>A GE Healthcare (8/05) <BR>
318     * B Invitrogen Corporation(8/05)<BR>
319     * C Minotech Biotechnology (9/05)<BR>
320     * E Stratagene (9/05)<BR>
321     * F Fermentas International Inc. (2/06)<BR>
322     * G Qbiogene (9/05)<BR>
323     * H American Allied Biochemical, Inc. (9/05)<BR>
324     * I SibEnzyme Ltd. (2/06)<BR>
325     * J Nippon Gene Co., Ltd. (8/05)<BR>
326     * K Takara Bio Inc. (9/05)<BR>
327     * M Roche Applied Science (8/05)<BR>
328     * N New England Biolabs (2/06)<BR>
329     * O Toyobo Biochemicals (9/05)<BR>
330     * Q Molecular Biology Resources (8/05)<BR>
331     * R Promega Corporation (9/05)<BR>
332     * S Sigma Chemical Corporation (9/05)<BR>
333     * U Bangalore Genei (9/05)<BR>
334     * V Vivantis Technologies (1/06)<BR>
335     * X EURx Ltd. (9/05)<BR>
336     * Y CinnaGen Inc. (9/05)
337     * </P>
338     *
339     * @param enzyme a <code>RestrictionEnzyme</code>.
340     *
341     * @return a <code>String</code>.
342     * @since 1.5
343     */
344    public static String getSuppliers(RestrictionEnzyme enzyme)
345    {
346        if (! enzymeToSuppliers.containsKey(enzyme))
347            return "";
348        return (String) enzymeToSuppliers.get(enzyme);
349    }
350
351    /**
352     * <code>register</code> regisiters a new
353     * <code>RestrictionEnzyme</code> with the manager. It does not
354     * check that the isoschizomers are known to the manager. If there
355     * are custom isoschizomers in the <code>Set</code>, they should
356     * be also be registered.
357     *
358     * @param enzyme a <code>RestrictionEnzyme</code> to register.
359     *
360     * @param isoschizomers a <code>Set</code> of
361     * <code>RestrictionEnzyme</code>s which are isoschizomers.
362     */
363    public synchronized static void register(RestrictionEnzyme enzyme,
364                                             Set               isoschizomers)
365    {
366        for (Iterator ii = isoschizomers.iterator(); ii.hasNext();)
367        {
368            Object o = ii.next();
369
370            if (! (o instanceof RestrictionEnzyme))
371            {
372                throw new IllegalArgumentException("Isoschizomers set may contain only RestrictionEnzymes. Found '"
373                                                   + o
374                                                   + "'");
375            }
376        }
377
378        registerEnzyme(enzyme);
379
380        String name = enzyme.getName();
381        nameToIsoschizomers.put(name, isoschizomers);
382    }
383
384    /**
385     * <code>registerEnzyme</code> registers an enzyme, but does not
386     * populate its isoschizomers. This is because registering the
387     * contents of a REBASE file and registering a custom enzyme
388     * handle addition of isoschizomers differently, but both use this
389     * method for all other registration functions.
390     *
391     * @param enzyme a <code>RestrictionEnzyme</code>.
392     */
393    private static void registerEnzyme(RestrictionEnzyme enzyme)
394    {
395        String name = enzyme.getName();
396        nameToEnzyme.put(name, enzyme);
397
398        Integer sizeKey = new Integer(enzyme.getRecognitionSite().length());
399        if (sizeToCutters.containsKey(sizeKey))
400        {
401            Set s = (Set) sizeToCutters.get(sizeKey);
402            s.add(enzyme);
403        }
404        else
405        {
406            Set s = new HashSet();
407            s.add(enzyme);
408            sizeToCutters.put(sizeKey, s);
409        }
410
411        Pattern forward = Pattern.compile(enzyme.getForwardRegex());
412        Pattern reverse = Pattern.compile(enzyme.getReverseRegex());
413        enzymeToPattern.put(enzyme, new Pattern [] { forward, reverse });
414
415        Annotation annotation = new SmallAnnotation();
416        try
417        {
418            annotation.setProperty("dbxref", "REBASE:" + name);
419        }
420        catch (ChangeVetoException cve)
421        {
422            throw new BioError("Assertion Failure: failed to modify Annotation", cve);
423        }
424
425        annotation.addChangeListener(ChangeListener.ALWAYS_VETO,ChangeType.UNKNOWN);
426        enzymeToAnnotation.put(enzyme, annotation);
427    }
428
429    private static void loadData(InputStream is) {
430        nameToSite          = new HashMap();
431        nameToEnzyme        = new HashMap();
432        nameToIsoschizomers = new HashMap();
433        sizeToCutters       = new HashMap();
434        enzymeToPattern     = new HashMap();
435        enzymeToAnnotation  = new HashMap();
436        enzymeToSuppliers   = new HashMap();
437        try {
438            BufferedReader br = new BufferedReader(new InputStreamReader(is));
439
440            // Basic linesplit parser
441            LineSplitParser lsParser = new LineSplitParser();
442            lsParser.setEndOfRecord(TagValueParser.EMPTY_LINE_EOR);
443            lsParser.setSplitOffset(3);
444            lsParser.setContinueOnEmptyTag(true);
445            lsParser.setMergeSameTag(true);
446
447            // The end of the chain
448            RebaseEnzymeBuilder builder = new RebaseEnzymeBuilder();
449
450            // Create isoschizomer value splitter
451            RegexSplitter iso =
452                new RegexSplitter(Pattern.compile("([^,]+)"), 1);
453            // Create site value splitter
454            RegexSplitter site =
455                new RegexSplitter(Pattern.compile("(\\(-?\\d+/-?\\d+\\)|[A-Za-z^]+)"), 1);
456
457            ChangeTable table = new ChangeTable();
458            table.setSplitter(REBASE_TAG_ISZR, iso);
459            table.setSplitter(REBASE_TAG_SITE, site);
460            ValueChanger changer = new ValueChanger(builder, table);
461
462            // Filter tags
463            TagDropper rebaseTags = new TagDropper(changer);
464            // Retain the enzyme name
465            rebaseTags.addTag(REBASE_TAG_NAME);
466            // Retain isoschizomers
467            rebaseTags.addTag(REBASE_TAG_ISZR);
468            // Retain recognition sequence
469            rebaseTags.addTag(REBASE_TAG_SITE);
470            // Retain commercial supplier
471            rebaseTags.addTag(REBASE_TAG_COMM);
472
473
474
475            Parser parser = new Parser();
476            while (parser.read(br, lsParser, rebaseTags))
477            {
478                continue;
479            }
480
481            // Replace isoschizomer names with RestrictionEnzymes
482            Map tempMap = new HashMap();
483            Set tempSet = null;
484            for (Iterator ni = nameToIsoschizomers.keySet().iterator(); ni.hasNext();)
485            {
486                Object name = ni.next();
487                Set isoschizomers = (Set) nameToIsoschizomers.get(name);
488
489                if (isoschizomers.size() == 0)
490                    tempSet = Collections.EMPTY_SET;
491                else
492                    tempSet = (Set) isoschizomers.getClass().newInstance();
493
494                tempMap.put(name, tempSet);
495
496                for (Iterator ii = isoschizomers.iterator(); ii.hasNext();) {
497                    String isoName = (String) ii.next();
498                    Object re = nameToEnzyme.get(isoName);
499                    //bug fix suggested by George Waldon
500                    if(re!=null)
501                        tempSet.add(re);
502                }
503            }
504
505            nameToIsoschizomers = tempMap;
506        }
507        catch (Exception e)
508        {
509            throw new BioError("Failed to read REBASE data file",e);
510        }
511    }
512
513    /**
514     * <code>RebaseEnzymeBuilder</code> creates enzyme instances and
515     * populates the maps.
516     */
517    private static class RebaseEnzymeBuilder implements TagValueListener
518    {
519        private String recseq;
520        private String name;
521        private Set isoschizomers;
522        private List isoBuffer;
523        private SymbolList site;
524        private int [] usCutPositions;
525        private int [] dsCutPositions;
526        private boolean isCommerciallyAvailable;
527
528        private String tagState;
529        private String suppliers;
530        private boolean unknownSite;
531
532        RebaseEnzymeBuilder() { }
533
534        public void startRecord() throws ParserException
535        {
536            isoBuffer = new ArrayList(30);
537            recseq         = "";
538            site           = null;
539            dsCutPositions = null;
540            usCutPositions = null;
541            unknownSite = false;
542            isCommerciallyAvailable = false;
543        }
544
545        public void endRecord() throws ParserException
546        {
547            if (! getRecordState())
548                return;
549            if (unknownSite || site == null)
550                return;
551
552            int isoCount = isoBuffer.size();
553            if (isoCount < 30)
554            {
555                isoschizomers = new SmallSet(isoCount);
556                for (int i = 0; i < isoCount; i++)
557                    isoschizomers.add(isoBuffer.get(i));
558            }
559            else
560            {
561                isoschizomers = new HashSet(isoBuffer);
562            }
563
564            if(!loadCommercialOnly || isCommerciallyAvailable) {
565                RestrictionEnzyme re = createEnzyme();
566                registerEnzyme(re);
567                nameToIsoschizomers.put(name, isoschizomers);
568                enzymeToSuppliers.put(re,suppliers);
569                nameToSite.put(name,recseq);
570            }
571        }
572
573        public void startTag(Object tag) throws ParserException
574        {
575            tagState = (String) tag;
576        }
577
578        public void endTag() throws ParserException { }
579
580        public void value(TagValueContext context, Object value)
581            throws ParserException
582        {
583            if (tagState.equals(REBASE_TAG_NAME))
584                name = (String) value;
585            else if (tagState.equals(REBASE_TAG_ISZR))
586                isoBuffer.add(value);
587            else if (tagState.equals(REBASE_TAG_SITE)) {
588                recseq += (String) value;
589                processSite(value);
590            } else if (tagState.equals(REBASE_TAG_COMM))
591                processSuppliers(value);
592            else
593                throw new ParserException("Unable to handle value for tag '"
594                                          + tagState
595                                          + "'");
596        }
597
598        boolean getRecordState()
599        {
600            return tagState != null;
601        }
602
603        RestrictionEnzyme createEnzyme()
604        {
605            RestrictionEnzyme enzyme = null;
606
607            try
608            {
609                if (usCutPositions != null)
610                {
611                    enzyme = new RestrictionEnzyme(name, site,
612                                                   usCutPositions[0],
613                                                   usCutPositions[1],
614                                                   dsCutPositions[0],
615                                                   dsCutPositions[1]);
616                }
617                else
618                {
619                    enzyme = new RestrictionEnzyme(name, site,
620                                                   dsCutPositions[0],
621                                                   dsCutPositions[1]);
622                }
623            }
624            catch (IllegalAlphabetException iae)
625            {
626                throw new BioError("New DNA SymbolList no longer consists on DNA Alphabet",iae);
627            }
628
629            return enzyme;
630        }
631
632        private void processSuppliers(Object value) throws ParserException {
633            suppliers = (String) value;
634            if(suppliers.length()!=0)
635                isCommerciallyAvailable = true;
636        }
637
638        private void processSite(Object value) throws ParserException
639        {
640            StringBuffer sb = new StringBuffer((String) value);
641            int div, forIdx, revIdx;
642
643            // REBASE marks enzymes whose site is not known with '?'
644            if (sb.charAt(0) == '?')
645            {
646                unknownSite = true;
647                return;
648            }
649
650            if (sb.charAt(0) == '(')
651            {
652                // Index separator
653                div = sb.indexOf("/");
654
655                try
656                {
657                    forIdx = Integer.parseInt(sb.substring(1, div));
658                    revIdx = Integer.parseInt(sb.substring(div + 1,
659                                                           sb.length() - 1));
660                }
661                catch (NumberFormatException nfe)
662                {
663                    throw new ParserException("Failed to parse cut site index",nfe);
664                }
665
666                // Indices before the site indicate a double cutter
667                if (site == null)
668                {
669                    usCutPositions = new int [2];
670                    usCutPositions[0] = -forIdx;
671                    usCutPositions[1] = -revIdx;
672                }
673                else
674                {
675                    dsCutPositions = new int [2];
676                    dsCutPositions[0] = forIdx + site.length();
677                    dsCutPositions[1] = revIdx + site.length();
678                }
679            }
680            else
681            {
682                // Explicit cut site marker
683                int cut = sb.indexOf("^");
684                dsCutPositions = new int [2];
685
686                try
687                {
688                    if (cut == -1)
689                    {
690                        site = DNATools.createDNA(sb.substring(0));
691                        dsCutPositions[0] = 1;
692                        dsCutPositions[1] = 1;
693                    }
694                    else
695                    {
696                        sb.deleteCharAt(cut);
697                        site = DNATools.createDNA(sb.substring(0));
698                        dsCutPositions[0] = cut;
699                        dsCutPositions[1] = site.length() - cut;
700                    }
701                }
702                catch (IllegalSymbolException iae)
703                {
704                    throw new ParserException("Illegal DNA symbol in recognition site",iae);
705                }
706            }
707        }
708    }
709}