001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.symbol;
023
024import java.util.ArrayList;
025import java.util.Arrays;
026import java.util.Iterator;
027import java.util.List;
028import java.util.Set;
029import java.util.Stack;
030
031import org.biojava.bio.BioError;
032import org.biojava.bio.BioException;
033import org.biojava.bio.seq.io.SymbolTokenization;
034
035/**
036 * <code>MotifTools</code> contains utility methods for sequence
037 * motifs.
038 *
039 * @author Keith James
040 */
041public class MotifTools
042{
043    private static Symbol [] symProto = new Symbol [0];
044
045    /**
046     * <p><code>createRegex</code> creates a regular expression which
047     * matches the <code>SymbolList</code>. Ambiguous
048     * <code>Symbol</code>s are simply transformed into character
049     * classes. For example the nucleotide sequence "AAGCTT" becomes
050     * "A{2}GCT{2}" and "CTNNG" is expanded to
051     * "CT[ABCDGHKMNRSTVWY]{2}G". The character class is generated
052     * using the <code>getMatches</code> method of an ambiguity symbol
053     * to obtain the alphabet of <code>AtomicSymbol</code>s it
054     * matches, followed by calling <code>getAllSymbols</code> on this
055     * alphabet, removal of any gap symbols and then tokenization of
056     * the remainder. The ordering of the tokens in a character class
057     * is by ascending numerical order of their tokens as determined
058     * by <code>Arrays.sort(char [])</code>.</p>
059     *
060     * <p>The <code>Alphabet</code> of the <code>SymbolList</code>
061     * must be finite and must have a character token type. Regular
062     * expressions may be generated for any such
063     * <code>SymbolList</code>, not just DNA, RNA and protein.</p>
064     *
065     * @param motif a <code>SymbolList</code>.
066     *
067     * @return a <code>String</code> regular expression.
068     */
069    public static String createRegex(SymbolList motif)
070    {
071        if (motif.length() == 0)
072            throw new IllegalArgumentException("SymbolList was empty");
073
074        StringBuffer regex = new StringBuffer();
075        Stack stack = new Stack();
076
077        try
078        {
079            SymbolTokenization sToke =
080                motif.getAlphabet().getTokenization("token");
081            if (sToke.getTokenType() != SymbolTokenization.CHARACTER)
082                throw new IllegalArgumentException("SymbolList alphabet did not have a character token type");
083
084            int motifLen = motif.length();
085
086            for (int i = 1; i <= motifLen; i++)
087            {
088                StringBuffer sb = new StringBuffer();
089                Symbol sym = motif.symbolAt(i);
090                FiniteAlphabet ambiAlpha = (FiniteAlphabet) sym.getMatches();
091
092                if (ambiAlpha.size() == 1)
093                {
094                    sb.append(sToke.tokenizeSymbol(sym));
095                }
096                else
097                {
098                    Set rawSyms = AlphabetManager.getAllSymbols(ambiAlpha);
099                    List gapSyms = new ArrayList();
100
101                    for (Iterator si = rawSyms.iterator(); si.hasNext();)
102                    {
103                        Symbol rawSym = (Symbol) si.next();
104                        // Crude check for gap symbol
105                        if (((FiniteAlphabet) rawSym.getMatches()).size() == 0)
106                        {
107                            gapSyms.add(rawSym);
108                        }
109                    }
110
111                    rawSyms.removeAll(gapSyms);
112
113                    // getAllSymbols returns a Set (i.e. unordered) so
114                    // we convert to char array so we can sort tokens
115                    Symbol [] ambiSyms = (Symbol []) rawSyms.toArray(symProto);
116                    char [] ambiChars = new char [ambiSyms.length];
117
118                    for (int j = 0; j < ambiSyms.length; j++)
119                    {
120                        ambiChars[j] =
121                            sToke.tokenizeSymbol(ambiSyms[j]).charAt(0);
122                    }
123
124                    Arrays.sort(ambiChars);
125                    sb.append(ambiChars);
126                }
127
128                String result = sb.substring(0);
129
130                if (i == 1)
131                {
132                    stack.push(result);
133                    if(motifLen == 1) // Close now
134                        regex = extendRegex(stack, regex);
135                }
136                else if (i < motifLen)
137                {
138                    if (! stack.isEmpty() && stack.peek().equals(result))
139                    {
140                        stack.push(result);
141                    }
142                    else
143                    {
144                        regex = extendRegex(stack, regex);
145                        stack.push(result);
146                    }
147                }
148                else
149                {
150                    if (! stack.isEmpty() && stack.peek().equals(result))
151                    {
152                        stack.push(result);
153                        regex = extendRegex(stack, regex);
154                    }
155                    else
156                    {
157                        regex = extendRegex(stack, regex);
158                        stack.push(result);
159                        regex = extendRegex(stack, regex);
160                    }
161                }
162            }
163        }
164        catch (IllegalSymbolException ise)
165        {
166            throw new BioError("Internal error: failed to tokenize a Symbol from an existing SymbolList", ise);
167        }
168        catch (BioException be)
169        {
170            throw new BioError("Internal error: failed to get SymbolTokenization for SymbolList alphabet", be);
171        }
172
173        return regex.substring(0);
174    }
175
176    private static StringBuffer extendRegex(Stack stack, StringBuffer regex)
177    {
178        String component = stack.isEmpty()? "":(String) stack.peek();
179
180        if (component.length() == 1)
181        {
182            regex.append(component);
183
184            if (stack.size() > 1)
185            {
186                regex.append("{");
187                regex.append(stack.size());
188                regex.append("}");
189            }
190        }
191        else
192        {
193            regex.append("[");
194            regex.append(component);
195            regex.append("]");
196
197            if (stack.size() > 1)
198            {
199                regex.append("{");
200                regex.append(stack.size());
201                regex.append("}");
202            }
203        }
204
205        stack.clear();
206
207        return regex;
208    }
209}