001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.symbol; 023 024import java.util.ArrayList; 025import java.util.Arrays; 026import java.util.Iterator; 027import java.util.List; 028import java.util.Set; 029import java.util.Stack; 030 031import org.biojava.bio.BioError; 032import org.biojava.bio.BioException; 033import org.biojava.bio.seq.io.SymbolTokenization; 034 035/** 036 * <code>MotifTools</code> contains utility methods for sequence 037 * motifs. 038 * 039 * @author Keith James 040 */ 041public class MotifTools 042{ 043 private static Symbol [] symProto = new Symbol [0]; 044 045 /** 046 * <p><code>createRegex</code> creates a regular expression which 047 * matches the <code>SymbolList</code>. Ambiguous 048 * <code>Symbol</code>s are simply transformed into character 049 * classes. For example the nucleotide sequence "AAGCTT" becomes 050 * "A{2}GCT{2}" and "CTNNG" is expanded to 051 * "CT[ABCDGHKMNRSTVWY]{2}G". The character class is generated 052 * using the <code>getMatches</code> method of an ambiguity symbol 053 * to obtain the alphabet of <code>AtomicSymbol</code>s it 054 * matches, followed by calling <code>getAllSymbols</code> on this 055 * alphabet, removal of any gap symbols and then tokenization of 056 * the remainder. The ordering of the tokens in a character class 057 * is by ascending numerical order of their tokens as determined 058 * by <code>Arrays.sort(char [])</code>.</p> 059 * 060 * <p>The <code>Alphabet</code> of the <code>SymbolList</code> 061 * must be finite and must have a character token type. Regular 062 * expressions may be generated for any such 063 * <code>SymbolList</code>, not just DNA, RNA and protein.</p> 064 * 065 * @param motif a <code>SymbolList</code>. 066 * 067 * @return a <code>String</code> regular expression. 068 */ 069 public static String createRegex(SymbolList motif) 070 { 071 if (motif.length() == 0) 072 throw new IllegalArgumentException("SymbolList was empty"); 073 074 StringBuffer regex = new StringBuffer(); 075 Stack stack = new Stack(); 076 077 try 078 { 079 SymbolTokenization sToke = 080 motif.getAlphabet().getTokenization("token"); 081 if (sToke.getTokenType() != SymbolTokenization.CHARACTER) 082 throw new IllegalArgumentException("SymbolList alphabet did not have a character token type"); 083 084 int motifLen = motif.length(); 085 086 for (int i = 1; i <= motifLen; i++) 087 { 088 StringBuffer sb = new StringBuffer(); 089 Symbol sym = motif.symbolAt(i); 090 FiniteAlphabet ambiAlpha = (FiniteAlphabet) sym.getMatches(); 091 092 if (ambiAlpha.size() == 1) 093 { 094 sb.append(sToke.tokenizeSymbol(sym)); 095 } 096 else 097 { 098 Set rawSyms = AlphabetManager.getAllSymbols(ambiAlpha); 099 List gapSyms = new ArrayList(); 100 101 for (Iterator si = rawSyms.iterator(); si.hasNext();) 102 { 103 Symbol rawSym = (Symbol) si.next(); 104 // Crude check for gap symbol 105 if (((FiniteAlphabet) rawSym.getMatches()).size() == 0) 106 { 107 gapSyms.add(rawSym); 108 } 109 } 110 111 rawSyms.removeAll(gapSyms); 112 113 // getAllSymbols returns a Set (i.e. unordered) so 114 // we convert to char array so we can sort tokens 115 Symbol [] ambiSyms = (Symbol []) rawSyms.toArray(symProto); 116 char [] ambiChars = new char [ambiSyms.length]; 117 118 for (int j = 0; j < ambiSyms.length; j++) 119 { 120 ambiChars[j] = 121 sToke.tokenizeSymbol(ambiSyms[j]).charAt(0); 122 } 123 124 Arrays.sort(ambiChars); 125 sb.append(ambiChars); 126 } 127 128 String result = sb.substring(0); 129 130 if (i == 1) 131 { 132 stack.push(result); 133 if(motifLen == 1) // Close now 134 regex = extendRegex(stack, regex); 135 } 136 else if (i < motifLen) 137 { 138 if (! stack.isEmpty() && stack.peek().equals(result)) 139 { 140 stack.push(result); 141 } 142 else 143 { 144 regex = extendRegex(stack, regex); 145 stack.push(result); 146 } 147 } 148 else 149 { 150 if (! stack.isEmpty() && stack.peek().equals(result)) 151 { 152 stack.push(result); 153 regex = extendRegex(stack, regex); 154 } 155 else 156 { 157 regex = extendRegex(stack, regex); 158 stack.push(result); 159 regex = extendRegex(stack, regex); 160 } 161 } 162 } 163 } 164 catch (IllegalSymbolException ise) 165 { 166 throw new BioError("Internal error: failed to tokenize a Symbol from an existing SymbolList", ise); 167 } 168 catch (BioException be) 169 { 170 throw new BioError("Internal error: failed to get SymbolTokenization for SymbolList alphabet", be); 171 } 172 173 return regex.substring(0); 174 } 175 176 private static StringBuffer extendRegex(Stack stack, StringBuffer regex) 177 { 178 String component = stack.isEmpty()? "":(String) stack.peek(); 179 180 if (component.length() == 1) 181 { 182 regex.append(component); 183 184 if (stack.size() > 1) 185 { 186 regex.append("{"); 187 regex.append(stack.size()); 188 regex.append("}"); 189 } 190 } 191 else 192 { 193 regex.append("["); 194 regex.append(component); 195 regex.append("]"); 196 197 if (stack.size() > 1) 198 { 199 regex.append("{"); 200 regex.append(stack.size()); 201 regex.append("}"); 202 } 203 } 204 205 stack.clear(); 206 207 return regex; 208 } 209}