001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022 023package org.biojava.bio.symbol; 024 025import java.io.Serializable; 026import java.util.ArrayList; 027import java.util.List; 028 029/** 030 * Suffix tree implementation. The interface is a bit strange, as it 031 * needed to be as space-efficient as possible. More work could be 032 * done on the space issue. 033 * 034 * <p> 035 * A suffix tree is an efficient method for encoding the frequencies 036 * of motifs in a sequence. They are sometimes used to quickly screen 037 * for similar sequences. For instance, all motifs of length up to 038 * 2 in the sequence <code>AAGT</code> could be encoded as: 039 * </p> 040 * 041 * <pre> 042 * root(4) 043 * | 044 * A(2)--------G(1)-----T(1) 045 * | | 046 * A(1)--G(1) T(1) 047 * </pre> 048 * 049 * <p> 050 * A possible method of comparing SuffixTrees is provided as a kernel 051 * function as <code>org.biojava.stats.svm.tools.SuffixTreeKernel</code>. 052 * </p> 053 * 054 * @author Matthew Pocock 055 * @author Thomas Down (documentation and other updates) 056 */ 057 058public class SuffixTree implements Serializable { 059 private FiniteAlphabet alphabet; 060 private SuffixNode root; 061 private AlphabetIndex indexer; 062 private List counts; 063 064 /** 065 * Return the Alphabet containing all Symbols which might be found in 066 * this SuffixTree. 067 */ 068 069 public FiniteAlphabet getAlphabet() { 070 return alphabet; 071 } 072 073 /** 074 * Return the node object which is the root of this suffix tree. 075 * This represents the set of all motifs found in this tree. 076 */ 077 078 public SuffixNode getRoot() { 079 return root; 080 } 081 082 /** 083 * Get a child of a SuffixTree.SuffixNode, constructing a new 084 * one if need be. This method is here due to memory optimisations. 085 */ 086 087 public SuffixNode getChild(SuffixNode node, Symbol s) 088 throws IllegalSymbolException { 089 if(!getAlphabet().contains(s)) { 090 return null; 091 } 092 int index = indexer.indexForSymbol(s); 093 return getChild(node, index); 094 } 095 096 /** 097 * Get the n'th child of a node. 098 */ 099 100 public SuffixNode getChild(SuffixNode node, int i) { 101 if(!node.hasChild(i)) { 102 node.addChild(this, i, new SimpleNode(alphabet.size())); 103 } 104 return node.getChild(i); 105 } 106 107 /** 108 * Add a count for all motifs with length of up to <code>window</code> 109 * to this tree. 110 * 111 * @param sList a SymbolList whose motifs should be added to the 112 * tree. 113 * @param window The maximum motif length to count. 114 */ 115 116 public void addSymbols(SymbolList sList, int window) 117 throws IllegalSymbolException { 118 SuffixNode [] buf = new SuffixNode[window]; 119 int [] counts = new int[window]; 120 for(int i = 0; i < window; i++) { 121 buf[i] = getRoot(); 122 } 123 124 for(int p = 1; p <= sList.length(); p++) { 125 Symbol s = sList.symbolAt(p); 126 buf[p % window] = getRoot(); 127 for(int i = 0; i < window; i++) { 128 int pi = (p + i) % window; 129 if(buf[pi] != null) { 130 buf[pi] = getChild(buf[pi], s); 131 if(buf[pi] != null) { 132 counts[i]++; 133 buf[pi].setNumber(buf[pi].getNumber() + 1.0f); 134 } 135 } 136 } 137 } 138 139 for(int i = 0; i < window; i++) { 140 incCounts(i+1, counts[i]); 141 } 142 } 143 144 protected void incCounts(int i, int c) { 145 if(i < counts.size()) { 146 Integer oldC = (Integer) counts.get(i-1); 147 Integer newC = new Integer(oldC.intValue() + c); 148 counts.set(i-1, newC); 149 } else { 150 counts.add(new Integer(c)); 151 } 152 } 153 154 /** 155 * Return the length of the longest motif represented in this 156 * SuffixTree 157 */ 158 159 public int maxLength() { 160 return counts.size(); 161 } 162 163 /** 164 * Return the number of motifs of a given length encoded 165 * in this SuffixTree. 166 */ 167 168 public int frequency(int length) { 169 return ((Integer) counts.get(length - 1)).intValue(); 170 } 171 172 /** 173 * Construct a new SuffixTree to contain motifs over the 174 * specified alphabet. 175 * 176 * @param alphabet The alphabet of this SuffixTree (must be 177 * finite). 178 */ 179 180 public SuffixTree(FiniteAlphabet alphabet) { 181 this.alphabet = alphabet; 182 this.indexer = AlphabetManager.getAlphabetIndex(alphabet); 183 this.counts = new ArrayList(); 184 this.root = new SimpleNode(alphabet.size()); 185 } 186 187 /** 188 * A node in the suffix tree. 189 * <p> 190 * This class is realy stupid & delegates most work off to a SuffixTree so 191 * that it is as small (in memory-per-object terms) as possible. 192 * 193 * @author Matthew Pocock 194 */ 195 public static abstract class SuffixNode implements Serializable { 196 /** 197 * Determine is this node is terminal (has no children). 198 * 199 * @return <code>true</code> if and only if this node has no children. 200 */ 201 202 abstract public boolean isTerminal(); 203 204 /** 205 * Determine if this node has a child corresponding to a given 206 * index number. 207 */ 208 209 abstract public boolean hasChild(int i); 210 211 /** 212 * Return a number (usually, but not always, a motif count) 213 * associated with this node of the tree. 214 */ 215 216 abstract public float getNumber(); 217 218 /** 219 * Set the number associated with this node. 220 */ 221 222 abstract public void setNumber(float n); 223 224 abstract SuffixNode getChild(int i); 225 abstract void addChild(SuffixTree tree, int i, SuffixNode n); 226 } 227 228 private static class SimpleNode extends SuffixNode { 229 private float number = 0.0f; 230 private SuffixNode [] child; 231 232 private SuffixNode [] childArray(SuffixTree tree) { 233 if(child == null) 234 child = new SuffixNode[tree.getAlphabet().size()]; 235 return child; 236 } 237 238 public boolean isTerminal() { 239 return false; 240 } 241 242 public boolean hasChild(int i) { 243 return child != null && child[i] != null; 244 } 245 246 public float getNumber() { 247 return number; 248 } 249 250 SuffixNode getChild(int i) { 251 if(hasChild(i)) 252 return child[i]; 253 return null; 254 } 255 256 void addChild(SuffixTree tree, int i, SuffixNode n) { 257 childArray(tree)[i] = n; 258 } 259 260 public void setNumber(float n) { 261 number = n; 262 } 263 264 SimpleNode(int c) { 265 child = new SuffixNode[c]; 266 } 267 } 268}