001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022
023package org.biojava.bio.symbol;
024
025import java.io.Serializable;
026import java.util.ArrayList;
027import java.util.List;
028
029/**
030 * Suffix tree implementation.  The interface is a bit strange, as it
031 * needed to be as space-efficient as possible. More work could be
032 * done on the space issue.
033 *
034 * <p>
035 * A suffix tree is an efficient method for encoding the frequencies
036 * of motifs in a sequence.  They are sometimes used to quickly screen
037 * for similar sequences.  For instance, all motifs of length up to
038 * 2 in the sequence <code>AAGT</code> could be encoded as:
039 * </p>
040 *
041 * <pre>
042 * root(4)
043 * |
044 * A(2)--------G(1)-----T(1)
045 * |           |
046 * A(1)--G(1)  T(1)
047 * </pre>
048 *
049 * <p>
050 * A possible method of comparing SuffixTrees is provided as a kernel
051 * function as <code>org.biojava.stats.svm.tools.SuffixTreeKernel</code>.
052 * </p>
053 *
054 * @author Matthew Pocock
055 * @author Thomas Down (documentation and other updates) 
056 */
057
058public class SuffixTree implements Serializable {
059  private FiniteAlphabet alphabet;
060  private SuffixNode root;
061  private AlphabetIndex indexer;
062  private List counts;
063  
064    /**
065     * Return the Alphabet containing all Symbols which might be found in
066     * this SuffixTree.
067     */
068
069  public FiniteAlphabet getAlphabet() {
070    return alphabet;
071  }
072
073    /**
074     * Return the node object which is the root of this suffix tree.
075     * This represents the set of all motifs found in this tree.
076     */
077
078  public SuffixNode getRoot() {
079    return root;
080  }
081  
082    /**
083     * Get a child of a SuffixTree.SuffixNode, constructing a new
084     * one if need be.  This method is here due to memory optimisations.
085     */
086
087  public SuffixNode getChild(SuffixNode node, Symbol s)
088  throws IllegalSymbolException {
089    if(!getAlphabet().contains(s)) {
090      return null;
091    }
092    int index = indexer.indexForSymbol(s);
093    return getChild(node, index);
094  }
095  
096    /**
097     * Get the n'th child of a node.
098     */
099
100  public SuffixNode getChild(SuffixNode node, int i) {
101    if(!node.hasChild(i)) {
102      node.addChild(this, i, new SimpleNode(alphabet.size()));
103    }
104    return node.getChild(i);
105  }
106  
107    /**
108     * Add a count for all motifs with length of up to <code>window</code>
109     * to this tree.
110     *
111     * @param sList a SymbolList whose motifs should be added to the
112     *              tree.
113     * @param window The maximum motif length to count.
114     */
115
116  public void addSymbols(SymbolList sList, int window)
117  throws IllegalSymbolException {
118    SuffixNode [] buf = new SuffixNode[window];
119    int [] counts = new int[window];
120    for(int i = 0; i < window; i++) {
121      buf[i] = getRoot();
122    }
123    
124    for(int p = 1; p <= sList.length(); p++) {
125      Symbol s = sList.symbolAt(p);
126      buf[p % window] = getRoot();
127      for(int i = 0; i < window; i++) {
128        int pi = (p + i) % window;
129        if(buf[pi] != null) {
130          buf[pi] = getChild(buf[pi], s);
131          if(buf[pi] != null) {
132            counts[i]++;
133            buf[pi].setNumber(buf[pi].getNumber() + 1.0f);
134          }
135        }
136      }
137    }
138    
139    for(int i = 0; i < window; i++) {
140      incCounts(i+1, counts[i]);
141    } 
142  }
143  
144  protected void incCounts(int i, int c) {
145    if(i < counts.size()) {
146      Integer oldC = (Integer) counts.get(i-1);
147      Integer newC = new Integer(oldC.intValue() + c);
148      counts.set(i-1, newC);
149    } else {
150      counts.add(new Integer(c));
151    }
152  }
153  
154    /**
155     * Return the length of the longest motif represented in this
156     * SuffixTree
157     */
158
159  public int maxLength() {
160    return counts.size();
161  }
162  
163    /**
164     * Return the number of motifs of a given length encoded
165     * in this SuffixTree.
166     */
167
168  public int frequency(int length) {
169    return ((Integer) counts.get(length - 1)).intValue();
170  }
171  
172    /**
173     * Construct a new SuffixTree to contain motifs over the
174     * specified alphabet.
175     *
176     * @param alphabet The alphabet of this SuffixTree (must be
177     *                 finite).
178     */
179
180  public SuffixTree(FiniteAlphabet alphabet) {
181    this.alphabet = alphabet;
182    this.indexer = AlphabetManager.getAlphabetIndex(alphabet);
183    this.counts = new ArrayList();
184    this.root = new SimpleNode(alphabet.size());
185  }
186  
187  /**
188   * A node in the suffix tree.
189   * <p>
190   * This class is realy stupid & delegates most work off to a SuffixTree so
191   * that it is as small (in memory-per-object terms) as possible.
192   *
193   * @author Matthew Pocock
194   */
195  public static abstract class SuffixNode implements Serializable {
196      /**
197       * Determine is this node is terminal (has no children).
198       *
199       * @return <code>true</code> if and only if this node has no children.
200       */
201
202    abstract public boolean isTerminal();
203      
204      /**
205       * Determine if this node has a child corresponding to a given
206       * index number.
207       */
208
209    abstract public boolean hasChild(int i);
210
211      /**
212       * Return a number (usually, but not always, a motif count)
213       * associated with this node of the tree.
214       */
215
216    abstract public float getNumber();
217      
218      /**
219       * Set the number associated with this node.
220       */
221
222    abstract public void setNumber(float n);
223
224    abstract SuffixNode getChild(int i);
225    abstract void addChild(SuffixTree tree, int i, SuffixNode n);
226  }
227
228  private static class SimpleNode extends SuffixNode {
229    private float number = 0.0f;
230    private SuffixNode [] child;
231    
232    private SuffixNode [] childArray(SuffixTree tree) {
233      if(child == null)
234        child = new SuffixNode[tree.getAlphabet().size()];
235      return child;
236    }
237    
238    public boolean isTerminal() {
239      return false;
240    }
241    
242    public boolean hasChild(int i) {
243      return child != null && child[i] != null;
244    }
245    
246    public float getNumber() {
247      return number;
248    }
249    
250    SuffixNode getChild(int i) {
251      if(hasChild(i))
252        return child[i];
253      return null;
254    }
255    
256    void addChild(SuffixTree tree, int i, SuffixNode n) {
257      childArray(tree)[i] = n;
258    }
259    
260    public void setNumber(float n) {
261      number = n;
262    }
263    
264    SimpleNode(int c) {
265      child = new SuffixNode[c];
266    }
267  }
268}