001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.search;
023
024import org.biojava.bio.symbol.AlphabetIndex;
025import org.biojava.bio.symbol.AlphabetManager;
026import org.biojava.bio.symbol.AtomicSymbol;
027import org.biojava.bio.symbol.FiniteAlphabet;
028import org.biojava.bio.symbol.IllegalAlphabetException;
029import org.biojava.bio.symbol.IllegalSymbolException;
030import org.biojava.bio.symbol.SymbolList;
031
032/**
033 * A pattern that can be used to find regions with given sequence content.
034 *
035 * <p>
036 * Regular expressions can be used to find sequence patterns. However, some
037 * things can't be easily expressed as a regular expression. For example,
038 * a region of length 10 that contains at least 8 Gs and up to two Ts and no
039 * other symbols. A SeqContentPattern can be built that does represent this.
040 * <p>
041 *
042 * <code><pre>
043 * SeqContentPattern scp = new SeqContentPattern(DNATools.getDNA());
044 * scp.setLength(10);
045 * scp.setMinCounts(DNATools.g(), 8);
046 * scp.setMaxCounts(DNATools.t(), 2);
047 * scp.setMaxCounts(DNATools.c(), 0);
048 * scp.setMaxCounts(DNATools.a(), 0);
049 * </pre></code>
050 *
051 * <p>
052 * The minimum counts default to 0, and the maximum counts default to the
053 * length. If you have not manually set the maximum count for a symbol, it will
054 * continue to adjust while you change the length. Once you have set it, it will
055 * not vary, even if you do set the length. To re-set a maximum count to track
056 * the length, set it to -1.
057 * </p>
058 *
059 * <p>
060 * All regions of the defined length for which all constraints are satisfied
061 * will potentialy be found. At the moment we have not defined what will
062 * happen for multiple regions that overlap all of which satisfy the
063 * constraints.
064 * </p>
065 *
066 * @author Matthew Pocock
067 * @since 1.4
068 */
069public class SeqContentPattern implements BioPattern {
070  private final AlphabetIndex index;
071  private final int[] minCounts;
072  private final int[] maxCounts;
073  private int length;
074
075  /**
076   * Create a new SeqContentPattern over an alphabet.
077   *
078   * @param alpha  the FiniteAlphabet for this pattern
079   */
080  public SeqContentPattern(FiniteAlphabet alpha) {
081    index = AlphabetManager.getAlphabetIndex(alpha);
082    this.minCounts = new int[alpha.size()];
083    this.maxCounts = new int[alpha.size()];
084
085    for(int i = 0; i < minCounts.length; i++) {
086      minCounts[i] = 0;
087      maxCounts[i] = -1;
088    }
089  }
090
091  /**
092   * Get the current length.
093   *
094   * @return the length
095   */
096  public int getLength() {
097    return length;
098  }
099
100  /**
101   * Set the pattern length.
102   *
103   * @param length  the new length
104   */
105  public void setLength(int length) {
106    this.length = length;
107  }
108
109  /**
110   * Set the minimum counts required for a symbol.
111   *
112   * @param as  the AtomicSymbol to check
113   * @param count  the minimum number of counts it must have
114   * @throws IllegalSymbolException  if as is not known in this alphabet
115   */
116  public void setMinCounts(AtomicSymbol as, int count)
117  throws IllegalSymbolException {
118    minCounts[index.indexForSymbol(as)] = count;
119  }
120
121  /**
122   * Get the minimum counts required for a symbol.
123   *
124   * @param as  the AtomicSymbol to check
125   * @return the minimum number of counts it must have
126   * @throws IllegalSymbolException  if as is not known in this alphabet
127   */
128  public int getMinCounts(AtomicSymbol as)
129  throws IllegalSymbolException {
130    return minCounts[index.indexForSymbol(as)];
131  }
132
133  /**
134   * Set the maximum counts required for a symbol.
135   * Use -1 to reset it to track the length.
136   *
137   * @param as  the AtomicSymbol to check
138   * @param count  the maximum number of counts it must have
139   * @throws IllegalSymbolException  if as is not known in this alphabet
140   */
141  public void setMaxCounts(AtomicSymbol as, int count)
142  throws IllegalSymbolException {
143    maxCounts[index.indexForSymbol(as)] = count;
144  }
145
146  /**
147   * Get the maximum counts required for a symbol.
148   *
149   * @param as  the AtomicSymbol to check
150   * @return the maximum number of counts it must have
151   * @throws IllegalSymbolException  if as is not known in this alphabet
152   */
153  public int getMaxCounts(AtomicSymbol as)
154  throws IllegalSymbolException {
155    int c = maxCounts[index.indexForSymbol(as)];
156    if(c == -1) {
157      return length;
158    } else {
159      return c;
160    }
161  }
162
163  public BioMatcher matcher(SymbolList symList)
164  throws IllegalAlphabetException {
165    if(symList.getAlphabet() != index.getAlphabet()) {
166      throw new IllegalAlphabetException(
167        "Attempted to search symbol list over " + symList.getAlphabet() +
168        " but the search parameters only accept " + index.getAlphabet() );
169    }
170
171    int[] minCounts = new int[this.minCounts.length];
172    int[] maxCounts = new int[this.maxCounts.length];
173    for(int i = 0; i < minCounts.length; i++) {
174      minCounts[i] = this.minCounts[i];
175
176      int c = this.maxCounts[i];
177      maxCounts[i] = (c == -1) ? length : c;
178    }
179
180    return new SeqContentMatcher(
181      symList,
182      index,
183      minCounts,
184      maxCounts,
185      length );
186  }
187}
188