001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.search; 023 024import org.biojava.bio.symbol.AlphabetIndex; 025import org.biojava.bio.symbol.AlphabetManager; 026import org.biojava.bio.symbol.AtomicSymbol; 027import org.biojava.bio.symbol.FiniteAlphabet; 028import org.biojava.bio.symbol.IllegalAlphabetException; 029import org.biojava.bio.symbol.IllegalSymbolException; 030import org.biojava.bio.symbol.SymbolList; 031 032/** 033 * A pattern that can be used to find regions with given sequence content. 034 * 035 * <p> 036 * Regular expressions can be used to find sequence patterns. However, some 037 * things can't be easily expressed as a regular expression. For example, 038 * a region of length 10 that contains at least 8 Gs and up to two Ts and no 039 * other symbols. A SeqContentPattern can be built that does represent this. 040 * <p> 041 * 042 * <code><pre> 043 * SeqContentPattern scp = new SeqContentPattern(DNATools.getDNA()); 044 * scp.setLength(10); 045 * scp.setMinCounts(DNATools.g(), 8); 046 * scp.setMaxCounts(DNATools.t(), 2); 047 * scp.setMaxCounts(DNATools.c(), 0); 048 * scp.setMaxCounts(DNATools.a(), 0); 049 * </pre></code> 050 * 051 * <p> 052 * The minimum counts default to 0, and the maximum counts default to the 053 * length. If you have not manually set the maximum count for a symbol, it will 054 * continue to adjust while you change the length. Once you have set it, it will 055 * not vary, even if you do set the length. To re-set a maximum count to track 056 * the length, set it to -1. 057 * </p> 058 * 059 * <p> 060 * All regions of the defined length for which all constraints are satisfied 061 * will potentialy be found. At the moment we have not defined what will 062 * happen for multiple regions that overlap all of which satisfy the 063 * constraints. 064 * </p> 065 * 066 * @author Matthew Pocock 067 * @since 1.4 068 */ 069public class SeqContentPattern implements BioPattern { 070 private final AlphabetIndex index; 071 private final int[] minCounts; 072 private final int[] maxCounts; 073 private int length; 074 075 /** 076 * Create a new SeqContentPattern over an alphabet. 077 * 078 * @param alpha the FiniteAlphabet for this pattern 079 */ 080 public SeqContentPattern(FiniteAlphabet alpha) { 081 index = AlphabetManager.getAlphabetIndex(alpha); 082 this.minCounts = new int[alpha.size()]; 083 this.maxCounts = new int[alpha.size()]; 084 085 for(int i = 0; i < minCounts.length; i++) { 086 minCounts[i] = 0; 087 maxCounts[i] = -1; 088 } 089 } 090 091 /** 092 * Get the current length. 093 * 094 * @return the length 095 */ 096 public int getLength() { 097 return length; 098 } 099 100 /** 101 * Set the pattern length. 102 * 103 * @param length the new length 104 */ 105 public void setLength(int length) { 106 this.length = length; 107 } 108 109 /** 110 * Set the minimum counts required for a symbol. 111 * 112 * @param as the AtomicSymbol to check 113 * @param count the minimum number of counts it must have 114 * @throws IllegalSymbolException if as is not known in this alphabet 115 */ 116 public void setMinCounts(AtomicSymbol as, int count) 117 throws IllegalSymbolException { 118 minCounts[index.indexForSymbol(as)] = count; 119 } 120 121 /** 122 * Get the minimum counts required for a symbol. 123 * 124 * @param as the AtomicSymbol to check 125 * @return the minimum number of counts it must have 126 * @throws IllegalSymbolException if as is not known in this alphabet 127 */ 128 public int getMinCounts(AtomicSymbol as) 129 throws IllegalSymbolException { 130 return minCounts[index.indexForSymbol(as)]; 131 } 132 133 /** 134 * Set the maximum counts required for a symbol. 135 * Use -1 to reset it to track the length. 136 * 137 * @param as the AtomicSymbol to check 138 * @param count the maximum number of counts it must have 139 * @throws IllegalSymbolException if as is not known in this alphabet 140 */ 141 public void setMaxCounts(AtomicSymbol as, int count) 142 throws IllegalSymbolException { 143 maxCounts[index.indexForSymbol(as)] = count; 144 } 145 146 /** 147 * Get the maximum counts required for a symbol. 148 * 149 * @param as the AtomicSymbol to check 150 * @return the maximum number of counts it must have 151 * @throws IllegalSymbolException if as is not known in this alphabet 152 */ 153 public int getMaxCounts(AtomicSymbol as) 154 throws IllegalSymbolException { 155 int c = maxCounts[index.indexForSymbol(as)]; 156 if(c == -1) { 157 return length; 158 } else { 159 return c; 160 } 161 } 162 163 public BioMatcher matcher(SymbolList symList) 164 throws IllegalAlphabetException { 165 if(symList.getAlphabet() != index.getAlphabet()) { 166 throw new IllegalAlphabetException( 167 "Attempted to search symbol list over " + symList.getAlphabet() + 168 " but the search parameters only accept " + index.getAlphabet() ); 169 } 170 171 int[] minCounts = new int[this.minCounts.length]; 172 int[] maxCounts = new int[this.maxCounts.length]; 173 for(int i = 0; i < minCounts.length; i++) { 174 minCounts[i] = this.minCounts[i]; 175 176 int c = this.maxCounts[i]; 177 maxCounts[i] = (c == -1) ? length : c; 178 } 179 180 return new SeqContentMatcher( 181 symList, 182 index, 183 minCounts, 184 maxCounts, 185 length ); 186 } 187} 188