001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.search; 023 024import org.biojava.bio.symbol.BasisSymbol; 025import org.biojava.bio.symbol.Symbol; 026import org.biojava.bio.symbol.SymbolList; 027 028/** 029 * A BioMatcher class returned by MaxMismatchPattern.matcher() that implements 030 * searching of a SymbolList. 031 * <p> 032 * This class is public only to allow access to the mismatchCount() method. 033 * 034 * @author Matthew Pocock (wrote original MaxMissmatchMatcher class) 035 * @author David Huen (debugging and extension of functionality) 036 */ 037public class MaxMismatchMatcher 038implements BioMatcher { 039 // primary data 040 private final Symbol [] patternSymbol; // an array containing the pattern symbols in reversed order 041 private final SymbolList seq; 042 043 // precomputed constants 044 private final int patLength; 045 private final int maxPatternSymbolIdx; 046 private final int seqLength; 047 private final int minMatches; 048 private final boolean [] ambiguousPosition; // indicates if the corresponding position in patternSymbol represents 049 // an ambiguity 050 private boolean hasAmbiguity =false; 051 052 // working numbers 053 private final int[] matches; // this is a modulo'd rotating register that keeps track of the match count 054 private int pos; // this is the symbol in sequence that will be added for consideration 055 056 MaxMismatchMatcher(SymbolList seq, 057 SymbolList pattern, 058 int mismatches) 059 { 060 this.seq = seq; 061 062 patLength = pattern.length(); 063 maxPatternSymbolIdx = patLength - 1; 064 seqLength = seq.length(); 065 minMatches = patLength - mismatches; 066 067 // construct reversed pattern array 068 patternSymbol = new Symbol[patLength]; 069 ambiguousPosition = new boolean[patLength]; 070 for (int i=0; i < patLength; i++) { 071 patternSymbol[i] = pattern.symbolAt(patLength - i); 072 if (patternSymbol[i] instanceof BasisSymbol) { 073 ambiguousPosition[i] = true; 074 hasAmbiguity = true; 075 } 076 } 077 078 // initialize matches 079 matches = new int[patLength]; 080 for(int i = 0; i < matches.length; i++) matches[i] = 0; 081 082 pos = 1; 083 } 084 085 public boolean find() 086 { 087 088 if (pos >= seq.length()) { 089 return false; 090 } 091 092 for (; pos <= seqLength; pos++) { 093 if (addSymbol()) { 094 pos++; 095 return true; 096 } 097 } 098 099 return false; 100 } 101 102 // computes consequences of symbol pointed to by pos 103 private boolean addSymbol() 104 { 105 Symbol sym = seq.symbolAt(pos); 106 // compute matches for continuation of match 107 if (hasAmbiguity) { 108 for (int i=0; i < maxPatternSymbolIdx; i++) { 109 int idx = (pos + i) % patLength; 110 if (ambiguousPosition[i]) { 111 if (patternSymbol[i].getMatches().contains(sym)) 112 matches[idx]++; 113 } 114 else { 115 if (sym == patternSymbol[i]) 116 matches[idx]++; 117 } 118 } 119 } 120 else { 121 for (int i=0; i < maxPatternSymbolIdx; i++) { 122 int idx = (pos + i) % patLength; 123 if (sym == patternSymbol[i]) 124 matches[idx]++; 125 } 126 } 127 128 // initialise and compute initial match 129 matches[(pos + maxPatternSymbolIdx) % patLength] = (sym == patternSymbol[maxPatternSymbolIdx])?1:0; 130 131 return matches[pos % patLength] >= minMatches; 132 } 133 134 public int start() { 135 // remember that pos is already incremented beyond the match 136 return pos - matches.length; 137 } 138 139 public int end() { 140 // remember that pos is already incremented beyond the match 141 return pos - 1; 142 } 143 144 public SymbolList group() { 145 return seq.subList(start(), end()); 146 } 147 148 /** 149 * Returns number of mismatches 150 */ 151 public int mismatchCount() { return patLength - matches[(pos - 1) % patLength]; } 152} 153 154