001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.search;
023
024import org.biojava.bio.symbol.BasisSymbol;
025import org.biojava.bio.symbol.Symbol;
026import org.biojava.bio.symbol.SymbolList;
027
028/**
029 * A BioMatcher class returned by MaxMismatchPattern.matcher() that implements
030 * searching of a SymbolList.
031 * <p>
032 * This class is public only to allow access to the mismatchCount() method.
033 *
034 * @author Matthew Pocock (wrote original MaxMissmatchMatcher class)
035 * @author David Huen (debugging and extension of functionality)
036 */
037public class MaxMismatchMatcher
038implements BioMatcher {
039    // primary data
040    private final Symbol [] patternSymbol; // an array containing the pattern symbols in reversed order
041    private final SymbolList seq;
042
043    // precomputed constants
044    private final int patLength;
045    private final int maxPatternSymbolIdx;
046    private final int seqLength;
047    private final int minMatches;
048    private final boolean [] ambiguousPosition; // indicates if the corresponding position in patternSymbol represents
049                                                // an ambiguity
050    private boolean hasAmbiguity =false;
051
052    // working numbers
053    private final int[] matches; // this is a modulo'd rotating register that keeps track of the match count
054    private int pos; // this is the symbol in sequence that will be added for consideration
055
056    MaxMismatchMatcher(SymbolList seq,
057                      SymbolList pattern,
058                      int mismatches)
059    {
060        this.seq = seq;
061
062        patLength = pattern.length();
063        maxPatternSymbolIdx = patLength - 1;
064        seqLength = seq.length();
065        minMatches = patLength - mismatches;
066
067        // construct reversed pattern array
068        patternSymbol = new Symbol[patLength];
069        ambiguousPosition = new boolean[patLength];
070        for (int i=0; i < patLength; i++) {
071            patternSymbol[i] = pattern.symbolAt(patLength - i);
072            if (patternSymbol[i] instanceof BasisSymbol) {
073                ambiguousPosition[i] = true;
074                hasAmbiguity = true;
075            }
076        }
077
078        // initialize matches
079        matches = new int[patLength];
080        for(int i = 0; i < matches.length; i++) matches[i] = 0;
081
082        pos = 1;
083    }
084
085    public boolean find() 
086    {
087
088        if (pos >= seq.length()) {
089            return false;
090        }
091
092        for (; pos <= seqLength; pos++) {
093            if (addSymbol()) {
094                pos++;
095                return true;
096            }
097        }
098
099        return false;
100    }
101
102    // computes consequences of symbol pointed to by pos
103    private boolean addSymbol()
104    {
105        Symbol sym = seq.symbolAt(pos);
106        // compute matches for continuation of match
107        if (hasAmbiguity) {
108            for (int i=0; i < maxPatternSymbolIdx; i++) {
109                int idx = (pos + i) % patLength;
110                if (ambiguousPosition[i]) {
111                    if (patternSymbol[i].getMatches().contains(sym))
112                        matches[idx]++;
113                }
114                else {
115                    if (sym == patternSymbol[i])
116                        matches[idx]++;
117                }
118            }
119        }
120        else {
121            for (int i=0; i < maxPatternSymbolIdx; i++) {
122                int idx = (pos + i) % patLength;
123                if (sym == patternSymbol[i])
124                    matches[idx]++;
125            }
126        }
127
128        // initialise and compute initial match
129        matches[(pos + maxPatternSymbolIdx) % patLength] = (sym == patternSymbol[maxPatternSymbolIdx])?1:0;
130
131        return matches[pos % patLength] >= minMatches;
132    }
133
134    public int start() {
135        // remember that pos is already incremented beyond the match
136        return pos - matches.length;
137    }
138
139    public int end() {
140        // remember that pos is already incremented beyond the match
141        return pos - 1;
142    }
143
144    public SymbolList group() {
145        return seq.subList(start(), end());
146    }
147
148    /**
149     * Returns number of mismatches
150     */
151    public int mismatchCount() { return patLength - matches[(pos - 1) % patLength]; }
152}
153
154