001
002
003package org.biojava.utils.regex;
004
005import org.biojava.bio.seq.io.SymbolListCharSequence;
006import org.biojava.bio.symbol.SymbolList;
007
008/**
009 * This class is analogous to java.util.Matcher except that it works
010 * on SymbolLists instead of Strings.  All coordinates are in the 1-based
011 * coordinate system used by SymbolLists.
012 *
013 * @author David Huen
014 * @since 1.4
015 */
016public class Matcher
017{
018    private org.biojava.utils.regex.Pattern pattern;
019    private java.util.regex.Matcher matcher;
020    private SymbolList sl;
021
022    Matcher(org.biojava.utils.regex.Pattern pattern, SymbolList sl)
023    {
024        this.pattern = pattern;
025        this.sl = sl;
026
027        matcher = pattern.getPattern().matcher(new SymbolListCharSequence(sl));
028    }
029
030    /**
031     * Returns the index of the last character matched, plus one.
032     * @return The index of the last character matched, plus one.
033     */
034    public int end() { return matcher.end() + 1; }
035    /**
036     * Returns the index of the last Symbol, plus one, 
037     * of the subsequence captured by the given group during the previous match operation.
038     * <p>
039     * Capturing groups are indexed from left to right, starting at one. 
040     * Group zero denotes the entire pattern, so the expression m.end(0) is equivalent to m.end().
041     * @param group The index of a capturing group in this matcher's pattern.
042     * @return The index of the last Symbol captured by the group, plus one, 
043     * or -1 if the match was successful but the group itself did not match anything.
044     */
045    public int end(int group) throws IndexOutOfBoundsException 
046    {
047        int pos = matcher.end(group);
048        if (pos == -1)
049            return pos;
050        else
051            return pos + 1;
052    }
053
054    /**
055     * Attempts to find the next subsequence of the input sequence that matches the pattern.
056     * <p>
057     * This method starts at the beginning of the input sequence or, 
058     * if a previous invocation of the method was successful and the matcher 
059     * has not since been reset, at the first Symbol not matched by the previous match.
060     * If the match succeeds then more information can be obtained via the start, end, and group methods. 
061     * @return true if, and only if, a subsequence of the input sequence matches this matcher's pattern.
062     */
063    public boolean find() { return matcher.find(); }
064
065    /**
066     * Resets this matcher and then attempts to find the next subsequence 
067     * of the input sequence that matches the pattern, starting at the specified index.
068     * <p>
069     * If the match succeeds then more information can be obtained via the start, 
070     * end, and group methods, and subsequent invocations of the find() method 
071     * will start at the first Symbol not matched by this match. 
072     * @return true if, and only if, a subsequence of the input sequence 
073     * starting at the given index matches this matcher's pattern.
074     */
075    public boolean find(int start) throws IndexOutOfBoundsException { return matcher.find(start - 1); }
076    /**
077     * Returns the input subsequence matched by the previous match.
078     * <p>
079     * For a matcher m with input sequence s, the expressions m.group() 
080     * and s.substring(m.start(), m.end()) are equivalent.
081     * Note that some patterns, for example a*, match the empty SymbolList. 
082     * This method will return the empty string when the pattern successfully matches the empty string in the input. 
083     * @return The (possibly empty) subsequence matched by the previous match, in SymbolList form.
084     */
085    public SymbolList group()
086    {
087        return sl.subList(start(), end() - 1);
088    }
089
090    /**
091     * Returns the input subsequence captured by the given group during the previous match operation.
092     * <p>
093     * For a matcher m, input sequence s, and group index g, the expressions 
094     * m.group(g) and s.substring(m.start(g), m.end(g)) are equivalent.
095     * Capturing groups are indexed from left to right, starting at one. 
096     * Group zero denotes the entire pattern, so the expression m.group(0) is equivalent to m.group().
097     * If the match was successful but the group specified failed to match 
098     * any part of the input sequence, then null is returned. 
099     * Note that some groups, for example (a*), match the empty string. 
100     * This method will return the empty string when such a group successfully matches the emtpy string in the input. 
101     * @return The (possibly empty) subsequence captured by the group during the previous match, 
102     * or null if the group failed to match part of the input.
103     */
104    public SymbolList group(int group)
105        throws IndexOutOfBoundsException
106    {
107        int start = matcher.start(group);
108        int end = matcher.end(group);
109        if ((start == -1) && (end == -1)) return null;
110        else
111            return sl.subList(start(group), end(group) - 1);
112    }
113
114    /**
115     * Returns the number of capturing groups in this matcher's pattern.
116     * <p>
117     * Any non-negative integer smaller than the value returned 
118     * by this method is guaranteed to be a valid group index for this matcher. 
119     * @return The number of capturing groups in this matcher's pattern.
120     */
121    public int groupCount() { return matcher.groupCount(); }
122
123    /**
124     * Attempts to match the input SymbolList, starting at the beginning, against the pattern.
125     * <p>
126     * Like the matches method, this method always starts at the 
127     * beginning of the input sequence; unlike that method, 
128     * it does not require that the entire input sequence be matched.
129     * If the match succeeds then more information can be obtained via the start, end, and group methods.
130     * @return true if, and only if, a prefix of the input sequence matches this matcher's pattern.
131     */
132    public boolean lookingAt() { return matcher.lookingAt(); }
133
134    /**
135     * Attempts to match the entire input sequence against the pattern.
136     * <p>
137     * If the match succeeds then more information can be obtained via the start, end, and group methods. 
138     * @return true if, and only if, the entire input sequence matches this matcher's pattern.
139     */
140    public boolean matches() { return matcher.matches(); }
141
142    /**
143     * Returns the Pattern object that compiled this Matcher.
144     */
145    public org.biojava.utils.regex.Pattern pattern()
146    {
147        return pattern;
148    }
149
150    /**
151     * Resets this matcher.
152     * <p>
153     * Resetting a matcher discards all of its explicit state information and sets its append position to zero. 
154     * @return this matcher.
155     */
156    public org.biojava.utils.regex.Matcher reset()
157    {
158        matcher = matcher.reset();
159        return this;
160    }
161
162    /**
163     * Resets this matcher with a new input SymbolList.
164     * <p>
165     * Resetting a matcher discards all of its explicit state information and sets its append position to zero. 
166     * @return this matcher.
167     */
168    public org.biojava.utils.regex.Matcher reset(SymbolList sl)
169    {
170        this.sl = sl;
171        matcher = matcher.reset(new SymbolListCharSequence(sl));
172        return this;
173    }
174
175    /**
176     * Returns the start index of the previous match.
177     * @return The index of the first Symbol matched.
178     */
179    public int start() { return matcher.start() + 1; }
180    /**
181     * Returns the start index of the subsequence captured by the given group during the previous match operation.
182     * <p>
183     * Capturing groups are indexed from left to right, starting at one. 
184     * Group zero denotes the entire pattern, so the expression m.start(0) is equivalent to m.start(). 
185     * @param group The index of a capturing group in this matcher's pattern.
186     * @return The index of the first character captured by the group, or -1 if the match was successful 
187     * but the group itself did not match anything.
188     */
189    public int start(int group) 
190    {
191        int pos = matcher.start(group);
192        if (pos == -1)
193            return pos;
194        else
195            return pos + 1;
196    }
197
198}
199