001 002 003package org.biojava.utils.regex; 004 005import org.biojava.bio.seq.io.SymbolListCharSequence; 006import org.biojava.bio.symbol.SymbolList; 007 008/** 009 * This class is analogous to java.util.Matcher except that it works 010 * on SymbolLists instead of Strings. All coordinates are in the 1-based 011 * coordinate system used by SymbolLists. 012 * 013 * @author David Huen 014 * @since 1.4 015 */ 016public class Matcher 017{ 018 private org.biojava.utils.regex.Pattern pattern; 019 private java.util.regex.Matcher matcher; 020 private SymbolList sl; 021 022 Matcher(org.biojava.utils.regex.Pattern pattern, SymbolList sl) 023 { 024 this.pattern = pattern; 025 this.sl = sl; 026 027 matcher = pattern.getPattern().matcher(new SymbolListCharSequence(sl)); 028 } 029 030 /** 031 * Returns the index of the last character matched, plus one. 032 * @return The index of the last character matched, plus one. 033 */ 034 public int end() { return matcher.end() + 1; } 035 /** 036 * Returns the index of the last Symbol, plus one, 037 * of the subsequence captured by the given group during the previous match operation. 038 * <p> 039 * Capturing groups are indexed from left to right, starting at one. 040 * Group zero denotes the entire pattern, so the expression m.end(0) is equivalent to m.end(). 041 * @param group The index of a capturing group in this matcher's pattern. 042 * @return The index of the last Symbol captured by the group, plus one, 043 * or -1 if the match was successful but the group itself did not match anything. 044 */ 045 public int end(int group) throws IndexOutOfBoundsException 046 { 047 int pos = matcher.end(group); 048 if (pos == -1) 049 return pos; 050 else 051 return pos + 1; 052 } 053 054 /** 055 * Attempts to find the next subsequence of the input sequence that matches the pattern. 056 * <p> 057 * This method starts at the beginning of the input sequence or, 058 * if a previous invocation of the method was successful and the matcher 059 * has not since been reset, at the first Symbol not matched by the previous match. 060 * If the match succeeds then more information can be obtained via the start, end, and group methods. 061 * @return true if, and only if, a subsequence of the input sequence matches this matcher's pattern. 062 */ 063 public boolean find() { return matcher.find(); } 064 065 /** 066 * Resets this matcher and then attempts to find the next subsequence 067 * of the input sequence that matches the pattern, starting at the specified index. 068 * <p> 069 * If the match succeeds then more information can be obtained via the start, 070 * end, and group methods, and subsequent invocations of the find() method 071 * will start at the first Symbol not matched by this match. 072 * @return true if, and only if, a subsequence of the input sequence 073 * starting at the given index matches this matcher's pattern. 074 */ 075 public boolean find(int start) throws IndexOutOfBoundsException { return matcher.find(start - 1); } 076 /** 077 * Returns the input subsequence matched by the previous match. 078 * <p> 079 * For a matcher m with input sequence s, the expressions m.group() 080 * and s.substring(m.start(), m.end()) are equivalent. 081 * Note that some patterns, for example a*, match the empty SymbolList. 082 * This method will return the empty string when the pattern successfully matches the empty string in the input. 083 * @return The (possibly empty) subsequence matched by the previous match, in SymbolList form. 084 */ 085 public SymbolList group() 086 { 087 return sl.subList(start(), end() - 1); 088 } 089 090 /** 091 * Returns the input subsequence captured by the given group during the previous match operation. 092 * <p> 093 * For a matcher m, input sequence s, and group index g, the expressions 094 * m.group(g) and s.substring(m.start(g), m.end(g)) are equivalent. 095 * Capturing groups are indexed from left to right, starting at one. 096 * Group zero denotes the entire pattern, so the expression m.group(0) is equivalent to m.group(). 097 * If the match was successful but the group specified failed to match 098 * any part of the input sequence, then null is returned. 099 * Note that some groups, for example (a*), match the empty string. 100 * This method will return the empty string when such a group successfully matches the emtpy string in the input. 101 * @return The (possibly empty) subsequence captured by the group during the previous match, 102 * or null if the group failed to match part of the input. 103 */ 104 public SymbolList group(int group) 105 throws IndexOutOfBoundsException 106 { 107 int start = matcher.start(group); 108 int end = matcher.end(group); 109 if ((start == -1) && (end == -1)) return null; 110 else 111 return sl.subList(start(group), end(group) - 1); 112 } 113 114 /** 115 * Returns the number of capturing groups in this matcher's pattern. 116 * <p> 117 * Any non-negative integer smaller than the value returned 118 * by this method is guaranteed to be a valid group index for this matcher. 119 * @return The number of capturing groups in this matcher's pattern. 120 */ 121 public int groupCount() { return matcher.groupCount(); } 122 123 /** 124 * Attempts to match the input SymbolList, starting at the beginning, against the pattern. 125 * <p> 126 * Like the matches method, this method always starts at the 127 * beginning of the input sequence; unlike that method, 128 * it does not require that the entire input sequence be matched. 129 * If the match succeeds then more information can be obtained via the start, end, and group methods. 130 * @return true if, and only if, a prefix of the input sequence matches this matcher's pattern. 131 */ 132 public boolean lookingAt() { return matcher.lookingAt(); } 133 134 /** 135 * Attempts to match the entire input sequence against the pattern. 136 * <p> 137 * If the match succeeds then more information can be obtained via the start, end, and group methods. 138 * @return true if, and only if, the entire input sequence matches this matcher's pattern. 139 */ 140 public boolean matches() { return matcher.matches(); } 141 142 /** 143 * Returns the Pattern object that compiled this Matcher. 144 */ 145 public org.biojava.utils.regex.Pattern pattern() 146 { 147 return pattern; 148 } 149 150 /** 151 * Resets this matcher. 152 * <p> 153 * Resetting a matcher discards all of its explicit state information and sets its append position to zero. 154 * @return this matcher. 155 */ 156 public org.biojava.utils.regex.Matcher reset() 157 { 158 matcher = matcher.reset(); 159 return this; 160 } 161 162 /** 163 * Resets this matcher with a new input SymbolList. 164 * <p> 165 * Resetting a matcher discards all of its explicit state information and sets its append position to zero. 166 * @return this matcher. 167 */ 168 public org.biojava.utils.regex.Matcher reset(SymbolList sl) 169 { 170 this.sl = sl; 171 matcher = matcher.reset(new SymbolListCharSequence(sl)); 172 return this; 173 } 174 175 /** 176 * Returns the start index of the previous match. 177 * @return The index of the first Symbol matched. 178 */ 179 public int start() { return matcher.start() + 1; } 180 /** 181 * Returns the start index of the subsequence captured by the given group during the previous match operation. 182 * <p> 183 * Capturing groups are indexed from left to right, starting at one. 184 * Group zero denotes the entire pattern, so the expression m.start(0) is equivalent to m.start(). 185 * @param group The index of a capturing group in this matcher's pattern. 186 * @return The index of the first character captured by the group, or -1 if the match was successful 187 * but the group itself did not match anything. 188 */ 189 public int start(int group) 190 { 191 int pos = matcher.start(group); 192 if (pos == -1) 193 return pos; 194 else 195 return pos + 1; 196 } 197 198} 199