001 002 003 004package org.biojava.utils.regex; 005 006import java.util.HashSet; 007import java.util.Iterator; 008import java.util.Set; 009 010import org.biojava.bio.symbol.FiniteAlphabet; 011import org.biojava.bio.symbol.IllegalAlphabetException; 012import org.biojava.bio.symbol.IllegalSymbolException; 013import org.biojava.bio.symbol.Symbol; 014import org.biojava.bio.symbol.SymbolList; 015 016/** 017 * A utility class to make searching a Sequence with many regex patterns 018 * easier. 019 * @author David Huen 020 * @since 1.4 021 */ 022public class Search 023{ 024 /** 025 * Interface for a class that will recieve match information 026 * from this class. 027 */ 028 public interface Listener 029 { 030 /** 031 * @param seq Sequence on which the search was conducted. 032 * @param pattern Pattern object used to conduct search. 033 * @param start start coordinate of match. 034 * @param end end of match plus one. 035 * @return if false, it indicates the Listener will not accept further data. 036 */ 037 public boolean reportMatch(SymbolList seq, Pattern pattern, int start, int end); 038 } 039 040 private class PatternInfo 041 { 042 private String patternString; 043 private Pattern pattern = null; 044 private Matcher matcher = null; 045 private boolean overlap; 046 047 public boolean equals(Object o) 048 { 049 if (o instanceof PatternInfo) { 050 PatternInfo other = (PatternInfo) o; 051 return patternString.equals(other.patternString); 052 } 053 else 054 return false; 055 } 056 057 public int hashCode() 058 { 059 return patternString.hashCode(); 060 } 061 } 062 063 private Listener listener = null; 064 private PatternFactory factory; 065 private Set patterns = new HashSet(); 066 067 public Search(FiniteAlphabet alfa) 068 { 069 factory = PatternFactory.makeFactory(alfa); 070 } 071 072 public void setListener(Listener listener) 073 { 074 this.listener = listener; 075 } 076 077 /** 078 * add a search pattern to the searches to be conducted 079 * by this object. 080 * @param patternString String representation of the pattern. 081 * @param overlap if true, the search continues at the base following the start to the previous hit. 082 * If false, it continues at the base after the existing hit. 083 * @throws RegexException if the requested pattern is not valid 084 * @throws IllegalAlphabetException if the requested pattern is not valid 085 */ 086 public void addPattern(String patternString, boolean overlap) 087 throws RegexException, IllegalAlphabetException 088 { 089 Pattern pattern = factory.compile(patternString); 090 PatternInfo info = new PatternInfo(); 091 info.patternString = patternString; 092 info.pattern = pattern; 093 info.overlap = overlap; 094 patterns.add(info); 095 } 096 097 /** 098 * add a search pattern to the searches to be conducted 099 * by this object. 100 * @param patternString String representation of the pattern. 101 * @param overlap if true, the search continues at the base following the start to the previous hit. 102 * If false, it continues at the base after the existing hit. 103 * @throws RegexException if the requested pattern is not valid 104 * @throws IllegalAlphabetException if the requested pattern is not valid 105 */ 106 public void addPattern(String label, String patternString, boolean overlap) 107 throws RegexException, IllegalAlphabetException 108 { 109 Pattern pattern = factory.compile(patternString, label); 110 PatternInfo info = new PatternInfo(); 111 info.patternString = patternString; 112 info.pattern = pattern; 113 info.overlap = overlap; 114 patterns.add(info); 115 } 116 117 /** 118 * remove all patterns from the pattern cache. 119 */ 120 public void clearPatterns() 121 { 122 patterns.clear(); 123 } 124 125 public char charValue(Symbol sym) 126 throws IllegalSymbolException 127 { 128 return factory.charValue(sym); 129 } 130 131 /** 132 * search the Sequence with the patterns already registered with this object. 133 */ 134 public void search(SymbolList seq) 135 { 136 search(seq, 1, seq.length()); 137 } 138 139 /** 140 * search part of the SymbolList with the patterns already registered with this object. 141 * @param loLimit low limit of search range. 142 * @param hiLimit high limit of search range. 143 */ 144 public void search(SymbolList seq, int loLimit, int hiLimit) 145 { 146 for (Iterator patternsI = patterns.iterator(); patternsI.hasNext(); ) { 147 PatternInfo info = (PatternInfo) patternsI.next(); 148 if (info.matcher == null) { 149 info.matcher = info.pattern.matcher(seq); 150 } 151 else { 152 info.matcher = info.matcher.reset(seq); 153 } 154 155 // now exhaustively search the sequence 156 int begin = loLimit; 157 while (info.matcher.find(begin)) { 158 // got a hit 159 int start = info.matcher.start(); 160 int end = info.matcher.end(); 161 if ((listener != null) && (start <= hiLimit)) 162 if (!listener.reportMatch(seq, info.pattern, start, end)) return; 163 // compute where next search begins 164 if (info.overlap) 165 begin = start + 1; 166 else 167 begin = end; 168 if (begin >= hiLimit) break; 169 } 170 } 171 } 172} 173