package org.biojava.bio.program.tagvalue;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.biojava.utils.ParserException;

/**
 * <p>
 * A TagValueParser that splits a line based upon a regular expression. There
 * are configuration parameters analogous to those in LineSplitParser for
 * configuring parsing details.
 * </p>
 *
 * <p>
 * A pattern, a tag group and a value group must be configured before
 * {@link #parse(Object)} is called: the pattern defaults to <code>null</code>
 * and both group numbers default to <code>-1</code>, neither of which can be
 * used for matching.
 * </p>
 *
 * <p>
 * The parser remembers the most recent tag between calls to
 * {@link #parse(Object)} so that continuation lines and repeated tags can be
 * recognised. An instance therefore carries state from one line to the next.
 * </p>
 *
 * @author Matthew Pocock
 * @author Keith James (enabled empty line EOR)
 * @since 1.3
 */
public class RegexParser
  implements
    TagValueParser
{
  private Pattern pattern = null;

  private int tagGroup = -1;

  private int valueGroup = -1;

  private String endOfRecord = null;

  private boolean trimTag = false;

  private boolean trimValue = false;

  private boolean continueOnEmptyTag = false;

  private boolean mergeSameTag = false;

  /**
   * The tag of the most recent line that started a new tag, or
   * <code>null</code> if no such line has been parsed yet. It is substituted
   * for empty tags when continueOnEmptyTag is set and compared against the
   * current tag when mergeSameTag is set.
   */
  private String tag;

  /**
   * Create a new RegexParser with all boolean values set to false.
   */
  public RegexParser() {}

  /**
   * Set the Pattern used to split lines.
   *
   * @param pattern  the Pattern used to split lines
   */
  public void setPattern(Pattern pattern) {
    this.pattern = pattern;
  }

  /**
   * Get the Pattern currently used to split lines.
   *
   * @return the current Pattern
   */
  public Pattern getPattern() {
    return pattern;
  }

  /**
   * Set the group number that will match the tag.
   *
   * @param group  the tag group number
   */
  public void setTagGroup(int group) {
    this.tagGroup = group;
  }

  /**
   * Get the group number that matches the tag.
   *
   * @return the tag group number
   */
  public int getTagGroup() {
    return tagGroup;
  }

  /**
   * Set the group number that will match the value.
   *
   * @param group  the value group number
   */
  public void setValueGroup(int group) {
    this.valueGroup = group;
  }

  /**
   * Get the group number that matches the value.
   *
   * @return the value group number
   */
  public int getValueGroup() {
    return valueGroup;
  }

  /**
   * Set the explicit end-of-record string.
   *
   * @param endOfRecord  the new endOfRecord String
   */
  public void setEndOfRecord(String endOfRecord) {
    this.endOfRecord = endOfRecord;
  }

  /**
   * Get the explicit end-of-record string.
   *
   * @return the current endOfRecord String
   */
  public String getEndOfRecord() {
    return endOfRecord;
  }

  /**
   * Enable trimming of the tag using String.trim().
   *
   * @param trimTag  true if tags should be trimmed, false otherwise
   */
  public void setTrimTag(boolean trimTag) {
    this.trimTag = trimTag;
  }

  /**
   * See if trimming of tags is enabled.
   *
   * @return true if tag trimming is enabled, false otherwise
   */
  public boolean getTrimTag() {
    return trimTag;
  }

  /**
   * Enable trimming of the value using String.trim().
   *
   * @param trimValue  true if values should be trimmed, false otherwise
   */
  public void setTrimValue(boolean trimValue) {
    this.trimValue = trimValue;
  }

  /**
   * See if trimming of values is enabled.
   *
   * @return true if value trimming is enabled, false otherwise
   */
  public boolean getTrimValue() {
    return trimValue;
  }

  /**
   * Decide whether to treat empty tags as continuations of the previous
   * non-empty tag.
   *
   * @param continueOnEmptyTag  true if empty tags should be replaced, false
   *        otherwise
   */
  public void setContinueOnEmptyTag(boolean continueOnEmptyTag) {
    this.continueOnEmptyTag = continueOnEmptyTag;
  }

  /**
   * Report whether empty tags will be treated as continuations of the last
   * non-empty tag.
   *
   * @return true if empty tags will be replaced, false otherwise
   */
  public boolean getContinueOnEmptyTag() {
    return continueOnEmptyTag;
  }

  /**
   * Decide if multiple examples of a single tag should be merged into a single
   * start/endTag pair with multiple values, or multiple start/endTag pairs each
   * with a single value.
   *
   * @param mergeSameTag  true if tags will be merged, false otherwise
   */
  public void setMergeSameTag(boolean mergeSameTag) {
    this.mergeSameTag = mergeSameTag;
  }

  /**
   * Report whether multiple examples of a single tag will be merged into a
   * single start/endTag pair with multiple values.
   *
   * @return true if tags will be merged, false otherwise
   */
  public boolean getMergeSameTag() {
    return mergeSameTag;
  }

  /**
   * Split one line into a tag and a value using the configured pattern.
   *
   * <p>
   * The line is obtained from <code>o.toString()</code>. If it is an
   * end-of-record marker, <code>null</code> is returned. Otherwise the
   * pattern is searched for anywhere in the line (<code>Matcher.find()</code>,
   * not a whole-line match) and the tag and value are read from the
   * configured groups, each optionally trimmed.
   * </p>
   *
   * <p>
   * NOTE(review): if a configured group exists in the pattern but did not
   * take part in the match, <code>Matcher.group(int)</code> returns
   * <code>null</code>; this method does not guard against that and later
   * trimming or comparison will then fail.
   * </p>
   *
   * @param o  the line to parse; its string form is used
   * @return the parsed TagValue, or <code>null</code> if the line marks the
   *         end of a record
   * @throws ParserException  if the pattern cannot be found in the line
   * @throws IllegalStateException  if no pattern has been set
   */
  public TagValue parse(Object o)
  throws ParserException {
    String line = o.toString();

    if (isEndOfRecord(line)) {
      return null;
    }

    // Checked after the end-of-record test so that marker lines are still
    // recognised by a parser that has no pattern yet.
    if (pattern == null) {
      throw new IllegalStateException(
        "No pattern has been set; call setPattern() before parse()"
      );
    }

    Matcher matcher = pattern.matcher(line);
    if (!matcher.find()) {
      throw new ParserException("Could not match " + pattern.pattern() + " to " + line);
    }

    String matchedTag = matcher.group(tagGroup);
    if (trimTag) {
      matchedTag = matchedTag.trim();
    }

    String value = matcher.group(valueGroup);
    if (trimValue) {
      value = value.trim();
    }

    // NOTE(review): the third TagValue constructor argument is presumably a
    // "starts a new tag" flag - confirm against TagValue.
    if (continueOnEmptyTag && (matchedTag.length() == 0)) {
      // Continuation line: report the value under the remembered tag.
      return new TagValue(this.tag, value, false);
    }

    if (mergeSameTag && matchedTag.equals(this.tag)) {
      // Same tag as the previous one: report it without starting a new tag.
      return new TagValue(matchedTag, value, false);
    }

    // A new tag: remember it for the following lines.
    this.tag = matchedTag;
    return new TagValue(matchedTag, value, true);
  }

  /**
   * Decide whether a line marks the end of a record.
   *
   * <p>
   * When the end-of-record string equals
   * <code>TagValueParser.EMPTY_LINE_EOR</code> the whole line must equal that
   * marker; for any other end-of-record string the line only has to start
   * with it. The marker is compared by value, not by identity, so an
   * equal string obtained elsewhere (for example read from configuration) is
   * handled the same way as the constant itself.
   * </p>
   *
   * @param line  the line to test
   * @return true if the line is an end-of-record marker, false otherwise
   *         (including when no end-of-record string is set)
   */
  private boolean isEndOfRecord(String line) {
    if (endOfRecord == null) {
      return false;
    }

    // Use of the special value for the EOR marker allows a blank line
    // to be used to delimit records. Many file formats are like this.
    // NOTE(review): EMPTY_LINE_EOR is declared outside this file; if it is
    // the empty string, a prefix test against it would accept every line,
    // which is why this case needs a whole-line comparison.
    if (endOfRecord.equals(TagValueParser.EMPTY_LINE_EOR)) {
      return line.equals(TagValueParser.EMPTY_LINE_EOR);
    }

    return line.startsWith(endOfRecord);
  }
}