001package org.biojava.bio.program.tagvalue;
002
003import java.util.regex.Matcher;
004import java.util.regex.Pattern;
005
006import org.biojava.utils.ParserException;
007
008/**
009 * <p>
010 * A TagValueParser that splits a line based upon a regular expression. There
011 * are configuration parameters analgous to those in LineSplitParser for
012 * configuring parsing details.
013 * </p>
014 *
015 * @author Matthew Pocock
016 * @author Keith James (enabled empty line EOR)
017 * @since 1.3
018 */
019public class RegexParser
020  implements
021    TagValueParser
022{
023  private Pattern pattern = null;
024  
025  private int tagGroup = -1;
026  
027  private int valueGroup = -1;
028  
029  private String endOfRecord = null;
030  
031  private boolean trimTag = false;
032  
033  private boolean trimValue = false;
034
035  private boolean continueOnEmptyTag = false;
036  
037  private boolean mergeSameTag = false;
038  
039  private String tag;
040  
041  /**
042   * Create a new RegexParser with all boolean values set to false.
043   */
044  public RegexParser() {}
045  
046  /** 
047   * Set the Pattern used to split lines.
048   *
049   * @param pattern  the Pattern used to split lines
050   */
051  public void setPattern(Pattern pattern) {
052    this.pattern = pattern;
053  }
054  
055  /**
056   * Get the Pattern currently used to split lines.
057   *
058   * @return the current Pattern
059   */
060  public Pattern getPattern() {
061    return pattern;
062  }
063  
064  /**
065   * Set the group number that will match the tag.
066   *
067   * @param group the tag group number
068   */
069  public void setTagGroup(int group) {
070    this.tagGroup = group;
071  }
072  
073  /**
074   * Get the group number that matches the tag.
075   *
076   * @return the tag group number
077   */
078  public int getTagGroup() {
079    return tagGroup;
080  }
081
082  /**
083   * Set the group number that will match the value.
084   *
085   * @param group the value group number
086   */
087  public void setValueGroup(int group) {
088    this.valueGroup = group;
089  }
090  
091  /**
092   * Get the group number that matches the value.
093   *
094   * @return the value group number
095   */
096  public int getValueGroup() {
097    return valueGroup;
098  }
099  
100  /**
101   * Set the explicit end-of-record string.
102   *
103   * @param endOfRecord  the new endOfRecord String
104   */
105  public void setEndOfRecord(String endOfRecord) {
106    this.endOfRecord = endOfRecord;
107  }
108  
109  /**
110   * Get the explicit end-of-record string.
111   *
112   * @return  the current endOfRecord String
113   */
114  public String getEndOfRecord() {
115    return endOfRecord;
116  }
117  
118  /**
119   * Enable trimming of the tag using String.trim().
120   *
121   * @param trimTag  true if tags should be trimmed, false otherwise
122   */
123  public void setTrimTag(boolean trimTag) {
124    this.trimTag = trimTag;
125  }
126  
127  /**
128   * See if trimming of tags is enabled.
129   *
130   * @return true if tag trimming is enabled, false otherwise
131   */
132  public boolean getTrimTag() {
133    return trimTag;
134  }
135
136  /**
137   * Enable trimming of the value using String.trim().
138   *
139   * @param trimValue  true if values should be trimmed, false otherwise
140   */
141  public void setTrimValue(boolean trimValue) {
142    this.trimValue = trimValue;
143  }
144  
145  /**
146   * See if trimming of values is enabled.
147   *
148   * @return true if value trimming is enabled, false otherwise
149   */
150  public boolean getTrimValue() {
151    return trimValue;
152  }
153
154  /**
155   * Decide whether to treat empty tags as continuations of the previous non
156   * -empty tag.
157   *
158   * @param continueOnEmptyTag  true if empty tags should be replaced, false
159   *        otherwise
160   */
161  public void setContinueOnEmptyTag(boolean continueOnEmptyTag) {
162    this.continueOnEmptyTag = continueOnEmptyTag;
163  }
164  
165  /**
166   * Report whether empty tags will be treated as continuations of the last non
167   * -empty tag.
168   *
169   * @return true if empty tags will be replaced, false otherwise
170   */
171  public boolean getContinueOnEmptyTag() {
172    return continueOnEmptyTag;
173  }
174  
175  /**
176   * Decide if multiple examples of a single tag should be merged into a single
177   * start/endTag pair with multiple values, or multiple start/endTag pairs each
178   * with a single value.
179   *
180   * @param mergeSameTag  true if tags will be merged, false otherwise
181   */
182  public void setMergeSameTag(boolean mergeSameTag) {
183    this.mergeSameTag = mergeSameTag;
184  }
185  
186  /**
187   * Report whether empty tags will be treated as continuations of the last non
188   * -empty tag.
189   *
190   * @return true if tags will be merged, false otherwise
191   */
192  public boolean getMergeSameTag() {
193    return mergeSameTag;
194  }
195  
196  public TagValue parse(Object o)
197  throws ParserException {
198    String line = o.toString();
199    
200    // Use of the special value for the EOR marker allows a blank line
201    // to be used to delimit records. Many file formats are like this.
202    if (endOfRecord != null) {
203        if (endOfRecord == TagValueParser.EMPTY_LINE_EOR) {
204            if (line.equals(TagValueParser.EMPTY_LINE_EOR)) {
205                return null;
206            }
207        }
208        else
209        {
210            if (line.startsWith(endOfRecord)) {
211                return null;
212            }
213        }
214    }
215    
216    Matcher matcher = pattern.matcher(line);
217    if(!matcher.find()) {
218      throw new ParserException("Could not match " + pattern.pattern() + " to " + line);
219    }
220    String tag = matcher.group(tagGroup);
221    if(trimTag) {
222      tag = tag.trim();
223    }
224    
225    String value = matcher.group(valueGroup);
226    if(trimValue) {
227      value = value.trim();
228    }
229    
230    if(continueOnEmptyTag && (tag.length() == 0)) {
231      return new TagValue(this.tag, value, false);
232    } else if(mergeSameTag && tag.equals(this.tag)) {
233      return new TagValue(tag, value, false);
234    } else {
235      return new TagValue(this.tag = tag, value, true);
236    }
237  }
238}