/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
021
022package org.biojava.bio.program.tagvalue;
023
024
025/**
026 * <p>
027 * A parser that splits a line into tag/value at a given column number. The
028 * GENBANK and EMBL constants are parsers pre-configured for genbank and embl
029 * style files respectively.
030 * </p>
031 *
032 * <p>
033 * There are many properties of the parser that can be set to change how lines
034 * are split, and how the tag and value is produced from that split.
035 * <ul>
036 * <li>endOfRecord - string starting lines that mark record boundaries
037 * e.g. "//"</li>
038 * <li>splitOffset - column index of the first character of the value, and the
039 * length of the raw tag e.g. 5 for EMBL files</li>
040 * <li>trimTag - trim white-space from tags</li>
041 * <li>trimValue - trim white-space from values</li>
042 * <li>continueOnEmptyTag - if the tag is empty, use the previous tag e.g. this
043 * is true for GENBANK files and false for EMBL files</li>
044 * <li>mergeSameTag - if two consecutive tags have the same value, consider
045 * their values to be a continuation of a single value so don't fire start/end
046 * tag events e.g. true for EMBL</li>
047 * </ul>
048 *
049 * @author Matthew Pocock
050 * @author Keith James (enabled empty line EOR)
051 * @since 1.2
052 */
053public class LineSplitParser
054  implements
055    TagValueParser,
056    Cloneable
057{
058  /**
059   * A LineSplitParser pre-configured to process EMBL-style flat files.
060   */
061  public static final LineSplitParser EMBL;
062
063  /**
064   * A LineSplitParser pre-configured to process GENBANK-style flat files.
065   */
066  public static final LineSplitParser GENBANK;
067  
068  static {
069    EMBL = new LineSplitParser();
070    EMBL.setEndOfRecord("//");
071    EMBL.setSplitOffset(5);
072    EMBL.setTrimTag(true);
073    EMBL.setTrimValue(false);
074    EMBL.setContinueOnEmptyTag(false);
075    EMBL.setMergeSameTag(true);
076    
077    GENBANK = new LineSplitParser();
078    GENBANK.setEndOfRecord("//");
079    GENBANK.setSplitOffset(12);
080    GENBANK.setTrimTag(true);
081    GENBANK.setTrimValue(false);
082    GENBANK.setContinueOnEmptyTag(true);
083    GENBANK.setMergeSameTag(false);
084  }
085
086  // properties
087  //
088  
089  private String endOfRecord = null;
090  
091  private int splitOffset;
092  
093  private boolean trimTag;
094  
095  private boolean trimValue;
096  
097  private boolean continueOnEmptyTag;
098  
099  private boolean mergeSameTag;
100
101  // state
102  //
103  
104  private String tag;
105  
106  public LineSplitParser() {}
107
108  public LineSplitParser(LineSplitParser parser) {
109    this.endOfRecord = parser.endOfRecord;
110    this.splitOffset = parser.splitOffset;
111    this.trimTag = parser.trimTag;
112    this.trimValue = parser.trimValue;
113    this.continueOnEmptyTag = parser.continueOnEmptyTag;
114    this.mergeSameTag = parser.mergeSameTag;
115  }
116  
117  /**
118   * Set the string indicating that a record has ended.
119   *
120   * @param endOfRecord the new String delimiting records
121   */
122  public void setEndOfRecord(String endOfRecord) {
123    this.endOfRecord = endOfRecord;
124  }
125  
126  /**
127   * Get the current string indicating that a record has ended.
128   *
129   * @return the current string delimiting records.
130   */
131  public String getEndOfRecord() {
132    return endOfRecord;
133  }
134  
135  /**
136   * Set the offset to split lines at.
137   *
138   * @param splitOffset the new offset to split at
139   */
140  public void setSplitOffset(int splitOffset) {
141    this.splitOffset = splitOffset;
142  }
143  
144  /**
145   * Get the current offset at which lines are split.
146   *
147   * @return the offset to split at
148   */
149  public int getSplitOffset() {
150    return splitOffset;
151  }
152  
153  /**
154   * Enable or disable trimming of tags.
155   *
156   * @param trimTag  true if tags should be trimmed, otherwise false
157   */
158  public void setTrimTag(boolean trimTag) {
159    this.trimTag = trimTag;
160  }
161  
162  /**
163   * See if tag trimming is enabled.
164   *
165   * @return true if tags are trimmed, otherwise false
166   */
167  public boolean getTrimTag() {
168    return trimTag;
169  }
170  
171  /**
172   * Enable or disable trimming of values.
173   *
174   * @param trimValue  true if values should be trimmed, otherwise false
175   */
176  public void setTrimValue(boolean trimValue) {
177    this.trimValue = trimValue;
178  }
179  
180  /**
181   * See if value trimming is enabled.
182   *
183   * @return true if values are trimmed, otherwise false
184   */
185  public boolean getTrimValue() {
186    return trimValue;
187  }
188  
189  /**
190   * Choose whether to treat empty tags as a continuation of previous tags or as a
191   * new tag with the value of the empty string.
192   *
193   * @param continueOnEmptyTag true to enable empty tags to be treated as a
194   *        continuation of the previous tag, false otherwise
195   */
196  public void setContinueOnEmptyTag(boolean continueOnEmptyTag) {
197    this.continueOnEmptyTag = continueOnEmptyTag;
198  }
199  
200  /**
201   * See if empty tags are treated as a continuation of previous tags or as a
202   * new tag with the value of the empty string.
203   *
204   * @return true if continuation is enabled, false otherwise
205   */
206  public boolean getContinueOnEmptyTag() {
207    return continueOnEmptyTag;
208  }
209  
210  /**
211   * Enable or disable treating runs of identical tags as a single tag start
212   * event with multiple values or each as a separate tag start, value, and tag
213   * end.
214   *
215   * @param mergeSameTag true if tags should be merged, false otherwise
216   */
217  public void setMergeSameTag(boolean mergeSameTag) {
218    this.mergeSameTag = mergeSameTag;
219  }
220  
221  /**
222   * See if tags are being merged.
223   *
224   * @return true if merging is enabled, false otherwise
225   */
226  public boolean getMergeSameTag() {
227    return mergeSameTag;
228  }
229  
230  public TagValue parse(Object o) {
231    String line = o.toString();
232
233    // Use of the special value for the EOR marker allows a blank line
234    // to be used to delimit records. Many file formats are like this.
235    if (endOfRecord != null) {
236        if (endOfRecord == TagValueParser.EMPTY_LINE_EOR) {
237            if (line.equals(TagValueParser.EMPTY_LINE_EOR)) {
238                return null;
239            }
240        }
241        else
242        {
243            if (line.startsWith(endOfRecord)) {
244                return null;
245            }
246        }
247    }
248    
249    int length = line.length();
250    
251    String tag;
252    if(length > splitOffset) {
253      tag = line.substring(0, splitOffset);
254    } else {
255      tag = line;
256    }
257    if(trimTag) {
258      tag = tag.trim();
259    }
260    
261    String value;
262    if(length > splitOffset) {
263      value = line.substring(splitOffset);
264    } else {
265      value = "";
266    }
267    if(trimValue) {
268      value = value.trim();
269    }
270    
271    if(continueOnEmptyTag && (tag.length() == 0)) {
272      return new TagValue(this.tag, value, false);
273    } else if(mergeSameTag && tag.equals(this.tag)) {
274      return new TagValue(tag, value, false);
275    } else {
276      return new TagValue(this.tag = tag, value, true);
277    }
278  }
279  
280  public Object clone()
281  throws CloneNotSupportedException {
282    return super.clone();
283  }
284}