001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.program.tagvalue; 023 024 025/** 026 * <p> 027 * A parser that splits a line into tag/value at a given column number. The 028 * GENBANK and EMBL constants are parsers pre-configured for genbank and embl 029 * style files respectively. 030 * </p> 031 * 032 * <p> 033 * There are many properties of the parser that can be set to change how lines 034 * are split, and how the tag and value is produced from that split. 035 * <ul> 036 * <li>endOfRecord - string starting lines that mark record boundaries 037 * e.g. "//"</li> 038 * <li>splitOffset - column index of the first character of the value, and the 039 * length of the raw tag e.g. 5 for EMBL files</li> 040 * <li>trimTag - trim white-space from tags</li> 041 * <li>trimValue - trim white-space from values</li> 042 * <li>continueOnEmptyTag - if the tag is empty, use the previous tag e.g. this 043 * is true for GENBANK files and false for EMBL files</li> 044 * <li>mergeSameTag - if two consecutive tags have the same value, consider 045 * their values to be a continuation of a single value so don't fire start/end 046 * tag events e.g. true for EMBL</li> 047 * </ul> 048 * 049 * @author Matthew Pocock 050 * @author Keith James (enabled empty line EOR) 051 * @since 1.2 052 */ 053public class LineSplitParser 054 implements 055 TagValueParser, 056 Cloneable 057{ 058 /** 059 * A LineSplitParser pre-configured to process EMBL-style flat files. 060 */ 061 public static final LineSplitParser EMBL; 062 063 /** 064 * A LineSplitParser pre-configured to process GENBANK-style flat files. 065 */ 066 public static final LineSplitParser GENBANK; 067 068 static { 069 EMBL = new LineSplitParser(); 070 EMBL.setEndOfRecord("//"); 071 EMBL.setSplitOffset(5); 072 EMBL.setTrimTag(true); 073 EMBL.setTrimValue(false); 074 EMBL.setContinueOnEmptyTag(false); 075 EMBL.setMergeSameTag(true); 076 077 GENBANK = new LineSplitParser(); 078 GENBANK.setEndOfRecord("//"); 079 GENBANK.setSplitOffset(12); 080 GENBANK.setTrimTag(true); 081 GENBANK.setTrimValue(false); 082 GENBANK.setContinueOnEmptyTag(true); 083 GENBANK.setMergeSameTag(false); 084 } 085 086 // properties 087 // 088 089 private String endOfRecord = null; 090 091 private int splitOffset; 092 093 private boolean trimTag; 094 095 private boolean trimValue; 096 097 private boolean continueOnEmptyTag; 098 099 private boolean mergeSameTag; 100 101 // state 102 // 103 104 private String tag; 105 106 public LineSplitParser() {} 107 108 public LineSplitParser(LineSplitParser parser) { 109 this.endOfRecord = parser.endOfRecord; 110 this.splitOffset = parser.splitOffset; 111 this.trimTag = parser.trimTag; 112 this.trimValue = parser.trimValue; 113 this.continueOnEmptyTag = parser.continueOnEmptyTag; 114 this.mergeSameTag = parser.mergeSameTag; 115 } 116 117 /** 118 * Set the string indicating that a record has ended. 119 * 120 * @param endOfRecord the new String delimiting records 121 */ 122 public void setEndOfRecord(String endOfRecord) { 123 this.endOfRecord = endOfRecord; 124 } 125 126 /** 127 * Get the current string indicating that a record has ended. 128 * 129 * @return the current string delimiting records. 130 */ 131 public String getEndOfRecord() { 132 return endOfRecord; 133 } 134 135 /** 136 * Set the offset to split lines at. 137 * 138 * @param splitOffset the new offset to split at 139 */ 140 public void setSplitOffset(int splitOffset) { 141 this.splitOffset = splitOffset; 142 } 143 144 /** 145 * Get the current offset at which lines are split. 146 * 147 * @return the offset to split at 148 */ 149 public int getSplitOffset() { 150 return splitOffset; 151 } 152 153 /** 154 * Enable or disable trimming of tags. 155 * 156 * @param trimTag true if tags should be trimmed, otherwise false 157 */ 158 public void setTrimTag(boolean trimTag) { 159 this.trimTag = trimTag; 160 } 161 162 /** 163 * See if tag trimming is enabled. 164 * 165 * @return true if tags are trimmed, otherwise false 166 */ 167 public boolean getTrimTag() { 168 return trimTag; 169 } 170 171 /** 172 * Enable or disable trimming of values. 173 * 174 * @param trimValue true if values should be trimmed, otherwise false 175 */ 176 public void setTrimValue(boolean trimValue) { 177 this.trimValue = trimValue; 178 } 179 180 /** 181 * See if value trimming is enabled. 182 * 183 * @return true if values are trimmed, otherwise false 184 */ 185 public boolean getTrimValue() { 186 return trimValue; 187 } 188 189 /** 190 * Choose whether to treat empty tags as a continuation of previous tags or as a 191 * new tag with the value of the empty string. 192 * 193 * @param continueOnEmptyTag true to enable empty tags to be treated as a 194 * continuation of the previous tag, false otherwise 195 */ 196 public void setContinueOnEmptyTag(boolean continueOnEmptyTag) { 197 this.continueOnEmptyTag = continueOnEmptyTag; 198 } 199 200 /** 201 * See if empty tags are treated as a continuation of previous tags or as a 202 * new tag with the value of the empty string. 203 * 204 * @return true if continuation is enabled, false otherwise 205 */ 206 public boolean getContinueOnEmptyTag() { 207 return continueOnEmptyTag; 208 } 209 210 /** 211 * Enable or disable treating runs of identical tags as a single tag start 212 * event with multiple values or each as a separate tag start, value, and tag 213 * end. 214 * 215 * @param mergeSameTag true if tags should be merged, false otherwise 216 */ 217 public void setMergeSameTag(boolean mergeSameTag) { 218 this.mergeSameTag = mergeSameTag; 219 } 220 221 /** 222 * See if tags are being merged. 223 * 224 * @return true if merging is enabled, false otherwise 225 */ 226 public boolean getMergeSameTag() { 227 return mergeSameTag; 228 } 229 230 public TagValue parse(Object o) { 231 String line = o.toString(); 232 233 // Use of the special value for the EOR marker allows a blank line 234 // to be used to delimit records. Many file formats are like this. 235 if (endOfRecord != null) { 236 if (endOfRecord == TagValueParser.EMPTY_LINE_EOR) { 237 if (line.equals(TagValueParser.EMPTY_LINE_EOR)) { 238 return null; 239 } 240 } 241 else 242 { 243 if (line.startsWith(endOfRecord)) { 244 return null; 245 } 246 } 247 } 248 249 int length = line.length(); 250 251 String tag; 252 if(length > splitOffset) { 253 tag = line.substring(0, splitOffset); 254 } else { 255 tag = line; 256 } 257 if(trimTag) { 258 tag = tag.trim(); 259 } 260 261 String value; 262 if(length > splitOffset) { 263 value = line.substring(splitOffset); 264 } else { 265 value = ""; 266 } 267 if(trimValue) { 268 value = value.trim(); 269 } 270 271 if(continueOnEmptyTag && (tag.length() == 0)) { 272 return new TagValue(this.tag, value, false); 273 } else if(mergeSameTag && tag.equals(this.tag)) { 274 return new TagValue(tag, value, false); 275 } else { 276 return new TagValue(this.tag = tag, value, true); 277 } 278 } 279 280 public Object clone() 281 throws CloneNotSupportedException { 282 return super.clone(); 283 } 284}