001package org.biojava.bio.program.tagvalue; 002 003import java.util.regex.Matcher; 004import java.util.regex.Pattern; 005 006import org.biojava.bio.AnnotationType; 007import org.biojava.bio.CardinalityConstraint; 008import org.biojava.bio.CollectionConstraint; 009import org.biojava.bio.PropertyConstraint; 010import org.biojava.utils.ParserException; 011 012/** 013 * This is intended as a repository for tag-value and AnnotationType information 014 * about common file formats. Each format should have an annotaiton type 015 * defined as <FormatName>_TYPE and a method 016 * create<FormatName>ParserListener(ParserListener listener) that together 017 * give you everything needed to parse and represent the format. 018 * 019 * @author Matthew Pocock 020 */ 021public class Formats { 022 public static final AnnotationType EMBL_TYPE; 023 public static final AnnotationType EMBL_GENBANK_FEATURE_TABLE_TYPE; 024 public static final AnnotationType SWISSPROT_TYPE; 025 026 static { 027 PropertyConstraint prop_string = new PropertyConstraint.ByClass(String.class); 028 CollectionConstraint prop_stringList = new CollectionConstraint.AllValuesIn( 029 prop_string, 030 CardinalityConstraint.ANY 031 ); 032 033 // feature table strucure - shared by embl & genbank 034 EMBL_GENBANK_FEATURE_TABLE_TYPE = new AnnotationType.Impl(); 035 EMBL_GENBANK_FEATURE_TABLE_TYPE.setDefaultConstraint(prop_stringList); 036 PropertyConstraint prop_featureTable = new PropertyConstraint.ByAnnotationType(EMBL_GENBANK_FEATURE_TABLE_TYPE); 037 038 // embl top-level 039 EMBL_TYPE = new AnnotationType.Impl(); 040 EMBL_TYPE.setDefaultConstraint(prop_stringList); 041 EMBL_TYPE.setConstraints("FT", prop_featureTable, CardinalityConstraint.ZERO_OR_ONE); 042 043 // swissprot top-level 044 SWISSPROT_TYPE = new AnnotationType.Impl(); 045 SWISSPROT_TYPE.setDefaultConstraint(prop_stringList); 046 } 047 048 public static final ParserListener createEmblParserListener(TagValueListener listener) { 049 RegexSplitter semiColonSplitter = new RegexSplitter( 050 Pattern.compile("(\\w+)[;.]"), 051 1 052 ); 053 ValueChanger semiColonChanger = new ValueChanger(listener); 054 semiColonChanger.setDefaultSplitter(semiColonSplitter); 055 056 057 LineSplitParser lsp = LineSplitParser.EMBL; 058 059 TagDelegator td = new TagDelegator(listener); 060 061 LineSplitParser ftParser = new LineSplitParser(); 062 ftParser.setSplitOffset(15); 063 ftParser.setTrimTag(true); 064 ftParser.setTrimValue(true); 065 ftParser.setContinueOnEmptyTag(true); 066 ftParser.setMergeSameTag(false); 067 068 TagValueListener ftListener = new FeatureTableListener(listener); 069 070 td.setParserListener("FT", ftParser, ftListener); 071 td.setListener("ID", new RegexFieldFinder( 072 listener, 073 Pattern.compile("(\\w+)\\s+(\\w+);\\s+(.*?);\\s+(\\w+);\\s+(\\d+)\\s+BP\\."), 074 new String[] { "ID", "TYPE", "MOLECULE", "DIVISION", "SIZE" }, 075 true 076 )); 077 td.setListener("AC", semiColonChanger); 078 td.setListener("KW", semiColonChanger); 079 td.setListener("OC", semiColonChanger); 080 081 return new ParserListener(lsp, td); 082 } 083 084 public static final ParserListener createSwissprotParserListener(TagValueListener listener) { 085 RegexSplitter semiColonSplitter = new RegexSplitter( 086 Pattern.compile("(\\w+)[;.]"), 087 1 088 ); 089 ValueChanger semiColonChanger = new ValueChanger(listener); 090 semiColonChanger.setDefaultSplitter(semiColonSplitter); 091 092 LineSplitParser ftParser = new LineSplitParser(); 093 ftParser.setSplitOffset(29); 094 ftParser.setTrimTag(true); 095 ftParser.setTrimValue(true); 096 ftParser.setContinueOnEmptyTag(true); 097 ftParser.setMergeSameTag(false); 098 099 TagValueListener ftListener = new SPFeatureTableListener(listener); 100 101 LineSplitParser lsp = LineSplitParser.EMBL; 102 TagDelegator td = new TagDelegator(listener); 103 104 td.setListener("ID", new RegexFieldFinder( 105 listener, 106 Pattern.compile("(\\w+)\\s+(\\w+);\\s+(\\w+);\\s+(\\d+)"), 107 new String[] { "ID", "TYPE", "MOLECULE", "LENGTH" }, 108 true 109 )); 110 td.setListener("AC", semiColonChanger); 111 td.setListener("KW", semiColonChanger); 112 td.setListener("OC", semiColonChanger); 113 td.setListener("RC", semiColonChanger); 114 td.setListener("RX", semiColonChanger); 115 td.setParserListener("FT", ftParser, ftListener); 116 117 return new ParserListener(lsp, td); 118 } 119 120 private static class FeatureTableListener 121 extends SimpleTagValueWrapper { 122 private TagValueParser featurePropertyParser = new FeaturePropertyParser(); 123 private int depth = 0; 124 125 private boolean inLocation; 126 127 public FeatureTableListener() { 128 super(); 129 } 130 131 public FeatureTableListener(TagValueListener delegate) { 132 super(delegate); 133 } 134 135 public void startRecord() 136 throws ParserException { 137 inLocation = false; 138 139 super.startRecord(); 140 } 141 142 public void endRecord() 143 throws ParserException { 144 if(inLocation) { 145 super.endTag(); 146 } 147 148 super.endRecord(); 149 } 150 151 public void startTag(Object tag) 152 throws ParserException { 153 super.startTag(tag); 154 155 if(depth == 0) { 156 super.startRecord(); 157 } 158 159 depth++; 160 } 161 162 public void endTag() 163 throws ParserException { 164 depth--; 165 166 if(depth == 0) { 167 super.endRecord(); 168 } 169 170 super.endTag(); 171 } 172 173 public void value(TagValueContext tvc, Object value) 174 throws ParserException { 175 String line = (String) value; 176 if(line.startsWith("/")) { 177 if(inLocation) { 178 super.endTag(); 179 inLocation = false; 180 } 181 tvc.pushParser(featurePropertyParser, new TopRecordDropper(getDelegate())); 182 } else { 183 if(!inLocation) { 184 super.startTag("LOCATION"); 185 inLocation = true; 186 } 187 super.value(tvc, value); 188 } 189 } 190 } 191 192 private static class FeaturePropertyParser 193 implements TagValueParser { 194 public TagValue parse(Object value) 195 throws ParserException { 196 String line = (String) value; 197 if(line.startsWith("/")) { 198 int eq = line.indexOf("="); 199 if(eq < 0) { 200 return new TagValue(line.substring(1), "", true); 201 } else { 202 String ourTag = line.substring(1, eq); 203 String ourValue = line.substring(eq + 1); 204 return new TagValue(ourTag, ourValue, true); 205 } 206 } else { 207 return new TagValue(null, value, false); 208 } 209 } 210 } 211 212 private static class TopRecordDropper 213 extends SimpleTagValueWrapper { 214 private int depth = 0; 215 216 public TopRecordDropper(TagValueListener delegate) { 217 super(delegate); 218 } 219 220 public void startRecord() 221 throws ParserException { 222 if(depth > 0) { 223 super.startRecord(); 224 } 225 226 depth++; 227 } 228 229 public void endRecord() 230 throws ParserException { 231 depth--; 232 233 if(depth > 0) { 234 super.endRecord(); 235 } 236 } 237 } 238 239 private static class SPFeatureTableListener 240 extends SimpleTagValueWrapper { 241 private Pattern pat = Pattern.compile("(\\w+)\\s+(\\d+)\\s+(\\d+)"); 242 private int depth = 0; 243 private Object tag; 244 245 public SPFeatureTableListener(TagValueListener delegate) { 246 super(delegate); 247 } 248 249 public void startRecord() 250 throws ParserException { 251 depth++; 252 super.startRecord(); 253 } 254 255 public void endRecord() 256 throws ParserException { 257 super.endRecord(); 258 depth--; 259 } 260 261 public void startTag(Object tag) 262 throws ParserException { 263 if(depth == 1) { 264 this.tag = tag; 265 } else { 266 super.startTag(tag); 267 } 268 } 269 270 public void endTag(Object tag) 271 throws ParserException { 272 if(depth == 1) { 273 // do we need something here? 274 } 275 276 super.endTag(); 277 } 278 279 public void value(TagValueContext ctxt, Object val) 280 throws ParserException { 281 System.out.println(depth + " " + tag + " " + val); 282 if(depth == 1) { 283 if(tag != null) { 284 try { 285 Matcher m = pat.matcher(tag.toString()); 286 m.find(); 287 288 super.startTag("TYPE"); 289 super.value(ctxt, m.group(1)); 290 super.endTag(); 291 292 super.startTag("START"); 293 super.value(ctxt, m.group(2)); 294 super.endTag(); 295 296 super.startTag("END"); 297 super.value(ctxt, m.group(3)); 298 super.endTag(); 299 300 super.startTag("DESCRIPTION"); 301 super.value(ctxt, val); 302 303 tag = null; 304 } catch (IllegalStateException ise) { 305 throw new ParserException("Couldn't match: " + pat.pattern() + " " + tag, ise); 306 } 307 } else { 308 super.value(ctxt, val); 309 } 310 } else { 311 super.value(ctxt, val); 312 } 313 } 314 } 315} 316