001package org.biojava.bio.program.tagvalue;
002
003import java.util.regex.Matcher;
004import java.util.regex.Pattern;
005
006import org.biojava.bio.AnnotationType;
007import org.biojava.bio.CardinalityConstraint;
008import org.biojava.bio.CollectionConstraint;
009import org.biojava.bio.PropertyConstraint;
010import org.biojava.utils.ParserException;
011
012/**
013 * This is intended as a repository for tag-value and AnnotationType information
014 * about common file formats. Each format should have an annotaiton type
015 * defined as <FormatName>_TYPE and a method
016 * create<FormatName>ParserListener(ParserListener listener) that together
017 * give you everything needed to parse and represent the format.
018 *
019 * @author Matthew Pocock
020 */
021public class Formats {
022  public static final AnnotationType EMBL_TYPE;
023  public static final AnnotationType EMBL_GENBANK_FEATURE_TABLE_TYPE;
024  public static final AnnotationType SWISSPROT_TYPE;
025
026  static {
027    PropertyConstraint prop_string = new PropertyConstraint.ByClass(String.class);
028    CollectionConstraint prop_stringList = new CollectionConstraint.AllValuesIn(
029      prop_string,
030      CardinalityConstraint.ANY
031    );
032
033    // feature table strucure - shared by embl & genbank
034    EMBL_GENBANK_FEATURE_TABLE_TYPE = new AnnotationType.Impl();
035    EMBL_GENBANK_FEATURE_TABLE_TYPE.setDefaultConstraint(prop_stringList);
036    PropertyConstraint prop_featureTable = new PropertyConstraint.ByAnnotationType(EMBL_GENBANK_FEATURE_TABLE_TYPE);
037
038    // embl top-level
039    EMBL_TYPE = new AnnotationType.Impl();
040    EMBL_TYPE.setDefaultConstraint(prop_stringList);
041    EMBL_TYPE.setConstraints("FT", prop_featureTable, CardinalityConstraint.ZERO_OR_ONE);
042
043    // swissprot top-level
044    SWISSPROT_TYPE = new AnnotationType.Impl();
045    SWISSPROT_TYPE.setDefaultConstraint(prop_stringList);
046  }
047
048  public static final ParserListener createEmblParserListener(TagValueListener listener) {
049    RegexSplitter semiColonSplitter = new RegexSplitter(
050      Pattern.compile("(\\w+)[;.]"),
051      1
052    );
053    ValueChanger semiColonChanger = new ValueChanger(listener);
054    semiColonChanger.setDefaultSplitter(semiColonSplitter);
055
056
057    LineSplitParser lsp = LineSplitParser.EMBL;
058
059    TagDelegator td = new TagDelegator(listener);
060
061    LineSplitParser ftParser = new LineSplitParser();
062    ftParser.setSplitOffset(15);
063    ftParser.setTrimTag(true);
064    ftParser.setTrimValue(true);
065    ftParser.setContinueOnEmptyTag(true);
066    ftParser.setMergeSameTag(false);
067
068    TagValueListener ftListener = new FeatureTableListener(listener);
069
070    td.setParserListener("FT", ftParser, ftListener);
071    td.setListener("ID", new RegexFieldFinder(
072      listener,
073      Pattern.compile("(\\w+)\\s+(\\w+);\\s+(.*?);\\s+(\\w+);\\s+(\\d+)\\s+BP\\."),
074      new String[] { "ID", "TYPE", "MOLECULE", "DIVISION", "SIZE" },
075      true
076    ));
077    td.setListener("AC", semiColonChanger);
078    td.setListener("KW", semiColonChanger);
079    td.setListener("OC", semiColonChanger);
080
081    return new ParserListener(lsp, td);
082  }
083
084  public static final ParserListener createSwissprotParserListener(TagValueListener listener) {
085    RegexSplitter semiColonSplitter = new RegexSplitter(
086      Pattern.compile("(\\w+)[;.]"),
087      1
088    );
089    ValueChanger semiColonChanger = new ValueChanger(listener);
090    semiColonChanger.setDefaultSplitter(semiColonSplitter);
091
092    LineSplitParser ftParser = new LineSplitParser();
093    ftParser.setSplitOffset(29);
094    ftParser.setTrimTag(true);
095    ftParser.setTrimValue(true);
096    ftParser.setContinueOnEmptyTag(true);
097    ftParser.setMergeSameTag(false);
098
099    TagValueListener ftListener = new SPFeatureTableListener(listener);
100
101    LineSplitParser lsp = LineSplitParser.EMBL;
102    TagDelegator td = new TagDelegator(listener);
103
104    td.setListener("ID", new RegexFieldFinder(
105      listener,
106      Pattern.compile("(\\w+)\\s+(\\w+);\\s+(\\w+);\\s+(\\d+)"),
107      new String[] { "ID", "TYPE", "MOLECULE", "LENGTH" },
108      true
109    ));
110    td.setListener("AC", semiColonChanger);
111    td.setListener("KW", semiColonChanger);
112    td.setListener("OC", semiColonChanger);
113    td.setListener("RC", semiColonChanger);
114    td.setListener("RX", semiColonChanger);
115    td.setParserListener("FT", ftParser, ftListener);
116
117    return new ParserListener(lsp, td);
118  }
119
120  private static class FeatureTableListener
121  extends SimpleTagValueWrapper {
122    private TagValueParser featurePropertyParser = new FeaturePropertyParser();
123    private int depth = 0;
124    
125    private boolean inLocation;
126
127    public FeatureTableListener() {
128      super();
129    }
130
131    public FeatureTableListener(TagValueListener delegate) {
132      super(delegate);
133    }
134
135    public void startRecord()
136    throws ParserException  {
137      inLocation = false;
138
139      super.startRecord();
140    }
141
142    public void endRecord()
143    throws ParserException {
144      if(inLocation) {
145        super.endTag();
146      }
147
148      super.endRecord();
149    }
150
151    public void startTag(Object tag)
152    throws ParserException {
153      super.startTag(tag);
154
155      if(depth == 0) {
156        super.startRecord();
157      }
158
159      depth++;
160    }
161
162    public void endTag()
163    throws ParserException {
164      depth--;
165
166      if(depth == 0) {
167        super.endRecord();
168      }
169
170      super.endTag();
171    }
172
173    public void value(TagValueContext tvc, Object value)
174    throws ParserException {
175      String line = (String) value;
176      if(line.startsWith("/")) {
177        if(inLocation) {
178          super.endTag();
179          inLocation = false;
180        }
181        tvc.pushParser(featurePropertyParser, new TopRecordDropper(getDelegate()));
182      } else {
183        if(!inLocation) {
184          super.startTag("LOCATION");
185          inLocation = true;
186        }
187        super.value(tvc, value);
188      }
189    }
190  }
191
192  private static class FeaturePropertyParser
193  implements TagValueParser {
194    public TagValue parse(Object value)
195    throws ParserException  {
196      String line = (String) value;
197      if(line.startsWith("/")) {
198        int eq = line.indexOf("=");
199        if(eq < 0) {
200          return new TagValue(line.substring(1), "", true);
201        } else {
202          String ourTag = line.substring(1, eq);
203          String ourValue = line.substring(eq + 1);
204          return new TagValue(ourTag, ourValue, true);
205        }
206      } else {
207        return new TagValue(null, value, false);
208      }
209    }
210  }
211
212  private static class TopRecordDropper
213  extends SimpleTagValueWrapper {
214    private int depth = 0;
215
216    public TopRecordDropper(TagValueListener delegate) {
217      super(delegate);
218    }
219
220    public void startRecord()
221    throws ParserException {
222      if(depth > 0) {
223        super.startRecord();
224      }
225
226      depth++;
227    }
228
229    public void endRecord()
230    throws ParserException {
231      depth--;
232
233      if(depth > 0) {
234        super.endRecord();
235      }
236    }
237  }
238
239  private static class SPFeatureTableListener
240  extends SimpleTagValueWrapper {
241    private Pattern pat = Pattern.compile("(\\w+)\\s+(\\d+)\\s+(\\d+)");
242    private int depth = 0;
243    private Object tag;
244
245    public SPFeatureTableListener(TagValueListener delegate) {
246      super(delegate);
247    }
248
249    public void startRecord()
250    throws ParserException {
251      depth++;
252      super.startRecord();
253    }
254
255    public void endRecord()
256    throws ParserException {
257      super.endRecord();
258      depth--;
259    }
260
261    public void startTag(Object tag)
262    throws ParserException {
263      if(depth == 1) {
264        this.tag = tag;
265      } else {
266        super.startTag(tag);
267      }
268    }
269
270    public void endTag(Object tag)
271    throws ParserException {
272      if(depth == 1) {
273        // do we need something here?
274      }
275
276      super.endTag();
277    }
278
279    public void value(TagValueContext ctxt, Object val)
280    throws ParserException {
281      System.out.println(depth + " " + tag + " " + val);
282      if(depth == 1) {
283        if(tag != null) {
284          try {
285            Matcher m = pat.matcher(tag.toString());
286            m.find();
287
288            super.startTag("TYPE");
289            super.value(ctxt, m.group(1));
290            super.endTag();
291
292            super.startTag("START");
293            super.value(ctxt, m.group(2));
294            super.endTag();
295
296            super.startTag("END");
297            super.value(ctxt, m.group(3));
298            super.endTag();
299
300            super.startTag("DESCRIPTION");
301            super.value(ctxt, val);
302
303            tag = null;
304          } catch (IllegalStateException ise) {
305            throw new ParserException("Couldn't match: " + pat.pattern() + " " + tag, ise);
306          }
307        } else {
308          super.value(ctxt, val);
309        }
310      } else {
311        super.value(ctxt, val);
312      }
313    }
314  }
315}
316