001package org.biojava.bio.program.formats;
002
003import java.util.regex.Matcher;
004import java.util.regex.Pattern;
005
006import org.biojava.bio.AnnotationType;
007import org.biojava.bio.CardinalityConstraint;
008import org.biojava.bio.PropertyConstraint;
009import org.biojava.bio.program.tagvalue.LineSplitParser;
010import org.biojava.bio.program.tagvalue.ParserListener;
011import org.biojava.bio.program.tagvalue.RegexFieldFinder;
012import org.biojava.bio.program.tagvalue.RegexSplitter;
013import org.biojava.bio.program.tagvalue.SimpleTagValueWrapper;
014import org.biojava.bio.program.tagvalue.TagDelegator;
015import org.biojava.bio.program.tagvalue.TagValueContext;
016import org.biojava.bio.program.tagvalue.TagValueListener;
017import org.biojava.bio.program.tagvalue.ValueChanger;
018import org.biojava.bio.symbol.Location;
019import org.biojava.utils.ParserException;
020import org.biojava.utils.lsid.LifeScienceIdentifier;
021
022public class Swissprot
023implements Format {
024  private static final AnnotationType ANNO_TYPE;
025  //private static final LineSplitParser PARSER;
026  private static final LifeScienceIdentifier LSID;
027
028  static {
029    LSID = LifeScienceIdentifier.valueOf("open-bio.org", "format", "swissprot");
030
031    Location NONE = CardinalityConstraint.NONE;
032    Location ANY = CardinalityConstraint.ANY;
033    Location ONE = CardinalityConstraint.ONE;
034    Location ONE_OR_MORE = CardinalityConstraint.ONE_OR_MORE;
035
036    //PARSER = new LineSplitParser(LineSplitParser.EMBL);
037
038    PropertyConstraint c_string = new PropertyConstraint.ByClass(String.class);
039
040    AnnotationType FT = new AnnotationType.Impl();
041    FT.setDefaultConstraints(PropertyConstraint.ANY, ANY); // fix this
042    PropertyConstraint c_ft = new PropertyConstraint.ByAnnotationType(FT);
043
044    ANNO_TYPE = new AnnotationType.Impl();
045    ANNO_TYPE.setDefaultConstraints(PropertyConstraint.NONE, NONE);
046    ANNO_TYPE.setConstraints("ID", c_string, ONE);
047    ANNO_TYPE.setConstraints("TYPE", c_string, ONE);
048    ANNO_TYPE.setConstraints("MOLECULE", c_string, ONE);
049    ANNO_TYPE.setConstraints("LENGTH", c_string, ONE);
050    ANNO_TYPE.setConstraints("AC", c_string, ONE_OR_MORE);
051    ANNO_TYPE.setConstraints("DT", c_string, ANY);
052    ANNO_TYPE.setConstraints("KW", c_string, ANY);
053    ANNO_TYPE.setConstraints("OS", c_string, ONE);
054    ANNO_TYPE.setConstraints("OC", c_string, ANY);
055    ANNO_TYPE.setConstraints("DE", c_string, ANY);
056    ANNO_TYPE.setConstraints("GN", c_string, ANY);
057    ANNO_TYPE.setConstraints("OS", c_string, ANY);
058    ANNO_TYPE.setConstraints("OG", c_string, ANY);
059    ANNO_TYPE.setConstraints("OC", c_string, ANY);
060    ANNO_TYPE.setConstraints("OX", c_string, ANY);
061    ANNO_TYPE.setConstraints("RN", c_string, ANY);
062    ANNO_TYPE.setConstraints("RP", c_string, ANY);
063    ANNO_TYPE.setConstraints("RC", c_string, ANY);
064    ANNO_TYPE.setConstraints("RX", c_string, ANY);
065    ANNO_TYPE.setConstraints("RA", c_string, ANY);
066    ANNO_TYPE.setConstraints("RT", c_string, ANY);
067    ANNO_TYPE.setConstraints("RL", c_string, ANY);
068    ANNO_TYPE.setConstraints("CC", c_string, ANY);
069    ANNO_TYPE.setConstraints("DR", c_string, ANY);
070    ANNO_TYPE.setConstraints("KW", c_string, ANY);
071    ANNO_TYPE.setConstraints("FT", c_ft, ANY);
072    ANNO_TYPE.setConstraints("SQ", c_string, ANY);
073    ANNO_TYPE.setConstraints("", c_string, ANY);
074  }
075
076  public ParserListener getParserListener(TagValueListener listener) {
077    RegexSplitter semiColonSplitter = new RegexSplitter(
078      Pattern.compile("(\\w+)[;.]"),
079      1
080    );
081    ValueChanger semiColonChanger = new ValueChanger(listener);
082    semiColonChanger.setDefaultSplitter(semiColonSplitter);
083
084    LineSplitParser ftParser = new LineSplitParser();
085    ftParser.setSplitOffset(29);
086    ftParser.setTrimTag(true);
087    ftParser.setTrimValue(true);
088    ftParser.setContinueOnEmptyTag(true);
089    ftParser.setMergeSameTag(false);
090
091    TagValueListener ftListener = new SPFeatureTableListener(listener);
092
093    LineSplitParser lsp = LineSplitParser.EMBL;
094    TagDelegator td = new TagDelegator(listener);
095
096    td.setListener("ID", new RegexFieldFinder(
097      listener,
098      Pattern.compile("(\\w+)\\s+(\\w+);\\s+(\\w+);\\s+(\\d+)"),
099      new String[] { "ID", "TYPE", "MOLECULE", "LENGTH" },
100      true
101    ));
102    td.setListener("AC", semiColonChanger);
103    td.setListener("KW", semiColonChanger);
104    td.setListener("OC", semiColonChanger);
105    td.setListener("RC", semiColonChanger);
106    td.setListener("RX", semiColonChanger);
107    td.setParserListener("FT", ftParser, ftListener);
108
109    return new ParserListener(lsp, td);
110  }
111
112
113  public AnnotationType getType() {
114    return ANNO_TYPE;
115  }
116
117  public LifeScienceIdentifier getLSID() {
118    return LSID;
119  }
120
121  private static class SPFeatureTableListener
122  extends SimpleTagValueWrapper {
123    private Pattern pat = Pattern.compile("(\\w+)\\s+((<?\\d+)|(?))\\s+((>?\\d+)|(\\?))");
124    private int depth = 0;
125    private Object tag;
126
127    public SPFeatureTableListener(TagValueListener delegate) {
128      super(delegate);
129    }
130
131    public void startRecord()
132    throws ParserException {
133      depth++;
134      super.startRecord();
135    }
136
137    public void endRecord()
138    throws ParserException {
139      super.endRecord();
140      depth--;
141    }
142
143    public void startTag(Object tag)
144    throws ParserException {
145      if(depth == 1) {
146        this.tag = tag;
147      } else {
148        super.startTag(tag);
149      }
150    }
151
152    public void endTag(Object tag)
153    throws ParserException {
154      if(depth == 1) {
155        // do we need something here?
156      }
157
158      super.endTag();
159    }
160
161    public void value(TagValueContext ctxt, Object val)
162    throws ParserException {
163      if(depth == 1) {
164        if(tag != null) {
165          try {
166            Matcher m = pat.matcher(tag.toString());
167            m.find();
168
169            super.startTag("TYPE");
170            super.value(ctxt, m.group(1));
171            super.endTag();
172
173            super.startTag("START");
174            super.value(ctxt, m.group(2));
175            super.endTag();
176
177            super.startTag("END");
178            super.value(ctxt, m.group(3));
179            super.endTag();
180
181            super.startTag("DESCRIPTION");
182            super.value(ctxt, val);
183
184            tag = null;
185          } catch (IllegalStateException ise) {
186            throw new ParserException("Couldn't match: " + pat.pattern() + " " + tag, ise);
187          }
188        } else {
189          super.value(ctxt, val);
190        }
191      } else {
192        super.value(ctxt, val);
193      }
194    }
195  }
196}
197