001package org.biojava.bio.program.unigene;
002
003import java.io.IOException;
004import java.net.URL;
005import java.util.ArrayList;
006import java.util.HashMap;
007import java.util.Iterator;
008import java.util.List;
009import java.util.Map;
010import java.util.regex.Matcher;
011import java.util.regex.Pattern;
012
013import org.biojava.bio.AnnotationType;
014import org.biojava.bio.BioError;
015import org.biojava.bio.BioException;
016import org.biojava.bio.CardinalityConstraint;
017import org.biojava.bio.PropertyConstraint;
018import org.biojava.bio.program.tagvalue.ChangeTable;
019import org.biojava.bio.program.tagvalue.LineSplitParser;
020import org.biojava.bio.program.tagvalue.ParserListener;
021import org.biojava.bio.program.tagvalue.RegexParser;
022import org.biojava.bio.program.tagvalue.RegexSplitter;
023import org.biojava.bio.program.tagvalue.SimpleTagValueWrapper;
024import org.biojava.bio.program.tagvalue.TagDelegator;
025import org.biojava.bio.program.tagvalue.TagValueContext;
026import org.biojava.bio.program.tagvalue.TagValueListener;
027import org.biojava.bio.program.tagvalue.TagValueParser;
028import org.biojava.bio.program.tagvalue.ValueChanger;
029import org.biojava.utils.ParserException;
030
031/**
 * <p>Useful tools for working with Unigene.</p>
033 *
034 * <p>This class is the main port-of-call for users of the Unigene package. It
035 * provides the core APIs for finding a Unigene database as well as registering
 * your own Unigene drivers. Additionally, it contains methods to return parsers
037 * for each of the main Unigene flat-file types. If you wish to bypass the
038 * biojava object model entirely, you can choose to use these parsers instead.
039 * </p>
040 *
041 * <h2>Example use</h2>
042 *
043 * <p>Creating a Unigene instance from your local Unigene directory (assuming
044 * that you have read/write privileges to the directory)</p>
045 *
046 * <pre>
047 * UnigeneDB unigene = UnigeneTools.createUnigene(
048 *   new URL("file:///usr/local/biodata/unigene") );
049 * </pre>
050 *
051 * <p>Fetch a unigene cluster</p>
052 *
053 * <pre>
054 * UnigeneDB unigene = UnigeneTools.loadUnigene(
055 *   new URL("file:///usr/local/biodata/unigene") );
 * UnigeneCluster cluster = unigene.getCluster("Aga001");
057 * System.out.println("Title: " + cluster.getTitle());
058 * </pre>
059 *
060 * <p>Parse a data file yourself</p>
061 *
062 * <pre>
063 * BufferedReader br = new BufferedReader(new FileReader(unigeneFile));
 * Parser parser = new Parser();
065 * TagValueListener echo = new Echo();
066 * ParserListener pl = UnigeneTools.buildDataParser(echo);
067 *
068 * while(parser.read(br, pl.getParser(), pl.getListener())) {
069 *   // read an entry
070 * }
071 * </pre>
072 *
073 * @author Matthew Pocock
074 */
075public class UnigeneTools {
076  /**
077   * <p>
078   * Annotation schema for all UnigeneCluster instances. This states what
079   * propperties can be expected to be associated with a cluster and how many
080   * values they may have.
081   * </p>
082   */
083  public static final AnnotationType UNIGENE_ANNOTATION;
084  
085  /**
086   * <p>
087   * Annotation schema for all Unigene libraries. This states what propperties
088   * can be expected to be associated with a library and how many values they
089   * may have.
090   * </p>
091   */
092  public static final AnnotationType LIBRARY_ANNOTATION;
093
094  private static final List factories;
095  private static final Map shortName2SpeciesName;
096  
097  static {
098    factories = new ArrayList();
099    registerFactory(new FlatFileUnigeneFactory());
100
101    shortName2SpeciesName = new HashMap();
102    
103    shortName2SpeciesName.put("Aga", "Anophelese gambiae");
104    shortName2SpeciesName.put("Hs", "Homo sapiens");
105    shortName2SpeciesName.put("Aga", "Anopheles gambiae");
106    shortName2SpeciesName.put("Bt", "Bos taurus");
107    shortName2SpeciesName.put("Dm", "Drosophila melanogaster");
108    shortName2SpeciesName.put("Dr", "Danio rario");
109    shortName2SpeciesName.put("Mm", "Mus musculus");
110    shortName2SpeciesName.put("Rn", "Rattus norvegicus");
111    shortName2SpeciesName.put("Xl", "Xenopus laevis");
112    shortName2SpeciesName.put("At", "Arabidopsis thaliana");
113    shortName2SpeciesName.put("Gma", "Glycine max");
114    shortName2SpeciesName.put("Hv", "Hordeum vulgare");
115    shortName2SpeciesName.put("Les", "Lycopersicon esculentum");
116    shortName2SpeciesName.put("Mtr", "Medicago truncatula");
117    shortName2SpeciesName.put("Os", "Oryza sativa");
118    shortName2SpeciesName.put("Ta", "Triticum aestivum");
119    shortName2SpeciesName.put("Zm", "Zea mays");
120    
121    // start to build this annotation type for .data files & UnigeneCluster
122    // annotation bundles
123    PropertyConstraint pc_string = new PropertyConstraint.ByClass(String.class);
124    PropertyConstraint pc_int = new PropertyConstraint.ByClass(Integer.class);
125    
126    AnnotationType.Impl at_sts = new AnnotationType.Impl();
127    at_sts.setConstraints("NAME",   pc_string, CardinalityConstraint.ONE);
128    at_sts.setConstraints("ACC",    pc_string, CardinalityConstraint.ZERO_OR_ONE);
129    at_sts.setConstraints("DSEG",   pc_string, CardinalityConstraint.ZERO_OR_ONE);
130    at_sts.setConstraints("UNISTS", pc_string, CardinalityConstraint.ONE);
131    PropertyConstraint pc_sts = new PropertyConstraint.ByAnnotationType(at_sts);
132    
133    AnnotationType.Impl at_txmap = new AnnotationType.Impl();
134    at_txmap.setConstraints("MARKER", pc_string, CardinalityConstraint.ONE);
135    at_txmap.setConstraints("RHPANEL", pc_string, CardinalityConstraint.ONE);
136    PropertyConstraint pc_txmap = new PropertyConstraint.ByAnnotationType(at_txmap);
137    
138    AnnotationType.Impl at_protsim = new AnnotationType.Impl();
139    at_protsim.setConstraints("ORG", pc_string, CardinalityConstraint.ONE);
140    at_protsim.setConstraints("PROTGI", pc_string, CardinalityConstraint.ONE);
141    at_protsim.setConstraints("PROTID", pc_string, CardinalityConstraint.ONE);
142    at_protsim.setConstraints("PCT", pc_string, CardinalityConstraint.ONE);
143    at_protsim.setConstraints("ALN", pc_int, CardinalityConstraint.ONE);
144    PropertyConstraint pc_prosim = new PropertyConstraint.ByAnnotationType(at_protsim);
145    
146    AnnotationType.Impl at_sequence = new AnnotationType.Impl();
147    at_sequence.setConstraints("ACC", pc_string, CardinalityConstraint.ONE);
148    at_sequence.setConstraints("NID", pc_string, CardinalityConstraint.ONE);
149    at_sequence.setConstraints("PID", pc_string, CardinalityConstraint.ZERO_OR_ONE);
150    at_sequence.setConstraints("CLONE", pc_string, CardinalityConstraint.ZERO_OR_ONE);
151    at_sequence.setConstraints("END", pc_string, CardinalityConstraint.ZERO_OR_ONE);
152    at_sequence.setConstraints("LID", pc_string, CardinalityConstraint.ZERO_OR_ONE);
153    at_sequence.setConstraints("MGC", pc_string, CardinalityConstraint.ZERO_OR_ONE);
154    PropertyConstraint pc_sequence = new PropertyConstraint.ByAnnotationType(at_sequence);
155    
156    AnnotationType.Impl unigene = new AnnotationType.Impl();
157    unigene.setConstraints("ID", pc_string, CardinalityConstraint.ONE);
158    unigene.setConstraints("TITLE", pc_string, CardinalityConstraint.ONE);
159    unigene.setConstraints("GENE", pc_string, CardinalityConstraint.ONE);
160    unigene.setConstraints("CYTOBAND", pc_string, CardinalityConstraint.ONE);
161    unigene.setConstraints("EXPRESS", pc_string, CardinalityConstraint.ONE);
162    unigene.setConstraints(
163      "GNM_TERMINUS",
164      new PropertyConstraint.Enumeration(new Object[] { "T", "I", "S" } ),
165      CardinalityConstraint.ONE);
166    unigene.setConstraints("LOCUSLINK", pc_string, CardinalityConstraint.ONE);
167    unigene.setConstraints("CHROMOSOME", pc_string, CardinalityConstraint.ONE);
168    unigene.setConstraints("STS", pc_sts, CardinalityConstraint.ANY);
169    unigene.setConstraints("TXMAP", pc_txmap, CardinalityConstraint.ANY);
170    unigene.setConstraints("PROSIM", pc_prosim, CardinalityConstraint.ANY);
171    unigene.setConstraints("SCOUNT", pc_int, CardinalityConstraint.ONE);
172    unigene.setConstraints("SEQUENCE", pc_sequence, CardinalityConstraint.ANY);
173    
174    UNIGENE_ANNOTATION = unigene;
175    
176    AnnotationType.Impl library = new AnnotationType.Impl();
177    library.setConstraints("ID", pc_string, CardinalityConstraint.ONE);
178    library.setConstraints("TITLE", pc_string, CardinalityConstraint.ONE);
179    library.setConstraints("TISSUE", pc_string, CardinalityConstraint.ONE);
180    library.setConstraints("VECTOR", pc_string, CardinalityConstraint.ONE);
181    
182    LIBRARY_ANNOTATION = library;
183  }
184  
185  /**
186   * Converts short species names (like Hs) to long species names (like Homo
187   * Sapiens).
188   *
189   * @param name  the short name
190   * @return the long name
191   */
192  public static String getSpeciesForShortName(String name) {
193    return (String) shortName2SpeciesName.get(name);
194  }
195  
196  /**
197   * Generate a tag-value parser for unigene data files that will pass all
198   * parsing events on to your listener.
199   *
200   * @param listener the TagValueListener to pass events onto
201   * @return a ParserListener that is ready to consume unigene data documents
202   */
203  public static ParserListener buildDataParser(TagValueListener listener)
204  throws ParserException {
205    try {
206      LineSplitParser entryParser = (LineSplitParser) LineSplitParser.GENBANK.clone();
207      entryParser.setTrimValue(true);
208      entryParser.setEndOfRecord("//");
209      
210      ChangeTable changeT = new ChangeTable();
211      changeT.setSplitter(
212        "EXPRESS",
213        new RegexSplitter(Pattern.compile("([^;]+)"), 1)
214      );
215      changeT.setChanger("ALN", ChangeTable.STRING_TO_INT);
216      changeT.setChanger("SCOUNT", ChangeTable.STRING_TO_INT);
217      ValueChanger changer = new ValueChanger(listener, changeT);
218      
219      SplitAndProp splitAndProp = new SplitAndProp(
220        listener,
221        Pattern.compile("(\\S+?)=([^;\\s]*)")
222      );
223      TagDelegator entryListener = new TagDelegator(changer);
224      entryListener.setListener("STS", splitAndProp);
225      entryListener.setListener("PROTSIM", splitAndProp);
226      entryListener.setListener("SEQUENCE", splitAndProp);
227      entryListener.setListener("TXMAP", new HandleMapInterval(listener));
228      
229      return new ParserListener(entryParser, entryListener);
230    } catch (CloneNotSupportedException cnse) {
231      throw new BioError(cnse);
232    }
233  }
234  
235  /**
236   * Generate a tag-value parser for the library info unigene files.
237   *
238   * @param listener the TagValueListener to pass events onto
239   * @return a ParserListener that is ready to consume unigene lib.info files
240   */
241  public static ParserListener buildLibInfoParser(TagValueListener listener)
242  throws IOException, ParserException{
243    RegexParser parser = new RegexParser();
244    parser.setContinueOnEmptyTag(false);
245    parser.setEndOfRecord(TagValueParser.EMPTY_LINE_EOR);
246    parser.setMergeSameTag(false);
247    parser.setPattern(Pattern.compile("([^=]+)=(.*)"));
248    parser.setTagGroup(1);
249    parser.setValueGroup(2);
250    
251    return new ParserListener(parser, listener);
252  }  
253  
254  private static class SplitAndProp
255  extends SimpleTagValueWrapper {
256    private Pattern splitPattern;
257    
258    public SplitAndProp(TagValueListener delegate, Pattern splitPattern) {
259      super(delegate);
260      this.splitPattern = splitPattern;
261    }
262    
263    public void value(TagValueContext tvc, Object value)
264    throws ParserException {
265      TagValueListener delegate = super.getDelegate();
266      
267      delegate.startRecord();
268      
269      String sv = (String) value;
270      Matcher m = splitPattern.matcher(sv);
271      while(m.find()) {
272        String k = m.group(1);
273        String v = m.group(2);
274        
275        delegate.startTag(k);
276        delegate.value(tvc, v);
277        delegate.endTag();
278      }
279      
280      delegate.endRecord();
281    }
282  }
283  
284  private static class HandleMapInterval
285  extends SimpleTagValueWrapper {
286    private Pattern pattern;
287    public HandleMapInterval(TagValueListener tvl) {
288      super(tvl);
289      pattern = Pattern.compile("([^-]+-[^;]+);\\s+\\w+=([^;]+);\\s+\\w+=(\\S+)");
290    }
291    
292    public void value(TagValueContext tvc, Object value)
293    throws ParserException {
294      TagValueListener delegate = super.getDelegate();
295      
296      delegate.startRecord();
297      
298      String sv = (String) value;
299      Matcher m = pattern.matcher(sv);
300      if(!m.find()) {
301        throw new ParserException("Could not parse line: " + sv);
302      }
303      
304      delegate.startTag("INTERVAL");
305      delegate.value(tvc, m.group(1));
306      delegate.endTag();
307      
308      delegate.startTag("MARKER");
309      delegate.value(tvc, m.group(2));
310      delegate.endTag();
311
312      delegate.startTag("RHPANEL");
313      delegate.value(tvc, m.group(3));
314      delegate.endTag();
315      
316      delegate.endRecord();
317    }
318  }
319
320  /**
321   * <p>Register a UnigeneFactory.</p>
322   *
323   * <p>This method is for developers who have written their own UnigeneFactory
324   * implementations. By default, jdbc and file URLs are handled by built-in
325   * factories.</p>
326   *
327   * <p>When you register a factory, it will be used for all URLs that is can
328   * accept. If a factory is registered afterwards that can accept the same URL,
329   * the first factory registered will be used.</p>
330   *
331   * @param factory  the UnigeneFactory to register
332   */
333  public static void registerFactory(UnigeneFactory factory) {
334    factories.add(factory);
335  }
336
337  /**
338   * <p>Register a UnigeneFactory.</p>
339   *
340   * <p>This method is for developers who wish to unregister a factory.</p>
341   *
342   * @param factory  the UnigeneFactory to unregister
343   */
344  public static void unregisterFactory(UnigeneFactory factory) {
345    factories.remove(factory);
346  }
347
348  /**
349   * Load a UnigeneDB instance referred to by a URL.
350   *
351   * @param dbURL the URL location the database
352   * @return a UnigeneDB instance
353   * @throws BioException if there was no UnigeneFactory able to process that
354   *         URL or if there was some error connecting
355   */
356  public static UnigeneDB loadUnigene(URL dbURL)
357  throws BioException {
358    return findFactory(dbURL).loadUnigene(dbURL);
359  }
360
361  /**
362   * Create a new UnigeneDB instance referred to by a URL.
363   *
364   * @param dbURL the URL location the database
365   * @return a UnigeneDB instance
366   * @throws BioException if there was no UnigeneFactory able to process that
367   *         URL or if there was some error creating it
368   */
369  public static UnigeneDB createUnigene(URL dbURL)
370  throws BioException {
371    return findFactory(dbURL).createUnigene(dbURL);
372  }
373
374  /**
375   * Find the UnigeneFactory that can accept a URL.
376   *
377   * <p><em>This method is for developers only.</em> The normal way to interact
378   * with factories is to call UnigeneTools.loadUnigene() and
379   * UnigeneTools.createUnigene()</p>
380   *
381   * @param dbURL  the URL to find a factory for
382   * @return the UnigeneFactory that accepts that URL
383   * @throws BioException if there is no factory for that type of URL
384   */
385  public static UnigeneFactory findFactory(URL dbURL)
386  throws BioException {
387    for(Iterator i = factories.iterator(); i.hasNext(); ) {
388      UnigeneFactory factory = (UnigeneFactory) i.next();
389      if(factory.canAccept(dbURL)) {
390        return factory;
391      }
392    }
393
394    throw new BioException("No factory for unigene url: " + dbURL);
395  }
396}