package org.biojava.bio.program.unigene;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.biojava.bio.AnnotationType;
import org.biojava.bio.BioError;
import org.biojava.bio.BioException;
import org.biojava.bio.CardinalityConstraint;
import org.biojava.bio.PropertyConstraint;
import org.biojava.bio.program.tagvalue.ChangeTable;
import org.biojava.bio.program.tagvalue.LineSplitParser;
import org.biojava.bio.program.tagvalue.ParserListener;
import org.biojava.bio.program.tagvalue.RegexParser;
import org.biojava.bio.program.tagvalue.RegexSplitter;
import org.biojava.bio.program.tagvalue.SimpleTagValueWrapper;
import org.biojava.bio.program.tagvalue.TagDelegator;
import org.biojava.bio.program.tagvalue.TagValueContext;
import org.biojava.bio.program.tagvalue.TagValueListener;
import org.biojava.bio.program.tagvalue.TagValueParser;
import org.biojava.bio.program.tagvalue.ValueChanger;
import org.biojava.utils.ParserException;

/**
 * <p>Useful tools for working with Unigene.</p>
 *
 * <p>This class is the main port-of-call for users of the Unigene package. It
 * provides the core APIs for finding a Unigene database as well as registering
 * your own Unigene drivers. Additionally, it contains methods to return parsers
 * for each of the main Unigene flat-file types. If you wish to bypass the
 * biojava object model entirely, you can choose to use these parsers instead.
 * </p>
 *
 * <h2>Example use</h2>
 *
 * <p>Creating a Unigene instance from your local Unigene directory (assuming
 * that you have read/write privileges to the directory)</p>
 *
 * <pre>
 * UnigeneDB unigene = UnigeneTools.createUnigene(
 *   new URL("file:///usr/local/biodata/unigene") );
 * </pre>
 *
 * <p>Fetch a unigene cluster</p>
 *
 * <pre>
 * UnigeneDB unigene = UnigeneTools.loadUnigene(
 *   new URL("file:///usr/local/biodata/unigene") );
 * UnigeneCluster cluster = unigene.getCluster("Aga001");
 * System.out.println("Title: " + cluster.getTitle());
 * </pre>
 *
 * <p>Parse a data file yourself</p>
 *
 * <pre>
 * BufferedReader br = new BufferedReader(new FileReader(unigeneFile));
 * Parser parser = new Parser();
 * TagValueListener echo = new Echo();
 * ParserListener pl = UnigeneTools.buildDataParser(echo);
 *
 * while(parser.read(br, pl.getParser(), pl.getListener())) {
 *   // read an entry
 * }
 * </pre>
 *
 * @author Matthew Pocock
 */
public class UnigeneTools {
  /**
   * <p>
   * Annotation schema for all UnigeneCluster instances. This states what
   * properties can be expected to be associated with a cluster and how many
   * values they may have.
   * </p>
   */
  public static final AnnotationType UNIGENE_ANNOTATION;

  /**
   * <p>
   * Annotation schema for all Unigene libraries. This states what properties
   * can be expected to be associated with a library and how many values they
   * may have.
   * </p>
   */
  public static final AnnotationType LIBRARY_ANNOTATION;

  // Patterns are immutable, so they are compiled once here rather than on
  // every parser build.

  /** Matches one semicolon-delimited item of an EXPRESS value. */
  private static final Pattern EXPRESS_ITEM = Pattern.compile("([^;]+)");

  /** Matches one key=value pair inside an STS, PROTSIM or SEQUENCE value. */
  private static final Pattern KEY_VALUE = Pattern.compile("(\\S+?)=([^;\\s]*)");

  /** Matches one tag=value line of a lib.info file. */
  private static final Pattern LIB_INFO_LINE = Pattern.compile("([^=]+)=(.*)");

  /** Registered factories, consulted in registration order. */
  private static final List factories;

  /** Maps short species codes (e.g. "Hs") to full species names. */
  private static final Map shortName2SpeciesName;

  static {
    factories = new ArrayList();
    registerFactory(new FlatFileUnigeneFactory());

    shortName2SpeciesName = new HashMap();

    // Each short code is registered exactly once. A misspelled duplicate
    // entry for "Aga" that was immediately overwritten has been removed.
    shortName2SpeciesName.put("Hs", "Homo sapiens");
    shortName2SpeciesName.put("Aga", "Anopheles gambiae");
    shortName2SpeciesName.put("Bt", "Bos taurus");
    shortName2SpeciesName.put("Dm", "Drosophila melanogaster");
    shortName2SpeciesName.put("Dr", "Danio rerio");
    shortName2SpeciesName.put("Mm", "Mus musculus");
    shortName2SpeciesName.put("Rn", "Rattus norvegicus");
    shortName2SpeciesName.put("Xl", "Xenopus laevis");
    shortName2SpeciesName.put("At", "Arabidopsis thaliana");
    shortName2SpeciesName.put("Gma", "Glycine max");
    shortName2SpeciesName.put("Hv", "Hordeum vulgare");
    shortName2SpeciesName.put("Les", "Lycopersicon esculentum");
    shortName2SpeciesName.put("Mtr", "Medicago truncatula");
    shortName2SpeciesName.put("Os", "Oryza sativa");
    shortName2SpeciesName.put("Ta", "Triticum aestivum");
    shortName2SpeciesName.put("Zm", "Zea mays");

    // start to build this annotation type for .data files & UnigeneCluster
    // annotation bundles
    PropertyConstraint pc_string = new PropertyConstraint.ByClass(String.class);
    PropertyConstraint pc_int = new PropertyConstraint.ByClass(Integer.class);

    AnnotationType.Impl at_sts = new AnnotationType.Impl();
    at_sts.setConstraints("NAME", pc_string, CardinalityConstraint.ONE);
    at_sts.setConstraints("ACC", pc_string, CardinalityConstraint.ZERO_OR_ONE);
    at_sts.setConstraints("DSEG", pc_string, CardinalityConstraint.ZERO_OR_ONE);
    at_sts.setConstraints("UNISTS", pc_string, CardinalityConstraint.ONE);
    PropertyConstraint pc_sts = new PropertyConstraint.ByAnnotationType(at_sts);

    AnnotationType.Impl at_txmap = new AnnotationType.Impl();
    at_txmap.setConstraints("MARKER", pc_string, CardinalityConstraint.ONE);
    at_txmap.setConstraints("RHPANEL", pc_string, CardinalityConstraint.ONE);
    PropertyConstraint pc_txmap = new PropertyConstraint.ByAnnotationType(at_txmap);

    AnnotationType.Impl at_protsim = new AnnotationType.Impl();
    at_protsim.setConstraints("ORG", pc_string, CardinalityConstraint.ONE);
    at_protsim.setConstraints("PROTGI", pc_string, CardinalityConstraint.ONE);
    at_protsim.setConstraints("PROTID", pc_string, CardinalityConstraint.ONE);
    at_protsim.setConstraints("PCT", pc_string, CardinalityConstraint.ONE);
    at_protsim.setConstraints("ALN", pc_int, CardinalityConstraint.ONE);
    PropertyConstraint pc_prosim = new PropertyConstraint.ByAnnotationType(at_protsim);

    AnnotationType.Impl at_sequence = new AnnotationType.Impl();
    at_sequence.setConstraints("ACC", pc_string, CardinalityConstraint.ONE);
    at_sequence.setConstraints("NID", pc_string, CardinalityConstraint.ONE);
    at_sequence.setConstraints("PID", pc_string, CardinalityConstraint.ZERO_OR_ONE);
    at_sequence.setConstraints("CLONE", pc_string, CardinalityConstraint.ZERO_OR_ONE);
    at_sequence.setConstraints("END", pc_string, CardinalityConstraint.ZERO_OR_ONE);
    at_sequence.setConstraints("LID", pc_string, CardinalityConstraint.ZERO_OR_ONE);
    at_sequence.setConstraints("MGC", pc_string, CardinalityConstraint.ZERO_OR_ONE);
    PropertyConstraint pc_sequence = new PropertyConstraint.ByAnnotationType(at_sequence);

    AnnotationType.Impl unigene = new AnnotationType.Impl();
    unigene.setConstraints("ID", pc_string, CardinalityConstraint.ONE);
    unigene.setConstraints("TITLE", pc_string, CardinalityConstraint.ONE);
    unigene.setConstraints("GENE", pc_string, CardinalityConstraint.ONE);
    unigene.setConstraints("CYTOBAND", pc_string, CardinalityConstraint.ONE);
    unigene.setConstraints("EXPRESS", pc_string, CardinalityConstraint.ONE);
    unigene.setConstraints(
      "GNM_TERMINUS",
      new PropertyConstraint.Enumeration(new Object[] { "T", "I", "S" } ),
      CardinalityConstraint.ONE);
    unigene.setConstraints("LOCUSLINK", pc_string, CardinalityConstraint.ONE);
    unigene.setConstraints("CHROMOSOME", pc_string, CardinalityConstraint.ONE);
    unigene.setConstraints("STS", pc_sts, CardinalityConstraint.ANY);
    unigene.setConstraints("TXMAP", pc_txmap, CardinalityConstraint.ANY);
    // NOTE(review): buildDataParser() registers its delegate under the tag
    // "PROTSIM", whereas this schema constrains "PROSIM" - confirm which
    // spelling is intended before relying on this constraint.
    unigene.setConstraints("PROSIM", pc_prosim, CardinalityConstraint.ANY);
    unigene.setConstraints("SCOUNT", pc_int, CardinalityConstraint.ONE);
    unigene.setConstraints("SEQUENCE", pc_sequence, CardinalityConstraint.ANY);

    UNIGENE_ANNOTATION = unigene;

    AnnotationType.Impl library = new AnnotationType.Impl();
    library.setConstraints("ID", pc_string, CardinalityConstraint.ONE);
    library.setConstraints("TITLE", pc_string, CardinalityConstraint.ONE);
    library.setConstraints("TISSUE", pc_string, CardinalityConstraint.ONE);
    library.setConstraints("VECTOR", pc_string, CardinalityConstraint.ONE);

    LIBRARY_ANNOTATION = library;
  }

  /**
   * Converts short species names (like Hs) to long species names (like Homo
   * sapiens).
   *
   * @param name  the short name
   * @return the long name, or <code>null</code> if the short name is not
   *         known
   */
  public static String getSpeciesForShortName(String name) {
    return (String) shortName2SpeciesName.get(name);
  }

  /**
   * Generate a tag-value parser for unigene data files that will pass all
   * parsing events on to your listener.
   *
   * <p>EXPRESS values are split on semicolons, ALN and SCOUNT values are
   * converted from String to int, STS, PROTSIM and SEQUENCE values are broken
   * into key=value sub-records, and TXMAP values are broken into INTERVAL,
   * MARKER and RHPANEL sub-tags.</p>
   *
   * @param listener the TagValueListener to pass events onto
   * @return a ParserListener that is ready to consume unigene data documents
   * @throws ParserException if the parser could not be built
   */
  public static ParserListener buildDataParser(TagValueListener listener)
  throws ParserException {
    try {
      LineSplitParser entryParser = (LineSplitParser) LineSplitParser.GENBANK.clone();
      entryParser.setTrimValue(true);
      entryParser.setEndOfRecord("//");

      ChangeTable changeT = new ChangeTable();
      changeT.setSplitter("EXPRESS", new RegexSplitter(EXPRESS_ITEM, 1));
      changeT.setChanger("ALN", ChangeTable.STRING_TO_INT);
      changeT.setChanger("SCOUNT", ChangeTable.STRING_TO_INT);
      ValueChanger changer = new ValueChanger(listener, changeT);

      // The sub-record handlers forward straight to the caller's listener,
      // not through the ValueChanger above.
      SplitAndProp splitAndProp = new SplitAndProp(listener, KEY_VALUE);
      TagDelegator entryListener = new TagDelegator(changer);
      entryListener.setListener("STS", splitAndProp);
      entryListener.setListener("PROTSIM", splitAndProp);
      entryListener.setListener("SEQUENCE", splitAndProp);
      entryListener.setListener("TXMAP", new HandleMapInterval(listener));

      return new ParserListener(entryParser, entryListener);
    } catch (CloneNotSupportedException cnse) {
      throw new BioError(cnse);
    }
  }

  /**
   * Generate a tag-value parser for the library info unigene files.
   *
   * <p>Each line is matched as <code>tag=value</code>; records are terminated
   * by an empty line.</p>
   *
   * @param listener the TagValueListener to pass events onto
   * @return a ParserListener that is ready to consume unigene lib.info files
   * @throws IOException declared for callers; not raised by the code visible
   *         in this method
   * @throws ParserException if the parser could not be built
   */
  public static ParserListener buildLibInfoParser(TagValueListener listener)
  throws IOException, ParserException{
    RegexParser parser = new RegexParser();
    parser.setContinueOnEmptyTag(false);
    parser.setEndOfRecord(TagValueParser.EMPTY_LINE_EOR);
    parser.setMergeSameTag(false);
    parser.setPattern(LIB_INFO_LINE);
    parser.setTagGroup(1);
    parser.setValueGroup(2);

    return new ParserListener(parser, listener);
  }

  /**
   * Turns a single value into a nested record: every match of the split
   * pattern becomes one tag (group 1) with one value (group 2).
   */
  private static class SplitAndProp
  extends SimpleTagValueWrapper {
    private final Pattern splitPattern;

    public SplitAndProp(TagValueListener delegate, Pattern splitPattern) {
      super(delegate);
      this.splitPattern = splitPattern;
    }

    public void value(TagValueContext tvc, Object value)
    throws ParserException {
      TagValueListener delegate = super.getDelegate();

      delegate.startRecord();

      String sv = (String) value;
      Matcher m = splitPattern.matcher(sv);
      while(m.find()) {
        String k = m.group(1);
        String v = m.group(2);

        delegate.startTag(k);
        delegate.value(tvc, v);
        delegate.endTag();
      }

      delegate.endRecord();
    }
  }

  /**
   * Turns a TXMAP value into a nested record with INTERVAL, MARKER and
   * RHPANEL tags.
   */
  private static class HandleMapInterval
  extends SimpleTagValueWrapper {
    /** interval; key=marker; key=rhpanel */
    private static final Pattern INTERVAL_PATTERN =
      Pattern.compile("([^-]+-[^;]+);\\s+\\w+=([^;]+);\\s+\\w+=(\\S+)");

    public HandleMapInterval(TagValueListener tvl) {
      super(tvl);
    }

    public void value(TagValueContext tvc, Object value)
    throws ParserException {
      String sv = (String) value;
      Matcher m = INTERVAL_PATTERN.matcher(sv);

      // Validate before emitting any events so that a malformed line does not
      // leave the delegate with a record that was started but never ended.
      if(!m.find()) {
        throw new ParserException("Could not parse line: " + sv);
      }

      TagValueListener delegate = super.getDelegate();

      delegate.startRecord();

      delegate.startTag("INTERVAL");
      delegate.value(tvc, m.group(1));
      delegate.endTag();

      delegate.startTag("MARKER");
      delegate.value(tvc, m.group(2));
      delegate.endTag();

      delegate.startTag("RHPANEL");
      delegate.value(tvc, m.group(3));
      delegate.endTag();

      delegate.endRecord();
    }
  }

  /**
   * <p>Register a UnigeneFactory.</p>
   *
   * <p>This method is for developers who have written their own UnigeneFactory
   * implementations. A FlatFileUnigeneFactory is registered by this class's
   * static initializer.</p>
   *
   * <p>When you register a factory, it will be used for all URLs that it can
   * accept. If a factory is registered afterwards that can accept the same URL,
   * the first factory registered will be used.</p>
   *
   * @param factory  the UnigeneFactory to register
   */
  public static void registerFactory(UnigeneFactory factory) {
    factories.add(factory);
  }

  /**
   * <p>Unregister a UnigeneFactory.</p>
   *
   * <p>This method is for developers who wish to unregister a factory.</p>
   *
   * @param factory  the UnigeneFactory to unregister
   */
  public static void unregisterFactory(UnigeneFactory factory) {
    factories.remove(factory);
  }

  /**
   * Load a UnigeneDB instance referred to by a URL.
   *
   * @param dbURL the URL location of the database
   * @return a UnigeneDB instance
   * @throws BioException if there was no UnigeneFactory able to process that
   *         URL or if there was some error connecting
   */
  public static UnigeneDB loadUnigene(URL dbURL)
  throws BioException {
    return findFactory(dbURL).loadUnigene(dbURL);
  }

  /**
   * Create a new UnigeneDB instance referred to by a URL.
   *
   * @param dbURL the URL location of the database
   * @return a UnigeneDB instance
   * @throws BioException if there was no UnigeneFactory able to process that
   *         URL or if there was some error creating it
   */
  public static UnigeneDB createUnigene(URL dbURL)
  throws BioException {
    return findFactory(dbURL).createUnigene(dbURL);
  }

  /**
   * Find the UnigeneFactory that can accept a URL.
   *
   * <p>Factories are tried in the order in which they were registered; the
   * first one that accepts the URL is returned.</p>
   *
   * <p><em>This method is for developers only.</em> The normal way to interact
   * with factories is to call UnigeneTools.loadUnigene() and
   * UnigeneTools.createUnigene()</p>
   *
   * @param dbURL  the URL to find a factory for
   * @return the UnigeneFactory that accepts that URL
   * @throws BioException if there is no factory for that type of URL
   */
  public static UnigeneFactory findFactory(URL dbURL)
  throws BioException {
    for(Iterator i = factories.iterator(); i.hasNext(); ) {
      UnigeneFactory factory = (UnigeneFactory) i.next();
      if(factory.canAccept(dbURL)) {
        return factory;
      }
    }

    throw new BioException("No factory for unigene url: " + dbURL);
  }
}