001package org.biojava.bio.program.unigene; 002 003import java.io.File; 004import java.io.FileFilter; 005import java.io.FileReader; 006import java.io.IOException; 007import java.net.URL; 008import java.util.Collections; 009import java.util.HashMap; 010import java.util.Map; 011import java.util.regex.Matcher; 012import java.util.regex.Pattern; 013 014import org.biojava.bio.BioException; 015import org.biojava.bio.program.indexdb.BioStore; 016import org.biojava.bio.program.indexdb.BioStoreFactory; 017import org.biojava.bio.program.indexdb.IndexStore; 018import org.biojava.bio.program.tagvalue.Indexer; 019import org.biojava.bio.program.tagvalue.Parser; 020import org.biojava.bio.program.tagvalue.ParserListener; 021import org.biojava.bio.seq.DNATools; 022import org.biojava.bio.seq.Sequence; 023import org.biojava.bio.seq.io.FastaFormat; 024import org.biojava.bio.seq.io.SeqIOAdapter; 025import org.biojava.bio.seq.io.SequenceBuilder; 026import org.biojava.bio.seq.io.SequenceBuilderFactory; 027import org.biojava.bio.seq.io.StreamReader; 028import org.biojava.bio.seq.io.SymbolTokenization; 029import org.biojava.utils.CommitFailure; 030import org.biojava.utils.ParserException; 031import org.biojava.utils.io.CountedBufferedReader; 032import org.biojava.utils.io.RAF; 033 034/** 035 * <p>A UnigeneFactory that will use flat-file indexing of the unigene ascii-art 036 * files.</p> 037 * 038 * <p><em>This class is for developers and power-users.</em> Usually you will 039 * not use this class directly, but rather use UnigeneTools.loadDatabase() with 040 * a file URL.</p> 041 * 042 * <p>This will create all the index files necisary to look up records in a timely 043 * manner. It requires read/write access to the unigene directory. No files 044 * will be deleted during this opperation. The indexing strategy used is 045 * compattible with the OBDA flat-file indexing spec and uses the package 046 * org.biojava.bio.program.indexdb and parsers that are compattible with the 047 * tag-value API.</p> 048 * 049 * @author Matthew Pocock 050 */ 051public class FlatFileUnigeneFactory 052implements UnigeneFactory { 053 private static final String DATA_INDEX = "data.index"; 054 private static final String LIB_INFO_INDEX = "libInfo.index"; 055 private static final String UNIQUE_INDEX = "unique.index"; 056 private static final String ALL_INDEX = "all.index"; 057 058 /** 059 * Accepts all URLs that are of the file protocol. 060 */ 061 public boolean canAccept(URL unigeneLoc) { 062 return unigeneLoc.getProtocol().equals("file"); 063 } 064 065 public UnigeneDB loadUnigene(URL unigeneLoc) 066 throws BioException { 067 if(!unigeneLoc.getProtocol().equals("file")) { 068 throw new BioException( 069 "Can't create unigene from non-file URL: " + 070 unigeneLoc 071 ); 072 } 073 074 File unigeneDir = new File(unigeneLoc.getPath()); 075 if(!unigeneDir.exists()) { 076 throw new BioException("Could not locate directory: " + unigeneDir); 077 } 078 if(!unigeneDir.isDirectory()) { 079 throw new BioException("Expecting a directory at: " + unigeneDir); 080 } 081 082 083 // load a pre-made unigene file set 084 try { 085 return new FlatFileUnigeneDB( 086 new BioStore(new File(unigeneDir, DATA_INDEX), true), 087 new BioStore(new File(unigeneDir, LIB_INFO_INDEX), true), 088 new BioStore(new File(unigeneDir, UNIQUE_INDEX), true), 089 new BioStore(new File(unigeneDir, ALL_INDEX), true) 090 ); 091 } catch (IOException ioe) { 092 throw new BioException("Could not instantiate flat file unigene db",ioe); 093 } 094 } 095 096 public UnigeneDB createUnigene(URL unigeneLoc) 097 throws BioException { 098 if(!unigeneLoc.getProtocol().equals("file")) { 099 throw new BioException( 100 "Can't create unigene from non-file URL: " + 101 unigeneLoc 102 ); 103 } 104 105 File unigeneDir = new File(unigeneLoc.getPath()); 106 if(!unigeneDir.exists()) { 107 throw new BioException("Could not locate directory: " + unigeneDir); 108 } 109 if(!unigeneDir.isDirectory()) { 110 throw new BioException("Expecting a directory at: " + unigeneDir); 111 } 112 113 try { 114 indexAll(unigeneDir); 115 indexUnique(unigeneDir); 116 indexData(unigeneDir); 117 indexLibInfo(unigeneDir); 118 } catch (IOException ioe) { 119 throw new BioException("Failed to index data",ioe); 120 } 121 122 return loadUnigene(unigeneLoc); 123 } 124 125 private void indexData(File unigeneDir) 126 throws BioException, IOException { 127 // create index file for all *.data files 128 File dataIndexFile = new File(unigeneDir, DATA_INDEX); 129 BioStoreFactory dataBSF = new BioStoreFactory(); 130 dataBSF.setPrimaryKey("ID"); 131 dataBSF.addKey("ID", 10); 132 dataBSF.setStoreLocation(dataIndexFile); 133 BioStore dataStore = dataBSF.createBioStore(); 134 File[] dataFiles = unigeneDir.listFiles(new FileFilter() { 135 public boolean accept(File pathName) { 136 return pathName.getName().endsWith(".data"); 137 } 138 }); 139 for(int i = 0; i < dataFiles.length; i++) { 140 File f = dataFiles[i]; 141 try { 142 Indexer indexer = new Indexer(f, dataStore); 143 indexer.setPrimaryKeyName("ID"); 144 Parser parser = new Parser(); 145 ParserListener pl = UnigeneTools.buildDataParser(indexer); 146 while(parser.read( 147 indexer.getReader(), 148 pl.getParser(), 149 pl.getListener() 150 )) { ; } 151 } catch (ParserException pe) { 152 throw new BioException("Failed to parse " + f, pe); 153 } 154 } 155 try { 156 dataStore.commit(); 157 } catch (CommitFailure ne) { 158 throw new BioException(ne); 159 } 160 } 161 162 private void indexLibInfo(File unigeneDir) 163 throws BioException, IOException { 164 // create index for all *.lib.info files 165 File liIndexFile = new File(unigeneDir, LIB_INFO_INDEX); 166 BioStoreFactory liBSF = new BioStoreFactory(); 167 liBSF.setPrimaryKey("ID"); 168 liBSF.addKey("ID", 7); 169 liBSF.setStoreLocation(liIndexFile); 170 BioStore liStore = liBSF.createBioStore(); 171 File[] liFiles = unigeneDir.listFiles(new FileFilter() { 172 public boolean accept(File pathName) { 173 return pathName.getName().endsWith(".lib.info"); 174 } 175 }); 176 for(int i = 0; i < liFiles.length; i++) { 177 File f = liFiles[i]; 178 try { 179 Indexer indexer = new Indexer(f, liStore); 180 indexer.setPrimaryKeyName("ID"); 181 Parser parser = new Parser(); 182 ParserListener pl = UnigeneTools.buildLibInfoParser(indexer); 183 while(parser.read( 184 indexer.getReader(), 185 pl.getParser(), 186 pl.getListener() 187 )) { ; } 188 } catch (ParserException pe) { 189 throw new BioException("Failed to parse " + f, pe); 190 } 191 } 192 try { 193 liStore.commit(); 194 } catch (CommitFailure ne) { 195 throw new BioException(ne); 196 } 197 } 198 199 private void indexUnique(File unigeneDir) 200 throws BioException, IOException { 201 File uniqueIndex = new File(unigeneDir, UNIQUE_INDEX); 202 BioStoreFactory uniqueBSF = new BioStoreFactory(); 203 uniqueBSF.setStoreLocation(uniqueIndex); 204 uniqueBSF.setPrimaryKey("ID"); 205 uniqueBSF.addKey("ID", 10); 206 BioStore uniqueStore = uniqueBSF.createBioStore(); 207 File[] uniqueFiles = unigeneDir.listFiles(new FileFilter() { 208 public boolean accept(File pathName) { 209 return pathName.getName().endsWith(".seq.uniq"); 210 } 211 }); 212 for(int i = 0; i < uniqueFiles.length; i++) { 213 File f = uniqueFiles[i]; 214 RAF raf = new RAF(f, "r"); 215 FastaIndexer indexer = new FastaIndexer( 216 raf, 217 uniqueStore, 218 Pattern.compile("#(\\S+)"), 219 1 220 ); 221 FastaFormat format = new FastaFormat(); 222 SymbolTokenization tok = DNATools.getDNA().getTokenization("token"); 223 StreamReader sreader = new StreamReader( 224 indexer.getReader(), 225 format, 226 tok, 227 indexer 228 ); 229 while(sreader.hasNext()) { 230 sreader.nextSequence(); 231 } 232 } 233 try { 234 uniqueStore.commit(); 235 } catch (CommitFailure ne) { 236 throw new BioException(ne); 237 } 238 } 239 240 private void indexAll(File unigeneDir) 241 throws BioException, IOException { 242 File allIndex = new File(unigeneDir, ALL_INDEX); 243 BioStoreFactory allBSF = new BioStoreFactory(); 244 allBSF.setStoreLocation(allIndex); 245 allBSF.setPrimaryKey("ID"); 246 allBSF.addKey("ID", 10); 247 BioStore allStore = allBSF.createBioStore(); 248 File[] allFiles = unigeneDir.listFiles(new FileFilter() { 249 public boolean accept(File pathName) { 250 return pathName.getName().endsWith(".seq.all"); 251 } 252 }); 253 Pattern pattern = Pattern.compile("/gb=(\\S+)"); 254 for(int i = 0; i < allFiles.length; i++) { 255 File f = allFiles[i]; 256 RAF raf = new RAF(f, "r"); 257 CountedBufferedReader reader = new CountedBufferedReader(new FileReader(f)); 258 259 long offset = -1; 260 String id = null; 261 for(String line = reader.readLine(); line != null; line = reader.readLine()) { 262 if(line.startsWith("#")) { 263 long nof = reader.getFilePointer(); 264 if(id != null) { 265 allStore.writeRecord(raf, offset, (int) (nof - offset), id, Collections.EMPTY_MAP); 266 } 267 Matcher matcher = pattern.matcher(line); 268 matcher.find(); 269 id = matcher.group(1); 270 offset = nof; 271 } 272 } 273 } 274 try { 275 allStore.commit(); 276 } catch (CommitFailure cf) { 277 throw new BioException(cf); 278 } 279 } 280 281 private static class FastaIndexer implements SequenceBuilderFactory { 282 private final Map map = new HashMap(); 283 private final RAF raf; 284 private final IndexStore store; 285 private final CountedBufferedReader reader; 286 private final Pattern idPattern; 287 private final int idGroup; 288 289 public FastaIndexer(RAF raf, IndexStore store, Pattern idPattern, int idGroup) 290 throws IOException { 291 this.raf = raf; 292 this.store = store; 293 this.idPattern = idPattern; 294 this.idGroup = idGroup; 295 reader = new CountedBufferedReader( 296 new FileReader( 297 raf.getFile() 298 ) 299 ); 300 } 301 302 public CountedBufferedReader getReader() { 303 return reader; 304 } 305 306 public SequenceBuilder makeSequenceBuilder() { 307 return new SeqIOIndexer(); 308 } 309 310 class SeqIOIndexer extends SeqIOAdapter implements SequenceBuilder { 311 long offset = 0L; 312 String id; 313 314 public void startSequence() { 315 id = null; 316 offset = reader.getFilePointer(); 317 } 318 319 public void addSequenceProperty(Object key, Object value) { 320 if(key.equals(FastaFormat.PROPERTY_DESCRIPTIONLINE)) { 321 String line = (String) value; 322 Matcher m = idPattern.matcher(line); 323 m.find(); 324 id = m.group(idGroup); 325 } 326 } 327 328 public void endSequence() { 329 long nof = reader.getFilePointer(); 330 store.writeRecord(raf, offset, (int) (nof - offset), id, map); 331 offset = nof; 332 } 333 334 public Sequence makeSequence() { 335 return null; 336 } 337 } 338 } 339}