001package org.biojava.bio.program.unigene;
002
003import java.io.File;
004import java.io.FileFilter;
005import java.io.FileReader;
006import java.io.IOException;
007import java.net.URL;
008import java.util.Collections;
009import java.util.HashMap;
010import java.util.Map;
011import java.util.regex.Matcher;
012import java.util.regex.Pattern;
013
014import org.biojava.bio.BioException;
015import org.biojava.bio.program.indexdb.BioStore;
016import org.biojava.bio.program.indexdb.BioStoreFactory;
017import org.biojava.bio.program.indexdb.IndexStore;
018import org.biojava.bio.program.tagvalue.Indexer;
019import org.biojava.bio.program.tagvalue.Parser;
020import org.biojava.bio.program.tagvalue.ParserListener;
021import org.biojava.bio.seq.DNATools;
022import org.biojava.bio.seq.Sequence;
023import org.biojava.bio.seq.io.FastaFormat;
024import org.biojava.bio.seq.io.SeqIOAdapter;
025import org.biojava.bio.seq.io.SequenceBuilder;
026import org.biojava.bio.seq.io.SequenceBuilderFactory;
027import org.biojava.bio.seq.io.StreamReader;
028import org.biojava.bio.seq.io.SymbolTokenization;
029import org.biojava.utils.CommitFailure;
030import org.biojava.utils.ParserException;
031import org.biojava.utils.io.CountedBufferedReader;
032import org.biojava.utils.io.RAF;
033
034/**
035 * <p>A UnigeneFactory that will use flat-file indexing of the unigene ascii-art
036 * files.</p>
037 *
038 * <p><em>This class is for developers and power-users.</em> Usually you will
039 * not use this class directly, but rather use UnigeneTools.loadDatabase() with
040 * a file URL.</p>
041 *
 * <p>This will create all the index files necessary to look up records in a
 * timely manner. It requires read/write access to the unigene directory. No
 * files will be deleted during this operation. The indexing strategy used is
 * compatible with the OBDA flat-file indexing spec and uses the package
 * org.biojava.bio.program.indexdb and parsers that are compatible with the
 * tag-value API.</p>
048 *
049 * @author Matthew Pocock
050 */
051public class FlatFileUnigeneFactory
052implements UnigeneFactory {
053  private static final String DATA_INDEX = "data.index";
054  private static final String LIB_INFO_INDEX = "libInfo.index";
055  private static final String UNIQUE_INDEX = "unique.index";
056  private static final String ALL_INDEX = "all.index";
057
058  /**
059   * Accepts all URLs that are of the file protocol.
060   */
061  public boolean canAccept(URL unigeneLoc) {
062    return unigeneLoc.getProtocol().equals("file");
063  }
064
065  public UnigeneDB loadUnigene(URL unigeneLoc)
066  throws BioException {
067    if(!unigeneLoc.getProtocol().equals("file")) {
068      throw new BioException(
069        "Can't create unigene from non-file URL: " +
070        unigeneLoc
071      );
072    }
073
074    File unigeneDir = new File(unigeneLoc.getPath());
075    if(!unigeneDir.exists()) {
076      throw new BioException("Could not locate directory: " + unigeneDir);
077    }
078    if(!unigeneDir.isDirectory()) {
079      throw new BioException("Expecting a directory at: " + unigeneDir);
080    }
081
082
083    // load a pre-made unigene file set
084    try {
085      return new FlatFileUnigeneDB(
086        new BioStore(new File(unigeneDir, DATA_INDEX), true),
087        new BioStore(new File(unigeneDir, LIB_INFO_INDEX), true),
088        new BioStore(new File(unigeneDir, UNIQUE_INDEX), true),
089        new BioStore(new File(unigeneDir, ALL_INDEX), true)
090      );
091    } catch (IOException ioe) {
092      throw new BioException("Could not instantiate flat file unigene db",ioe);
093    }
094  }
095
096  public UnigeneDB createUnigene(URL unigeneLoc)
097  throws BioException {
098    if(!unigeneLoc.getProtocol().equals("file")) {
099      throw new BioException(
100        "Can't create unigene from non-file URL: " +
101        unigeneLoc
102      );
103    }
104
105    File unigeneDir = new File(unigeneLoc.getPath());
106    if(!unigeneDir.exists()) {
107      throw new BioException("Could not locate directory: " + unigeneDir);
108    }
109    if(!unigeneDir.isDirectory()) {
110      throw new BioException("Expecting a directory at: " + unigeneDir);
111    }
112
113    try {
114      indexAll(unigeneDir);
115      indexUnique(unigeneDir);
116      indexData(unigeneDir);
117      indexLibInfo(unigeneDir);
118    } catch (IOException ioe) {
119      throw new BioException("Failed to index data",ioe);
120    }
121
122    return loadUnigene(unigeneLoc);
123  }
124
125  private void indexData(File unigeneDir)
126  throws BioException, IOException {
127    // create index file for all *.data files
128    File dataIndexFile = new File(unigeneDir, DATA_INDEX);
129    BioStoreFactory dataBSF = new BioStoreFactory();
130    dataBSF.setPrimaryKey("ID");
131    dataBSF.addKey("ID", 10);
132    dataBSF.setStoreLocation(dataIndexFile);
133    BioStore dataStore = dataBSF.createBioStore();
134    File[] dataFiles = unigeneDir.listFiles(new FileFilter() {
135      public boolean accept(File pathName) {
136        return pathName.getName().endsWith(".data");
137      }
138    });
139    for(int i = 0; i < dataFiles.length; i++) {
140      File f = dataFiles[i];
141      try {
142        Indexer indexer = new Indexer(f, dataStore);
143        indexer.setPrimaryKeyName("ID");
144        Parser parser = new Parser();
145        ParserListener pl = UnigeneTools.buildDataParser(indexer);
146        while(parser.read(
147          indexer.getReader(),
148          pl.getParser(),
149          pl.getListener()
150        )) { ; }
151      } catch (ParserException pe) {
152        throw new BioException("Failed to parse " + f, pe);
153      }
154    }
155    try {
156      dataStore.commit();
157    } catch (CommitFailure ne) {
158      throw new BioException(ne);
159    }
160  }
161
162  private void indexLibInfo(File unigeneDir)
163  throws BioException, IOException {
164    // create index for all *.lib.info files
165    File liIndexFile = new File(unigeneDir, LIB_INFO_INDEX);
166    BioStoreFactory liBSF = new BioStoreFactory();
167    liBSF.setPrimaryKey("ID");
168    liBSF.addKey("ID", 7);
169    liBSF.setStoreLocation(liIndexFile);
170    BioStore liStore = liBSF.createBioStore();
171    File[] liFiles = unigeneDir.listFiles(new FileFilter() {
172      public boolean accept(File pathName) {
173        return pathName.getName().endsWith(".lib.info");
174      }
175    });
176    for(int i = 0; i < liFiles.length; i++) {
177      File f = liFiles[i];
178      try {
179        Indexer indexer = new Indexer(f, liStore);
180        indexer.setPrimaryKeyName("ID");
181        Parser parser = new Parser();
182        ParserListener pl = UnigeneTools.buildLibInfoParser(indexer);
183        while(parser.read(
184            indexer.getReader(),
185            pl.getParser(),
186            pl.getListener()
187        )) { ; }
188      } catch (ParserException pe) {
189        throw new BioException("Failed to parse " + f, pe);
190      }
191    }
192    try {
193      liStore.commit();
194    } catch (CommitFailure ne) {
195      throw new BioException(ne);
196    }
197  }
198
199  private void indexUnique(File unigeneDir)
200  throws BioException, IOException {
201    File uniqueIndex = new File(unigeneDir, UNIQUE_INDEX);
202    BioStoreFactory uniqueBSF = new BioStoreFactory();
203    uniqueBSF.setStoreLocation(uniqueIndex);
204    uniqueBSF.setPrimaryKey("ID");
205    uniqueBSF.addKey("ID", 10);
206    BioStore uniqueStore = uniqueBSF.createBioStore();
207    File[] uniqueFiles = unigeneDir.listFiles(new FileFilter() {
208      public boolean accept(File pathName) {
209        return pathName.getName().endsWith(".seq.uniq");
210      }
211    });
212    for(int i = 0; i < uniqueFiles.length; i++) {
213      File f = uniqueFiles[i];
214      RAF raf = new RAF(f, "r");
215      FastaIndexer indexer = new FastaIndexer(
216        raf,
217        uniqueStore,
218        Pattern.compile("#(\\S+)"),
219        1
220      );
221      FastaFormat format = new FastaFormat();
222      SymbolTokenization tok = DNATools.getDNA().getTokenization("token");
223      StreamReader sreader = new StreamReader(
224        indexer.getReader(),
225        format,
226        tok,
227        indexer
228      );
229      while(sreader.hasNext()) {
230        sreader.nextSequence();
231      }
232    }
233    try {
234      uniqueStore.commit();
235    } catch (CommitFailure ne) {
236      throw new BioException(ne);
237    }
238  }
239
240  private void indexAll(File unigeneDir)
241  throws BioException, IOException {
242    File allIndex = new File(unigeneDir, ALL_INDEX);
243    BioStoreFactory allBSF = new BioStoreFactory();
244    allBSF.setStoreLocation(allIndex);
245    allBSF.setPrimaryKey("ID");
246    allBSF.addKey("ID", 10);
247    BioStore allStore = allBSF.createBioStore();
248    File[] allFiles = unigeneDir.listFiles(new FileFilter() {
249      public boolean accept(File pathName) {
250        return pathName.getName().endsWith(".seq.all");
251      }
252    });
253    Pattern pattern = Pattern.compile("/gb=(\\S+)");
254    for(int i = 0; i < allFiles.length; i++) {
255      File f = allFiles[i];
256      RAF raf = new RAF(f, "r");
257      CountedBufferedReader reader = new CountedBufferedReader(new FileReader(f));
258
259      long offset = -1;
260      String id = null;
261      for(String line = reader.readLine(); line != null; line = reader.readLine()) {
262        if(line.startsWith("#")) {
263          long nof = reader.getFilePointer();
264          if(id != null) {
265            allStore.writeRecord(raf, offset, (int) (nof - offset), id, Collections.EMPTY_MAP);
266          }
267          Matcher matcher = pattern.matcher(line);
268          matcher.find();
269          id = matcher.group(1);
270          offset = nof;
271        }
272      }
273    }
274    try {
275      allStore.commit();
276    } catch (CommitFailure cf) {
277      throw new BioException(cf);
278    }
279  }
280
281  private static class FastaIndexer implements SequenceBuilderFactory {
282    private final Map map = new HashMap();
283    private final RAF raf;
284    private final IndexStore store;
285    private final CountedBufferedReader reader;
286    private final Pattern idPattern;
287    private final int idGroup;
288
289    public FastaIndexer(RAF raf, IndexStore store, Pattern idPattern, int idGroup)
290    throws IOException {
291      this.raf = raf;
292      this.store = store;
293      this.idPattern = idPattern;
294      this.idGroup = idGroup;
295      reader = new CountedBufferedReader(
296        new FileReader(
297          raf.getFile()
298        )
299      );
300    }
301
302    public CountedBufferedReader getReader() {
303      return reader;
304    }
305
306    public SequenceBuilder makeSequenceBuilder() {
307      return new SeqIOIndexer();
308    }
309
310    class SeqIOIndexer extends SeqIOAdapter implements SequenceBuilder {
311      long offset = 0L;
312      String id;
313
314      public void startSequence() {
315        id = null;
316        offset = reader.getFilePointer();
317      }
318
319      public void addSequenceProperty(Object key, Object value) {
320        if(key.equals(FastaFormat.PROPERTY_DESCRIPTIONLINE)) {
321          String line = (String) value;
322          Matcher m = idPattern.matcher(line);
323          m.find();
324          id = m.group(idGroup);
325        }
326      }
327
328      public void endSequence() {
329        long nof = reader.getFilePointer();
330        store.writeRecord(raf, offset, (int) (nof - offset), id, map);
331        offset = nof;
332      }
333
334      public Sequence makeSequence() {
335        return null;
336      }
337    }
338  }
339}