001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.program.indexdb;
023
024import java.io.File;
025import java.io.FileNotFoundException;
026import java.io.FileReader;
027import java.io.IOException;
028import java.util.HashMap;
029import java.util.Map;
030
031import org.biojava.bio.BioException;
032import org.biojava.bio.program.tagvalue.ChangeTable;
033import org.biojava.bio.program.tagvalue.Indexer;
034import org.biojava.bio.program.tagvalue.LineSplitParser;
035import org.biojava.bio.program.tagvalue.Parser;
036import org.biojava.bio.program.tagvalue.ValueChanger;
037import org.biojava.bio.seq.io.SeqIOConstants;
038import org.biojava.utils.CommitFailure;
039import org.biojava.utils.ParserException;
040import org.biojava.utils.io.CountedBufferedReader;
041import org.biojava.utils.io.RAF;
042import org.biojava.utils.lsid.LifeScienceIdentifier;
043
044/**
045 * <code>IndexTools</code> contains static utility methods for
046 * creating flatfile indices according to the OBDA standard.
047 *
048 * @author Keith James
049 * @author Matthew Pocock
050 */
051public class IndexTools
052{
053    // Cannot be instantiated
054    private IndexTools() { }
055
056    /**
057     * <code>indexFasta</code> indexes DNA, RNA or protein Fasta
058     * format sequence files on primary identifier.
059     *
060     * @param location a <code>File</code> directory which will
061     * contain the indices.
062     * @param seqFiles a <code>File []</code> array of files to index.
063     * @param alphabetIdentifier an <code>int</code> indicating the
064     * type of sequence to be indexed. May be one of
065     * <code>SeqIOConstants.DNA SeqIOConstants.RNA
066     * SeqIOConstants.AA</code>.
067     * @param name a <code>String</code> arbitrary database name.
068     *
069     * @exception FileNotFoundException if an error occurs.
070     * @exception IOException if an error occurs.
071     * @exception ParserException if an error occurs.
072     * @exception BioException if an error occurs.
073     */
074    public static void indexFasta(String name, File location, File [] seqFiles,
075                                  int alphabetIdentifier)
076        throws FileNotFoundException, IOException, ParserException,
077               BioException
078    {
079        BioStoreFactory bsf = new BioStoreFactory();
080        bsf.setStoreName(name);
081
082        switch (alphabetIdentifier)
083        {
084            case (SeqIOConstants.DNA):
085                bsf.setSequenceFormat(SeqIOConstants.LSID_FASTA_DNA);
086                break;
087            case (SeqIOConstants.RNA):
088                bsf.setSequenceFormat(SeqIOConstants.LSID_FASTA_RNA);
089                break;
090            case (SeqIOConstants.AA):
091                bsf.setSequenceFormat(SeqIOConstants.LSID_FASTA_AA);
092                break;
093
094            default:
095                throw new IllegalArgumentException("Unknown alphabet identifier '"
096                                                   + alphabetIdentifier
097                                                   + "'");
098        }
099
100        _indexFasta(bsf, location, seqFiles);
101    }
102
103    /**
104     * <code>indexEmbl</code> indexes DNA, RNA or protein EMBL format
105     * sequence files on ID as primary identifier and AC as secondary.
106     *
107     * @param location a <code>File</code> directory which will
108     * contain the indices.
109     * @param seqFiles a <code>File []</code> array of files to index.
110     * @param alphabetIdentifier an <code>int</code> indicating the
111     * type of sequence to be indexed. May be one of
112     * <code>SeqIOConstants.DNA SeqIOConstants.RNA
113     * SeqIOConstants.AA</code>.
114     * @param name a <code>String</code> arbitrary database name.
115     *
116     * @exception FileNotFoundException if an error occurs.
117     * @exception IOException if an error occurs.
118     * @exception ParserException if an error occurs.
119     * @exception BioException if an error occurs.
120     */
121    public static void indexEmbl(String name, File location, File [] seqFiles,
122                                 int alphabetIdentifier)
123        throws FileNotFoundException, IOException, ParserException,
124               BioException
125    {
126        BioStoreFactory bsf = new BioStoreFactory();
127        bsf.setStoreName(name);
128
129        switch (alphabetIdentifier)
130        {
131            case (SeqIOConstants.DNA):
132                bsf.setSequenceFormat(SeqIOConstants.LSID_EMBL_DNA);
133                break;
134            case (SeqIOConstants.RNA):
135                bsf.setSequenceFormat(SeqIOConstants.LSID_EMBL_RNA);
136                break;
137            case (SeqIOConstants.AA):
138                bsf.setSequenceFormat(SeqIOConstants.LSID_EMBL_AA);
139                break;
140
141            default:
142                throw new IllegalArgumentException("Unknown alphabet identifier '"
143                                                   + alphabetIdentifier
144                                                   + "'");
145        }
146
147        _indexEmblLike(bsf, location, seqFiles);
148    }
149
150    /**
151     * <code>indexGenbank</code> indexes DNA, RNA or protein Genbank
152     * format sequence files on LOCUS as primary identifier and
153     * ACCESSION as secondary.
154     *
155     * @param location a <code>File</code> directory which will
156     * contain the indices.
157     * @param seqFiles a <code>File []</code> array of files to index.
158     * @param alphabetIdentifier an <code>int</code> indicating the
159     * type of sequence to be indexed. May be one of
160     * <code>SeqIOConstants.DNA SeqIOConstants.RNA
161     * SeqIOConstants.AA</code>.
162     * @param name a <code>String</code> arbitrary database name.
163     *
164     * @exception FileNotFoundException if an error occurs.
165     * @exception IOException if an error occurs.
166     * @exception ParserException if an error occurs.
167     * @exception BioException if an error occurs.
168     */
169    public static void indexGenbank(String name, File location, File [] seqFiles,
170                                    int alphabetIdentifier)
171        throws FileNotFoundException, IOException, ParserException,
172               BioException
173    {
174        BioStoreFactory bsf = new BioStoreFactory();
175        bsf.setStoreName(name);
176
177        switch (alphabetIdentifier)
178        {
179            case (SeqIOConstants.DNA):
180                bsf.setSequenceFormat(SeqIOConstants.LSID_GENBANK_DNA);
181                break;
182            case (SeqIOConstants.RNA):
183                bsf.setSequenceFormat(SeqIOConstants.LSID_GENBANK_RNA);
184                break;
185            case (SeqIOConstants.AA):
186                bsf.setSequenceFormat(SeqIOConstants.LSID_GENBANK_AA);
187                break;
188
189            default:
190                throw new IllegalArgumentException("Unknown alphabet identifier '"
191                                                   + alphabetIdentifier
192                                                   + "'");
193        }
194
195        _indexGenbank(bsf, location, seqFiles);
196    }
197
198
199    /**
200     * <code>indexSwissprot</code> indexes Swissprot format protein
201     * sequence files on ID as primary identifier.
202     *
203     * @param location a <code>File</code> directory which will
204     * contain the indices.
205     * @param seqFiles a <code>File []</code> array of files to index.
206     * @exception FileNotFoundException if an error occurs.
207     * @exception IOException if an error occurs.
208     * @exception ParserException if an error occurs.
209     * @exception BioException if an error occurs.
210     */
211    public static void indexSwissprot(String name, File location, File [] seqFiles)
212        throws FileNotFoundException, IOException, ParserException,
213               BioException
214    {
215        BioStoreFactory bsf = new BioStoreFactory();
216        bsf.setStoreName(name);
217        bsf.setSequenceFormat(LifeScienceIdentifier.valueOf("open-bio.org",
218                                                            "swiss",
219                                                            "protein" ));
220        _indexEmblLike(bsf, location, seqFiles);
221    }
222
223    private static void _indexFasta(BioStoreFactory bsf,
224                                    File location, File [] seqFiles)
225       throws FileNotFoundException, IOException, BioException
226    {
227        bsf.setPrimaryKey("ID");
228        bsf.setStoreLocation(location);
229        bsf.addKey("ID", 10);
230
231        BioStore store = bsf.createBioStore();
232
233        for (int i = 0; i < seqFiles.length; i++)
234        {
235            // File data
236            long newOffset = 0L;
237            long oldOffset = 0L;
238            RAF raf = new RAF(seqFiles[i], "r");
239            Map map = new HashMap();
240
241            CountedBufferedReader reader =
242                new CountedBufferedReader(new FileReader(raf.getFile()));
243
244            // Record data
245            String id = "";
246
247            String line = null;
248            while ((line = reader.readLine()) != null)
249            {
250                if (line.startsWith(">"))
251                {
252                    // Write at end of record
253                    if (newOffset > 0)
254                    {
255                        store.writeRecord(raf, oldOffset,
256                                          (int) (newOffset - oldOffset),
257                                          id, map);
258                        oldOffset = newOffset;
259                    }
260                    newOffset = reader.getFilePointer();
261
262                    int delimeter = line.indexOf(" ");
263                    if (delimeter < 0)
264                        id = line.substring(1);
265                    else
266                        id = line.substring(1, delimeter);
267                }
268                else
269                {
270                    newOffset = reader.getFilePointer();
271                }
272            }
273
274            // Write final record
275            store.writeRecord(raf, oldOffset,
276                              (int) (newOffset - oldOffset),
277                              id, map);
278        }
279
280        try
281        {
282            store.commit();
283        }
284        catch (CommitFailure ne)
285        {
286            throw new BioException("Failed to commit new index to file",ne);
287        }
288    }
289
290    private static void _indexEmblLike(BioStoreFactory bsf,
291                                       File location, File [] seqFiles)
292        throws FileNotFoundException, IOException, ParserException,
293               BioException
294    {
295        bsf.setPrimaryKey("ID");
296        bsf.setStoreLocation(location);
297        bsf.addKey("AC", 10);
298        bsf.addKey("ID", 10);
299
300        BioStore store = bsf.createBioStore();
301
302        for (int i = 0; i < seqFiles.length; i++)
303        {
304            Indexer indexer = new Indexer(seqFiles[i], store);
305            indexer.setPrimaryKeyName("ID");
306            indexer.addSecondaryKey("AC");
307
308            ChangeTable changeTable = new ChangeTable();
309
310            changeTable.setChanger("ID", new ChangeTable.Changer()
311                {
312                    public Object change(Object value)
313                    {
314                        String s = (String) value;
315                        int i = s.indexOf(" ");
316
317                        if (i < 0)
318                            return s;
319                        else
320                            return s.substring(0, i);
321                    }
322                });
323
324            changeTable.setChanger("AC", new ChangeTable.Changer()
325                {
326                    public Object change(Object value)
327                    {
328                        String s = (String) value;
329                        int i = s.indexOf(";");
330                        return s.substring(0, i);
331                    }
332                });
333
334            ValueChanger changer = new ValueChanger(indexer, changeTable);
335            Parser parser = new Parser();
336
337            while(parser.read(indexer.getReader(),
338                              LineSplitParser.EMBL, changer));
339        }
340
341        try
342        {
343            store.commit();
344        }
345        catch (CommitFailure ne)
346        {
347            throw new BioException("Failed to commit new index to file",ne);
348        }
349    }
350
351    private static void _indexGenbank(BioStoreFactory bsf,
352                                      File location, File [] seqFiles)
353        throws FileNotFoundException, IOException, ParserException,
354               BioException
355    {
356        bsf.setPrimaryKey("LOCUS");
357        bsf.setStoreLocation(location);
358        bsf.addKey("LOCUS", 10);
359        bsf.addKey("ACCESSION", 10);
360
361        BioStore store = bsf.createBioStore();
362
363        for (int i = 0; i < seqFiles.length; i++)
364        {
365            Indexer indexer = new Indexer(seqFiles[i], store);
366            indexer.setPrimaryKeyName("LOCUS");
367            indexer.addSecondaryKey("ACCESSION");
368
369            ChangeTable changeTable = new ChangeTable();
370
371            changeTable.setChanger("LOCUS", new ChangeTable.Changer()
372                {
373                    public Object change(Object value)
374                    {
375                        String s = (String) value;
376                        int i = s.indexOf(" ");
377
378                        if (i < 0)
379                            return s;
380                        else
381                            return s.substring(0, i);
382                    }
383                });
384
385            ValueChanger changer = new ValueChanger(indexer, changeTable);
386            Parser parser = new Parser();
387
388            while(parser.read(indexer.getReader(),
389                              LineSplitParser.GENBANK, changer));
390        }
391
392        try
393        {
394            store.commit();
395        }
396        catch (CommitFailure ne)
397        {
398            throw new BioException("Failed to commit new index to file",ne);
399        }
400    }
401}