/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
021package org.biojava.bio.seq.db;
022
023import java.io.BufferedReader;
024import java.io.File;
025import java.io.FileOutputStream;
026import java.io.FileReader;
027import java.io.FileWriter;
028import java.io.IOException;
029import java.io.PrintStream;
030import java.io.PrintWriter;
031import java.io.RandomAccessFile;
032import java.util.AbstractList;
033import java.util.AbstractSet;
034import java.util.Arrays;
035import java.util.Collections;
036import java.util.Comparator;
037import java.util.HashMap;
038import java.util.HashSet;
039import java.util.Iterator;
040import java.util.Map;
041import java.util.Set;
042import java.util.StringTokenizer;
043
044import org.biojava.bio.BioError;
045import org.biojava.bio.BioException;
046import org.biojava.bio.seq.io.SequenceBuilderFactory;
047import org.biojava.bio.seq.io.SequenceFormat;
048import org.biojava.bio.seq.io.SymbolTokenization;
049
050/**
051 * The original object for indexing sequence files.
052 *
053 * <p>This class may not be thread-safe.</p>
054 *
055 * @author Matthew Pocock
056 * @author Thomas Down
057 */
058public class BioIndex implements IndexStore {
059  private static Comparator STRING_CASE_SENSITIVE_ORDER = new Comparator() {
060    public int compare(Object a, Object b) {
061      return ((Comparable) a).compareTo(b);
062    }
063  };
064
065  private File indexDirectory;
066
067  private int fileCount;
068  private File[] fileIDToFile;
069
070  private FileAsList indxList;
071
072  private Set idSet = new ListAsSet();
073
074  private String name;
075  private SequenceFormat format;
076  private SequenceBuilderFactory sbFactory;
077  private SymbolTokenization symbolTokenization;
078
079  {
080    fileCount = 0;
081    fileIDToFile = new File[4];
082  }
083
084  public BioIndex(
085    File indexDirectory,
086    String namespace,
087    int idLength
088  ) throws IOException, BioException {
089    if(indexDirectory.exists()) {
090      throw new BioException(
091        "Can't create new index as directory already exists: " +
092        indexDirectory
093      );
094    }
095
096    // create directory
097    indexDirectory.mkdirs();
098
099    // create BIOINDEX.dat
100    {
101      File bioindex = new File(indexDirectory, "BIOINDEX.dat");
102      bioindex.createNewFile();
103      PrintWriter pw = new PrintWriter(new FileWriter(bioindex));
104      pw.println("index\tflat/1");
105      pw.close();
106    }
107
108    // create fileids.dat
109    PrintWriter fileidsWriter;
110    {
111      File fileids = new File(indexDirectory, "fileids.dat");
112      fileids.createNewFile();
113      fileidsWriter = new PrintWriter(
114        new FileWriter(
115          fileids
116        )
117      );
118    }
119
120    // create config.dat
121    PrintWriter configWriter;
122    {
123      File config = new File(indexDirectory, "config.dat");
124      config.createNewFile();
125      configWriter = new PrintWriter(new FileWriter(config));
126      configWriter.println("namespace\t" + namespace);
127    }
128
129    // create index file
130    {
131      String uniqueName = "key_" + namespace + ".key";
132      File unique = new File(indexDirectory, uniqueName);
133      unique.createNewFile();
134
135      int recordLen =
136        idLength +                                   // id
137        1 +                                          // tab
138        4 +                                          // 9999 files
139        1 +                                          // tab
140        String.valueOf(Long.MAX_VALUE).length() +    // space for any long
141        1 +                                          // tab
142        String.valueOf(Integer.MAX_VALUE).length() + // space for any int
143        "\n".length()                                // new line (os dependant)
144        ;
145
146      indxList = new IndexFileAsList(
147        new RandomAccessFile(unique, "rw"),
148        recordLen
149      );
150
151      fileidsWriter.println(uniqueName + "\t" + recordLen);
152    }
153
154    // other field initialization to get things going
155    fileCount = 0;
156    fileIDToFile = new File[4];
157
158    configWriter.close();
159    fileidsWriter.close();
160  }
161
162  /**
163   * Load an existing index file.
164   *
165   * If indexDirectory does not exist, or is not a bioindex stoore, this will
166   * barf.
167   */
168  public BioIndex(
169    File indexDirectory
170  ) throws IOException, BioException {
171    this.indexDirectory = indexDirectory;
172
173    if(!indexDirectory.exists()) {
174      throw new BioException(
175        "Tried to load non-existant index: " +
176        indexDirectory
177      );
178    }
179
180    // read in the global config
181    {
182      System.out.println("Global");
183      Map config = new HashMap();
184      BufferedReader fi = new BufferedReader(
185        new FileReader(
186          new File(indexDirectory, "config.dat")
187        )
188      );
189      for(String line = fi.readLine(); line != null; line = fi.readLine()) {
190        int tab = line.indexOf("\t");
191        config.put(line.substring(0, tab), line.substring(tab + 1));
192      }
193      String namespace = (String) config.get("namespace");
194      RandomAccessFile indxFile = new RandomAccessFile("key_" + namespace + ".key", "rw");
195      int recLen = guessRecLen(indxFile);
196      indxList = new IndexFileAsList(indxFile, recLen);
197    }
198
199    // set up file set
200    {
201      System.out.println("Files");
202      fileCount = 0;
203      fileIDToFile = new File[4];
204
205      BufferedReader fi = new BufferedReader(
206        new FileReader(
207          new File(indexDirectory, "fileids.dat")
208        )
209      );
210      for(String line = fi.readLine(); line != null; line = fi.readLine()) {
211        StringTokenizer sTok = new StringTokenizer("\t");
212        int id = Integer.parseInt(sTok.nextToken());
213        File file = new File(sTok.nextToken());
214        long fileLength = Long.parseLong(sTok.nextToken());
215
216        if(file.length() != fileLength) {
217          throw new BioException("File length changed: " + file + " "
218          + file.length() + " vs " + fileLength);
219        }
220
221        fileIDToFile[id] = file;
222      }
223    }
224  }
225
226  private File getFileForID(int fileId) {
227    return fileIDToFile[fileId];
228  }
229
230  private int getIDForFile(File file) {
231    // scan list
232    for(int i = 0; i < fileCount; i++) {
233      if(file.equals(fileIDToFile[i])) {
234        return i;
235      }
236    }
237
238    // extend fileIDToFile array
239    if(fileCount >= fileIDToFile.length) {
240      File[] tmp = new File[fileIDToFile.length + 4]; // 4 is magic number
241      System.arraycopy(fileIDToFile, 0, tmp, 0, fileCount);
242      fileIDToFile = tmp;
243    }
244
245    // add the unseen file to the list
246    fileIDToFile[fileCount] = file;
247    return fileCount++;
248  }
249
250  public String getName() {
251    return this.name;
252  }
253
254  public int guessRecLen(RandomAccessFile file)
255  throws IOException {
256    file.seek(0l);
257    int b = 0;
258    while(b != '\n' && b != '\r') {
259      b  = file.read();
260    }
261
262    int offset = (int) file.getFilePointer();
263
264    if(b == '\n') {          // \n
265      return offset + 1;
266    } else {
267      b = file.read();
268      if(b == '\n') {        // \r\n
269        return offset + 2;
270      } else {               // \r
271        return offset + 1;
272      }
273    }
274  }
275
276  public Index fetch(String id)
277  throws IllegalIDException, BioException {
278    int indx = Collections.binarySearch(
279      indxList,
280      id,
281      indxList.getComparator()
282    );
283
284    if(indx < 0) {
285      throw new IllegalIDException("Can't find sequence for " + id);
286    }
287
288    return (Index) indxList.get(indx);
289  }
290
291  public void store(Index indx) {
292    indxList.add(indx);
293  }
294
295  public void commit()
296  throws BioException {
297    indxList.commit();
298    try {
299      // write files
300      {
301        PrintStream fo = new PrintStream(
302          new FileOutputStream(
303            new File(indexDirectory, "fileids.dat")
304          )
305        );
306        for(int i = 0; i < fileCount; i++) {
307          fo.print(i);
308          fo.print('\t');
309          fo.print(fileIDToFile[i]);
310          fo.print('\t');
311          fo.print(fileIDToFile[i].length());
312          fo.println();
313        }
314        fo.close();
315      }
316    } catch (Exception e) {
317      rollback();
318      throw new BioException("Unable to commit. Rolled back to be safe",e);
319    }
320  }
321
322  public void rollback() {
323    indxList.rollback();
324  }
325
326  public Set getIDs() {
327    return idSet;
328  }
329
330  public Set getFiles() {
331    return new HashSet(Arrays.asList(fileIDToFile));
332  }
333
334  public SequenceFormat getFormat() {
335    return format;
336  }
337
338  public SequenceBuilderFactory getSBFactory() {
339    return sbFactory;
340  }
341
342  public SymbolTokenization getSymbolParser() {
343    return symbolTokenization;
344  }
345
346  private interface Commitable {
347    public void commit()
348    throws BioException;
349
350    public void rollback();
351  }
352
353  // records stored as:
354  // seqID(\w+) \t fileID(\w+) \t start(\d+) \t length(\d+) ' ' * \n
355  private abstract class FileAsList
356  extends AbstractList
357    implements /* RandomAccess, */ Commitable {
358    private RandomAccessFile mappedFile;
359    private int commitedRecords;
360    private int lastIndx;
361    private Object lastRec;
362    private byte[] buffer;
363
364    public FileAsList(RandomAccessFile mappedFile, int recordLength) {
365      this.mappedFile = mappedFile;
366      buffer = new byte[recordLength];
367    }
368
369    public Object get(int indx) {
370      if(indx < 0 || indx >= size()) {
371        throw new IndexOutOfBoundsException();
372      }
373
374      if(indx == lastIndx) {
375        return lastRec;
376      }
377
378      long offset = indx * buffer.length;
379      try {
380        mappedFile.seek(offset);
381        mappedFile.readFully(buffer);
382      } catch (IOException ioe) {
383        throw new BioError("Failed to seek for record",ioe);
384      }
385
386      lastRec = parseRecord(buffer);
387      lastIndx = indx;
388      return lastRec;
389    }
390
391    public int size() {
392      try {
393        return (int) (mappedFile.length() / (long) buffer.length);
394      } catch (IOException ioe) {
395        throw new BioError("Can't read file length",ioe);
396      }
397    }
398
399    public boolean add(Object o) {
400      generateRecord(buffer, o);
401
402      try {
403        mappedFile.seek(mappedFile.length());
404        mappedFile.write(buffer);
405      } catch (IOException ioe) {
406        throw new BioError("Failed to write index",ioe);
407      }
408
409      return true;
410    }
411
412    public void commit() {
413      Collections.sort(indxList, indxList.getComparator());
414      commitedRecords = indxList.size();
415    }
416
417    public void rollback() {
418      try {
419        mappedFile.setLength((long) commitedRecords * (long) buffer.length);
420      } catch (Throwable t) {
421        throw new BioError(
422          "Could not roll back. " +
423          "The index store will be in an inconsistent state " +
424          "and should be discarded. File: " + mappedFile, t
425        );
426      }
427    }
428
429    protected abstract Object parseRecord(byte[] buffer);
430    protected abstract void generateRecord(byte[] buffer, Object item);
431    protected abstract Comparator getComparator();
432  }
433
434  private class IndexFileAsList extends FileAsList {
435    private Comparator INDEX_COMPARATOR = new Comparator() {
436      public int compare(Object a, Object b) {
437        String as;
438        String bs;
439
440        if(a instanceof Index) {
441          as = ((Index) a).getID();
442        } else {
443          as = (String) a;
444        }
445
446        if(b instanceof Index) {
447          bs = ((Index) b).getID();
448        } else {
449          bs = (String) b;
450        }
451
452        return STRING_CASE_SENSITIVE_ORDER.compare(as, bs);
453      }
454    };
455
456    public IndexFileAsList(RandomAccessFile file, int recordLength) {
457      super(file, recordLength);
458    }
459
460    protected Object parseRecord(byte[] buffer) {
461      int lastI = 0;
462      int newI = 0;
463      while(buffer[newI] != '\t') {
464        newI++;
465      }
466      String id = new String(buffer, lastI, newI);
467
468      while(buffer[newI] != '\t') {
469        newI++;
470      }
471      File file = getFileForID(Integer.parseInt(new String(buffer, lastI, newI).trim()));
472
473      while(buffer[newI] != '\t') {
474        newI++;
475      }
476      long start = Long.parseLong(new String(buffer, lastI, newI));
477
478      int length = Integer.parseInt(
479        new String(buffer, newI + 1, buffer.length)
480      );
481
482      return new SimpleIndex(file, start, length, id);
483    }
484
485    protected void generateRecord(byte[] buffer, Object item) {
486      Index indx = (Index) item;
487
488      String id = indx.getID();
489      int fileID = getIDForFile(indx.getFile());
490      String start = String.valueOf(indx.getStart());
491      String length = String.valueOf(indx.getLength());
492
493      int i = 0;
494      byte[] str;
495
496      str = id.getBytes();
497      for(int j = 0; j < str.length; j++) {
498        buffer[i++] = str[j];
499      }
500
501      buffer[i++] = '\t';
502
503      str = String.valueOf(fileID).getBytes();
504      for(int j = 0; j < str.length; j++) {
505        buffer[i++] = str[j];
506      }
507
508      buffer[i++] = '\t';
509
510      str = start.getBytes();
511      for(int j = 0; j < str.length; j++) {
512        buffer[i++] = str[j];
513      }
514
515      buffer[i++] = '\t';
516
517      str = length.getBytes();
518      for(int j = 0; j < str.length; j++) {
519        buffer[i++] = str[j];
520      }
521
522      while(i < buffer.length - 1) {
523        buffer[i++] = ' ';
524      }
525
526      buffer[i] = '\n';
527    }
528
529    public Comparator getComparator() {
530      return INDEX_COMPARATOR;
531    }
532  }
533
534  private class ListAsSet
535  extends AbstractSet {
536    public Iterator iterator() {
537      return indxList.iterator();
538    }
539
540    public int size() {
541      return indxList.size();
542    }
543  }
544}