/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.bio.seq.db;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.util.AbstractList;
import java.util.AbstractSet;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;

import org.biojava.bio.BioError;
import org.biojava.bio.BioException;
import org.biojava.bio.seq.io.SequenceBuilderFactory;
import org.biojava.bio.seq.io.SequenceFormat;
import org.biojava.bio.seq.io.SymbolTokenization;

/**
 * The original object for indexing sequence files.
 *
 * <p>An index is a directory containing:</p>
 * <ul>
 *   <li><code>BIOINDEX.dat</code> - written at creation with the single line
 *       <code>index\tflat/1</code>;</li>
 *   <li><code>config.dat</code> - tab-separated key/value lines; the
 *       <code>namespace</code> key names the key file;</li>
 *   <li><code>fileids.dat</code> - at creation a single
 *       <code>keyFileName\trecordLength</code> line; after a commit one
 *       <code>id\tpath\tlength</code> line per indexed file;</li>
 *   <li><code>key_&lt;namespace&gt;.key</code> - fixed-width records of the
 *       form <code>seqID\tfileID\tstart\tlength</code>, space padded and
 *       terminated by a new line.</li>
 * </ul>
 *
 * <p>This class may not be thread-safe.</p>
 *
 * @author Matthew Pocock
 * @author Thomas Down
 */
public class BioIndex implements IndexStore {
  /** Orders strings by their natural (case sensitive) ordering. */
  private static final Comparator STRING_CASE_SENSITIVE_ORDER = new Comparator() {
    public int compare(Object a, Object b) {
      return ((Comparable) a).compareTo(b);
    }
  };

  private File indexDirectory;

  // fileIDToFile[i] is the file with id i; ids at or above fileCount are unused
  private int fileCount = 0;
  private File[] fileIDToFile = new File[4];

  private FileAsList indxList;

  private Set idSet = new ListAsSet();

  // NOTE(review): name, format, sbFactory and symbolTokenization are never
  // assigned anywhere in this class, so their getters return null.
  private String name;
  private SequenceFormat format;
  private SequenceBuilderFactory sbFactory;
  private SymbolTokenization symbolTokenization;

  /**
   * Create a new, empty index in a directory that does not exist yet.
   *
   * @param indexDirectory  the directory to create and populate
   * @param namespace       written to config.dat and used to name the key file
   * @param idLength        the number of bytes reserved for the ID in each
   *                        record
   * @throws IOException   if the directory or any of its files can't be
   *                       created or written
   * @throws BioException  if indexDirectory already exists
   */
  public BioIndex(
    File indexDirectory,
    String namespace,
    int idLength
  ) throws IOException, BioException {
    if(indexDirectory.exists()) {
      throw new BioException(
        "Can't create new index as directory already exists: " +
        indexDirectory
      );
    }

    // remember where we live - commit() writes fileids.dat into this directory
    this.indexDirectory = indexDirectory;

    // create directory
    if(!indexDirectory.mkdirs()) {
      throw new IOException(
        "Could not create index directory: " + indexDirectory
      );
    }

    // create BIOINDEX.dat
    writeLine(new File(indexDirectory, "BIOINDEX.dat"), "index\tflat/1");

    // create config.dat
    writeLine(new File(indexDirectory, "config.dat"), "namespace\t" + namespace);

    // create index file
    String uniqueName = "key_" + namespace + ".key";
    File unique = new File(indexDirectory, uniqueName);
    unique.createNewFile();

    int recordLen =
      idLength +                                        // id
      1 +                                               // tab
      4 +                                               // 9999 files
      1 +                                               // tab
      String.valueOf(Long.MAX_VALUE).length() +         // space for any long
      1 +                                               // tab
      String.valueOf(Integer.MAX_VALUE).length() +      // space for any int
      "\n".length()                                     // new line
    ;

    indxList = new IndexFileAsList(
      new RandomAccessFile(unique, "rw"),
      recordLen
    );

    // create fileids.dat, recording the record length of the key file so that
    // an index holding no records yet can still be re-opened
    writeLine(
      new File(indexDirectory, "fileids.dat"),
      uniqueName + "\t" + recordLen
    );
  }

  /**
   * Load an existing index file.
   *
   * If indexDirectory does not exist, or is not a bioindex store, this will
   * barf.
   *
   * @param indexDirectory  the directory holding the index
   * @throws IOException   if any of the index files can't be read
   * @throws BioException  if the directory is missing, a control file is
   *                       malformed, an indexed file has changed length or
   *                       the record length can't be determined
   */
  public BioIndex(
    File indexDirectory
  ) throws IOException, BioException {
    this.indexDirectory = indexDirectory;

    if(!indexDirectory.exists()) {
      throw new BioException(
        "Tried to load non-existant index: " +
        indexDirectory
      );
    }

    // read in the global config
    String namespace = readNamespace(new File(indexDirectory, "config.dat"));
    String keyFileName = "key_" + namespace + ".key";

    // set up file set
    int headerRecLen = readFileIDs(
      new File(indexDirectory, "fileids.dat"),
      keyFileName
    );

    // open the key file - it lives inside the index directory
    RandomAccessFile indxFile = new RandomAccessFile(
      new File(indexDirectory, keyFileName),
      "rw"
    );
    boolean opened = false;
    try {
      int recLen;
      if(indxFile.length() > 0L) {
        recLen = guessRecLen(indxFile);
      } else if(headerRecLen > 0) {
        // no records to measure - fall back to the length noted at creation
        recLen = headerRecLen;
      } else {
        throw new BioException(
          "Can't determine the record length of empty index file: " +
          new File(indexDirectory, keyFileName)
        );
      }
      indxList = new IndexFileAsList(indxFile, recLen);
      opened = true;
    } finally {
      if(!opened) {
        indxFile.close();
      }
    }
  }

  /**
   * Write a single line of text to a file, replacing any previous content.
   */
  private static void writeLine(File target, String line)
  throws IOException {
    PrintWriter pw = new PrintWriter(new FileWriter(target));
    try {
      pw.println(line);
      // PrintWriter swallows IOExceptions - ask it explicitly
      if(pw.checkError()) {
        throw new IOException("Failed to write " + target);
      }
    } finally {
      pw.close();
    }
  }

  /**
   * Read the tab-separated key/value pairs in config.dat and return the value
   * stored under "namespace".
   */
  private static String readNamespace(File configFile)
  throws IOException, BioException {
    Map config = new HashMap();
    BufferedReader fi = new BufferedReader(new FileReader(configFile));
    try {
      for(String line = fi.readLine(); line != null; line = fi.readLine()) {
        if(line.length() == 0) {
          continue;
        }
        int tab = line.indexOf('\t');
        if(tab < 0) {
          throw new BioException(
            "Malformed line in " + configFile + ": " + line
          );
        }
        config.put(line.substring(0, tab), line.substring(tab + 1));
      }
    } finally {
      fi.close();
    }

    String namespace = (String) config.get("namespace");
    if(namespace == null) {
      throw new BioException("No namespace entry in " + configFile);
    }
    return namespace;
  }

  /**
   * Populate the file table from fileids.dat.
   *
   * Lines of the form id\tpath\tlength (as written by commit()) register a
   * file. The keyFileName\trecordLength line written when an index is created
   * is recognised and its record length returned.
   *
   * @return the record length from the creation-time line, or -1 if the file
   *         has no such line
   */
  private int readFileIDs(File fileidsFile, String keyFileName)
  throws IOException, BioException {
    int headerRecLen = -1;
    fileCount = 0;
    fileIDToFile = new File[4];

    BufferedReader fi = new BufferedReader(new FileReader(fileidsFile));
    try {
      for(String line = fi.readLine(); line != null; line = fi.readLine()) {
        if(line.length() == 0) {
          continue;
        }

        StringTokenizer sTok = new StringTokenizer(line, "\t");
        int fields = sTok.countTokens();
        try {
          if(fields == 2) {
            String first = sTok.nextToken();
            if(!first.equals(keyFileName)) {
              throw new BioException(
                "Malformed line in " + fileidsFile + ": " + line
              );
            }
            headerRecLen = Integer.parseInt(sTok.nextToken().trim());
            continue;
          }

          if(fields != 3) {
            throw new BioException(
              "Malformed line in " + fileidsFile + ": " + line
            );
          }

          int id = Integer.parseInt(sTok.nextToken());
          File file = new File(sTok.nextToken());
          long fileLength = Long.parseLong(sTok.nextToken());

          if(id < 0) {
            throw new BioException(
              "Malformed line in " + fileidsFile + ": " + line
            );
          }

          if(file.length() != fileLength) {
            throw new BioException("File length changed: " + file + " "
                                   + file.length() + " vs " + fileLength);
          }

          registerFile(id, file);
        } catch (NumberFormatException nfe) {
          throw new BioException(
            "Malformed line in " + fileidsFile + ": " + line, nfe
          );
        }
      }
    } finally {
      fi.close();
    }

    return headerRecLen;
  }

  /**
   * Store a file under an explicit id, growing the table if needed and keeping
   * fileCount one past the highest id in use.
   */
  private void registerFile(int id, File file) {
    if(id >= fileIDToFile.length) {
      File[] tmp = new File[id + 4]; // 4 is magic number
      System.arraycopy(fileIDToFile, 0, tmp, 0, fileIDToFile.length);
      fileIDToFile = tmp;
    }

    fileIDToFile[id] = file;
    if(id >= fileCount) {
      fileCount = id + 1;
    }
  }

  /**
   * Look up the file registered under an id.
   */
  private File getFileForID(int fileId) {
    return fileIDToFile[fileId];
  }

  /**
   * Find the id of a file, allocating the next free id if the file has not
   * been seen before.
   */
  private int getIDForFile(File file) {
    // scan list
    for(int i = 0; i < fileCount; i++) {
      if(file.equals(fileIDToFile[i])) {
        return i;
      }
    }

    // extend fileIDToFile array
    if(fileCount >= fileIDToFile.length) {
      File[] tmp = new File[fileIDToFile.length + 4]; // 4 is magic number
      System.arraycopy(fileIDToFile, 0, tmp, 0, fileCount);
      fileIDToFile = tmp;
    }

    // add the unseen file to the list
    fileIDToFile[fileCount] = file;
    return fileCount++;
  }

  /**
   * Returns the name field.
   *
   * NOTE(review): nothing in this class assigns the field, so this currently
   * returns null - confirm what the store name should be.
   */
  public String getName() {
    return this.name;
  }

  /**
   * Work out the record length of an index file by measuring its first record,
   * including the line terminator (\n, \r\n or \r).
   *
   * @param file  the index file; its file pointer is moved
   * @return the number of bytes in the first record, terminator included
   * @throws IOException  if the file can't be read, or ends before any line
   *                      terminator is found
   */
  public int guessRecLen(RandomAccessFile file)
  throws IOException {
    file.seek(0L);
    int b = file.read();
    while(b != '\n' && b != '\r' && b != -1) {
      b = file.read();
    }

    if(b == -1) {
      // without this check an empty or unterminated file would spin forever
      throw new IOException(
        "No record terminator found, can't guess record length"
      );
    }

    // the file pointer is already one past the terminator just read, so it
    // equals the number of bytes consumed so far
    int offset = (int) file.getFilePointer();

    if(b == '\r' && file.read() == '\n') { // \r\n
      return offset + 1;
    }

    return offset; // \n or lone \r
  }

  /**
   * Find the index entry for an ID by binary search over the index file.
   *
   * @param id  the ID to look up
   * @return the matching Index
   * @throws IllegalIDException  if the search finds no record for the ID
   */
  public Index fetch(String id)
  throws IllegalIDException, BioException {
    int indx = Collections.binarySearch(
      indxList,
      id,
      indxList.getComparator()
    );

    if(indx < 0) {
      throw new IllegalIDException("Can't find sequence for " + id);
    }

    return (Index) indxList.get(indx);
  }

  /**
   * Append an index entry to the index file. Records are only sorted by
   * commit().
   */
  public void store(Index indx) {
    indxList.add(indx);
  }

  /**
   * Sort the index file and rewrite fileids.dat with the current file table.
   *
   * @throws BioException  if fileids.dat could not be written; rollback() is
   *                       called first
   */
  public void commit()
  throws BioException {
    indxList.commit();
    try {
      // write files
      PrintStream fo = new PrintStream(
        new FileOutputStream(
          new File(indexDirectory, "fileids.dat")
        )
      );
      try {
        for(int i = 0; i < fileCount; i++) {
          File file = fileIDToFile[i];
          if(file == null) {
            // ids loaded from disk need not be contiguous
            continue;
          }
          fo.print(i);
          fo.print('\t');
          fo.print(file);
          fo.print('\t');
          fo.print(file.length());
          fo.println();
        }
        // PrintStream swallows IOExceptions - ask it explicitly
        if(fo.checkError()) {
          throw new IOException(
            "Failed to write fileids.dat in " + indexDirectory
          );
        }
      } finally {
        fo.close();
      }
    } catch (Exception e) {
      rollback();
      throw new BioException("Unable to commit. Rolled back to be safe",e);
    }
  }

  /**
   * Truncate the index file back to the records present at the last commit.
   */
  public void rollback() {
    indxList.rollback();
  }

  /**
   * Returns a live view of the IDs of all records in the index file.
   */
  public Set getIDs() {
    return idSet;
  }

  /**
   * Returns a new set holding every file currently in the file table.
   */
  public Set getFiles() {
    Set files = new HashSet();
    // only the first fileCount slots are in use, and loaded ids may leave gaps
    for(int i = 0; i < fileCount; i++) {
      if(fileIDToFile[i] != null) {
        files.add(fileIDToFile[i]);
      }
    }
    return files;
  }

  /** Returns the format field (never assigned in this class). */
  public SequenceFormat getFormat() {
    return format;
  }

  /** Returns the sbFactory field (never assigned in this class). */
  public SequenceBuilderFactory getSBFactory() {
    return sbFactory;
  }

  /** Returns the symbolTokenization field (never assigned in this class). */
  public SymbolTokenization getSymbolParser() {
    return symbolTokenization;
  }

  /** Something whose pending changes can be made permanent or discarded. */
  private interface Commitable {
    public void commit()
    throws BioException;

    public void rollback();
  }

  // records stored as:
  // seqID(\w+) \t fileID(\w+) \t start(\d+) \t length(\d+) ' ' * \n
  /**
   * A List view over a file of fixed-length records. Element i is the record
   * at byte offset i * recordLength.
   */
  private abstract class FileAsList
  extends AbstractList
  implements /* RandomAccess, */ Commitable {
    private RandomAccessFile mappedFile;
    private int commitedRecords;
    // one-element read cache; -1 means nothing is cached
    private int lastIndx = -1;
    private Object lastRec;
    // scratch space shared by reads and writes, exactly one record long
    private byte[] buffer;

    public FileAsList(RandomAccessFile mappedFile, int recordLength) {
      if(recordLength <= 0) {
        throw new IllegalArgumentException(
          "Record length must be positive: " + recordLength
        );
      }
      this.mappedFile = mappedFile;
      buffer = new byte[recordLength];
      // whatever is on disk when the file is opened is the rollback point
      commitedRecords = size();
    }

    public Object get(int indx) {
      if(indx < 0 || indx >= size()) {
        throw new IndexOutOfBoundsException();
      }

      if(indx == lastIndx) {
        return lastRec;
      }

      // widen before multiplying so large files don't overflow an int
      long offset = (long) indx * (long) buffer.length;
      try {
        mappedFile.seek(offset);
        mappedFile.readFully(buffer);
      } catch (IOException ioe) {
        throw new BioError("Failed to seek for record",ioe);
      }

      lastRec = parseRecord(buffer);
      lastIndx = indx;
      return lastRec;
    }

    /**
     * Overwrite the record at indx in place. Collections.sort() needs this
     * to write the sorted records back.
     */
    public Object set(int indx, Object o) {
      Object previous = get(indx); // also performs the bounds check

      generateRecord(buffer, o);

      try {
        mappedFile.seek((long) indx * (long) buffer.length);
        mappedFile.write(buffer);
      } catch (IOException ioe) {
        throw new BioError("Failed to write index",ioe);
      }

      lastIndx = -1; // the cached record has been replaced
      return previous;
    }

    public int size() {
      try {
        return (int) (mappedFile.length() / (long) buffer.length);
      } catch (IOException ioe) {
        throw new BioError("Can't read file length",ioe);
      }
    }

    public boolean add(Object o) {
      generateRecord(buffer, o);

      try {
        mappedFile.seek(mappedFile.length());
        mappedFile.write(buffer);
      } catch (IOException ioe) {
        throw new BioError("Failed to write index",ioe);
      }

      return true;
    }

    public void commit() {
      Collections.sort(this, getComparator());
      commitedRecords = size();
    }

    public void rollback() {
      try {
        mappedFile.setLength((long) commitedRecords * (long) buffer.length);
        lastIndx = -1; // the cached record may have been truncated away
      } catch (IOException ioe) {
        throw new BioError(
          "Could not roll back. " +
          "The index store will be in an inconsistent state " +
          "and should be discarded. File: " + mappedFile, ioe
        );
      }
    }

    /** Turn one record's worth of bytes into an object. */
    protected abstract Object parseRecord(byte[] buffer);

    /** Fill buffer with the complete record for item. */
    protected abstract void generateRecord(byte[] buffer, Object item);

    /** The ordering used to sort and search the records. */
    protected abstract Comparator getComparator();
  }

  /**
   * The key file: records are Index objects ordered by ID.
   */
  private class IndexFileAsList extends FileAsList {
    // compares by ID; either argument may be an Index or a bare ID String so
    // that fetch() can search for a String key
    private final Comparator INDEX_COMPARATOR = new Comparator() {
      public int compare(Object a, Object b) {
        String as;
        String bs;

        if(a instanceof Index) {
          as = ((Index) a).getID();
        } else {
          as = (String) a;
        }

        if(b instanceof Index) {
          bs = ((Index) b).getID();
        } else {
          bs = (String) b;
        }

        return STRING_CASE_SENSITIVE_ORDER.compare(as, bs);
      }
    };

    public IndexFileAsList(RandomAccessFile file, int recordLength) {
      super(file, recordLength);
    }

    /**
     * Parse id \t fileID \t start \t length, ignoring the trailing padding
     * and new line.
     */
    protected Object parseRecord(byte[] buffer) {
      String record = new String(buffer);
      try {
        int idEnd = record.indexOf('\t');
        int fileEnd = record.indexOf('\t', idEnd + 1);
        int startEnd = record.indexOf('\t', fileEnd + 1);

        // a missing tab yields -1 and makes substring throw, which is
        // reported as a malformed record below
        String id = record.substring(0, idEnd);
        File file = getFileForID(
          Integer.parseInt(record.substring(idEnd + 1, fileEnd).trim())
        );
        long start = Long.parseLong(
          record.substring(fileEnd + 1, startEnd).trim()
        );
        int length = Integer.parseInt(
          record.substring(startEnd + 1).trim()
        );

        return new SimpleIndex(file, start, length, id);
      } catch (NumberFormatException nfe) {
        throw new BioError("Malformed index record: " + record, nfe);
      } catch (IndexOutOfBoundsException ioobe) {
        throw new BioError("Malformed index record: " + record, ioobe);
      }
    }

    /**
     * Write id \t fileID \t start \t length into buffer, pad with spaces and
     * finish with a new line.
     *
     * @throws IllegalArgumentException if the fields don't fit in one record
     */
    protected void generateRecord(byte[] buffer, Object item) {
      Index indx = (Index) item;

      String id = indx.getID();
      int fileID = getIDForFile(indx.getFile());

      byte[] str = (
        id + "\t" + fileID + "\t" + indx.getStart() + "\t" + indx.getLength()
      ).getBytes();

      // the last byte of every record is reserved for the new line
      if(str.length > buffer.length - 1) {
        throw new IllegalArgumentException(
          "Index record for " + id + " needs " + (str.length + 1) +
          " bytes but records are " + buffer.length + " bytes long"
        );
      }

      System.arraycopy(str, 0, buffer, 0, str.length);

      for(int i = str.length; i < buffer.length - 1; i++) {
        buffer[i] = ' ';
      }

      buffer[buffer.length - 1] = '\n';
    }

    public Comparator getComparator() {
      return INDEX_COMPARATOR;
    }
  }

  /**
   * Read-only Set view of the IDs of the records in indxList.
   */
  private class ListAsSet
  extends AbstractSet {
    public Iterator iterator() {
      final Iterator records = indxList.iterator();
      // hand out the ID of each record rather than the record itself
      return new Iterator() {
        public boolean hasNext() {
          return records.hasNext();
        }

        public Object next() {
          return ((Index) records.next()).getID();
        }

        public void remove() {
          throw new UnsupportedOperationException();
        }
      };
    }

    public int size() {
      return indxList.size();
    }
  }
}