/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */

package org.biojava.bio.seq.db;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.biojava.bio.BioException;
import org.biojava.bio.seq.db.emblcd.DivisionLkpReader;
import org.biojava.bio.seq.db.emblcd.EmblCDROMIndexReader;
import org.biojava.bio.seq.db.emblcd.EmblCDROMRandomAccess;
import org.biojava.bio.seq.db.emblcd.EntryNamIdxReader;
import org.biojava.bio.seq.db.emblcd.EntryNamRandomAccess;
import org.biojava.bio.seq.io.SequenceBuilderFactory;
import org.biojava.bio.seq.io.SequenceFormat;
import org.biojava.bio.seq.io.SymbolTokenization;

/**
 * <p><code>EmblCDROMIndexStore</code>s implement a read-only
 * <code>IndexStore</code> backed by EMBL CD-ROM format binary
 * indices. The required index files are typically named
 * "division.lkp" and "entrynam.idx". As an <code>IndexStore</code>
 * performs lookups by sequence ID, the index files "acnum.trg" and
 * "acnum.hit" (which store additional accession number data) are not
 * used.</p>
 *
 * <p>Individual sequence IDs are looked up through a random access
 * view of the ID index file. The whole file is not read unless a
 * request for all the IDs is made using the <code>getIDs()</code>
 * method. The set of IDs is cached once it has been read
 * completely. This class also has a <code>close()</code> method to
 * free the resources associated with the random access view.</p>
 *
 * <p>The binary index files may be created using the EMBOSS programs
 * dbifasta, dbiblast, dbiflat or dbigcg. The least useful from the
 * BioJava perspective is dbigcg because we do not have a
 * <code>SequenceFormat</code> implementation for GCG format
 * files.</p>
 *
 * <p>The <code>Index</code> instances returned by this class do not
 * have the record length set because this information is not
 * available in the binary index. The value -1 is used instead, as
 * described in the <code>Index</code> interface.</p>
 *
 * @author Keith James
 * @since 1.2
 */
public class EmblCDROMIndexStore implements IndexStore
{
    private File divisionLkp;
    private File entryNamIdx;

    // Optional path prefix to prepend to the filename(s) extracted
    // from the binary indices
    private File pathPrefix;

    private SequenceFormat         format;
    private SequenceBuilderFactory factory;
    private SymbolTokenization     parser;

    // Maps the file numbers used in the indices to the real file names
    private Map seqFiles;
    // Set view of file names
    private Set fileSet;
    // Lazily instantiated if someone asks for all the IDs at once;
    // only ever assigned a completely populated set
    private Set seqIds;
    // The database name defined in the index header
    private String name;

    // Details of the master index records
    private long divRecordCount;
    // Details of the ID/offset records
    private int  entryRecordLength;
    private long entryRecordCount;

    // The random access file containing ID/offset records
    private EmblCDROMRandomAccess entryRandomAccess;

    /**
     * Creates a new <code>EmblCDROMIndexStore</code> backed by a
     * random access binary index. The path prefix is set to the
     * empty abstract path.
     *
     * @param divisionLkp a <code>File</code> containing the master
     * index.
     * @param entryNamIdx a <code>File</code> containing the sequence
     * IDs and offsets.
     * @param format a <code>SequenceFormat</code>.
     * @param factory a <code>SequenceBuilderFactory</code>.
     * @param parser a <code>SymbolTokenization</code>.
     *
     * @exception IOException if either index file cannot be found
     * or read.
     */
    public EmblCDROMIndexStore(File                   divisionLkp,
                               File                   entryNamIdx,
                               SequenceFormat         format,
                               SequenceBuilderFactory factory,
                               SymbolTokenization     parser)
        throws IOException
    {
        // NOTE(review): java.io.File documents that a child resolved
        // against an empty parent is resolved against a
        // system-dependent default directory - confirm that this is
        // the intended default for relative file names in the index.
        this(new File(""), divisionLkp, entryNamIdx,
             format, factory, parser);
    }

    /**
     * Creates a new <code>EmblCDROMIndexStore</code> backed by a
     * random access binary index.
     *
     * @param pathPrefix a <code>File</code> containing the abstract
     * path to be prepended to sequence database filenames retrieved
     * from the binary index.
     * @param divisionLkp a <code>File</code> containing the master
     * index.
     * @param entryNamIdx a <code>File</code> containing the sequence
     * IDs and offsets.
     * @param format a <code>SequenceFormat</code>.
     * @param factory a <code>SequenceBuilderFactory</code>.
     * @param parser a <code>SymbolTokenization</code>.
     *
     * @exception IOException if either index file cannot be found
     * or read.
     */
    public EmblCDROMIndexStore(File                   pathPrefix,
                               File                   divisionLkp,
                               File                   entryNamIdx,
                               SequenceFormat         format,
                               SequenceBuilderFactory factory,
                               SymbolTokenization     parser)
        throws IOException
    {
        this.divisionLkp = divisionLkp;
        this.entryNamIdx = entryNamIdx;
        this.format      = format;
        this.factory     = factory;
        this.parser      = parser;
        this.pathPrefix  = pathPrefix;

        initialise();
    }

    /**
     * <code>getPathPrefix</code> returns the abstract path currently
     * being prepended to the raw sequence database filenames
     * extracted from the binary index.
     *
     * @return a <code>File</code>.
     */
    public File getPathPrefix()
    {
        return pathPrefix;
    }

    /**
     * <code>setPathPrefix</code> sets the abstract path to be
     * prepended to sequence database filenames retrieved from the
     * binary index. E.g. if the binary index records the file name
     * "swall" and the <code>pathPrefix</code> is set to
     * "/usr/local/share/data/seq/", then <code>Index</code>
     * instances subsequently produced by <code>fetch</code> are
     * created with the path "/usr/local/share/data/seq/swall".
     *
     * @param pathPrefix a <code>File</code> prefix specifying the
     * abstract path to prepend.
     */
    public void setPathPrefix(File pathPrefix)
    {
        this.pathPrefix = pathPrefix;
    }

    /**
     * <code>getName</code> returns the database name as defined
     * within the EMBL CD-ROM index.
     *
     * @return a <code>String</code> value.
     */
    public String getName()
    {
        return name;
    }

    /**
     * <code>store</code> adds an <code>Index</code> to the store. As
     * EMBL CD-ROM indices are read-only, this implementation always
     * throws a <code>BioException</code>.
     *
     * @param index an <code>Index</code>.
     *
     * @exception IllegalIDException never thrown by this
     * implementation.
     * @exception BioException always, as the store is read-only.
     */
    public void store(Index index)
        throws IllegalIDException, BioException
    {
        throw new BioException("Failed to add Index: store is read-only."
                               + " To add sequences use the dbi programs"
                               + " supplied in EMBOSS");
    }

    /**
     * <code>commit</code> commits changes. As EMBL CD-ROM indices are
     * read-only, this implementation always throws a
     * <code>BioException</code>.
     *
     * @exception BioException always, as the store is read-only.
     */
    public void commit() throws BioException
    {
        throw new BioException("Failed to commit: store is read-only."
                               + " To add sequences use the dbi programs"
                               + " supplied in EMBOSS");
    }

    /**
     * <code>rollback</code> rolls back changes made since the last
     * <code>commit</code>. As EMBL CD-ROM indices are read-only, this
     * implementation does nothing.
     */
    public void rollback() { }

    /**
     * <code>fetch</code> looks up a single sequence ID in the random
     * access ID index and returns an <code>Index</code> describing
     * where the sequence lives. The file of the returned
     * <code>Index</code> is the indexed file name resolved against
     * the current path prefix and its record length is -1.
     *
     * @param id a <code>String</code> sequence ID.
     *
     * @return an <code>Index</code>.
     *
     * @exception IllegalIDException if the ID is not present in the
     * index.
     * @exception BioException if the index could not be read, or if
     * the record refers to a file number which is absent from the
     * master index.
     */
    public Index fetch(String id) throws IllegalIDException, BioException
    {
        try
        {
            Object [] enRecord = entryRandomAccess.findRecord(id);

            // An empty record signals that the ID was not found
            if (enRecord.length == 0)
                throw new IllegalIDException("Failed to find ID: " + id);

            // Element 3 is the file number used as key into the
            // number->name map built from the master index
            Integer fileNumber = (Integer) enRecord[3];
            String    fileName = (String) seqFiles.get(fileNumber);

            // Fail with a meaningful message rather than letting
            // File(File, String) throw a bare NullPointerException
            if (fileName == null)
                throw new BioException("Failed to retrieve index for ID: "
                                       + id
                                       + " (unknown file number "
                                       + fileNumber
                                       + ")");

            // Element 1 is passed on as the start offset of the record
            long offset = ((Long) enRecord[1]).longValue();

            // Prepend current pathPrefix
            return new SimpleIndex(new File(pathPrefix, fileName),
                                   offset, -1, id);
        }
        catch (IOException ioe)
        {
            BioException be =
                new BioException("Failed to retrieve index for ID: " + id);
            // Keep the underlying I/O failure available to callers
            be.initCause(ioe);
            throw be;
        }
    }

    /**
     * <code>getIDs</code> returns all the sequence IDs in the index.
     * The first successful call reads the whole ID index file and
     * caches the result; later calls return the cached set. If the
     * file cannot be found or read, the failure is reported on
     * <code>System.err</code>, the IDs read so far (possibly none)
     * are returned and nothing is cached, so that the next call
     * tries again.
     *
     * @return an unmodifiable <code>Set</code> of <code>String</code>
     * IDs.
     */
    public Set getIDs()
    {
        if (seqIds == null)
        {
            Set ids = new HashSet((int) entryRecordCount);
            boolean complete = false;

            BufferedInputStream bis = null;

            try
            {
                bis =
                    new BufferedInputStream(new FileInputStream(entryNamIdx));
                EmblCDROMIndexReader ent = new EntryNamIdxReader(bis);

                for (long i = 0; i < entryRecordCount; i++)
                {
                    Object [] enRecord = ent.readRecord();
                    ids.add((String) enRecord[0]);
                }

                complete = true;
            }
            catch (FileNotFoundException fnfe)
            {
                System.err.println("Failed to find file "
                                   + entryNamIdx.getName());
                fnfe.printStackTrace();
            }
            catch (IOException ioe)
            {
                System.err.println("Failed to read file "
                                   + entryNamIdx.getName());
                ioe.printStackTrace();
            }
            finally
            {
                closeStream(bis, entryNamIdx);
            }

            // Never cache a partial result: hand back what was read
            // and let the next call retry the full pass
            if (! complete)
                return Collections.unmodifiableSet(ids);

            seqIds = ids;
        }

        return Collections.unmodifiableSet(seqIds);
    }

    /**
     * <code>getFiles</code> returns the sequence file names listed in
     * the master index, without the path prefix applied.
     *
     * @return an unmodifiable <code>Set</code> of <code>String</code>
     * file names.
     */
    public Set getFiles()
    {
        return Collections.unmodifiableSet(fileSet);
    }

    /**
     * <code>getFormat</code> returns the format supplied to the
     * constructor.
     *
     * @return a <code>SequenceFormat</code>.
     */
    public SequenceFormat getFormat()
    {
        return format;
    }

    /**
     * <code>getSBFactory</code> returns the builder factory supplied
     * to the constructor.
     *
     * @return a <code>SequenceBuilderFactory</code>.
     */
    public SequenceBuilderFactory getSBFactory()
    {
        return factory;
    }

    /**
     * <code>getSymbolParser</code> returns the tokenization supplied
     * to the constructor.
     *
     * @return a <code>SymbolTokenization</code>.
     */
    public SymbolTokenization getSymbolParser()
    {
        return parser;
    }

    /**
     * <code>close</code> closes the underlying
     * <code>EntryNamRandomAccess</code> which in turn closes the
     * lower level <code>RandomAccessFile</code>. This frees the
     * resources associated with the file.
     *
     * @exception IOException if an error occurs.
     */
    public void close() throws IOException
    {
        entryRandomAccess.close();
    }

    /**
     * <code>initialise</code> reads the headers of the index files to
     * obtain data about the record sizes and counts, database name
     * and sequence filenames. It then opens a random access file to
     * the ID index for lookups.
     *
     * @exception IOException if an error occurs.
     */
    private void initialise() throws IOException
    {
        readDivisionLkp();
        readEntryNamIdxHeader();

        // Try to set up random access file
        try
        {
            // NOTE(review): 300 is presumably the index header
            // length in bytes - confirm against EntryNamRandomAccess
            entryRandomAccess = new EntryNamRandomAccess(entryNamIdx,
                                                         300,
                                                         entryRecordLength,
                                                         entryRecordCount);
        }
        catch (FileNotFoundException fnfe)
        {
            System.err.println("Failed to find file "
                               + entryNamIdx.getName());
            // Rethrow
            throw fnfe;
        }
    }

    /**
     * <code>readDivisionLkp</code> reads the master index to obtain
     * the database name and the file number to file name mapping.
     *
     * @exception IOException if the file cannot be found or read.
     */
    private void readDivisionLkp() throws IOException
    {
        BufferedInputStream bis = null;

        try
        {
            bis = new BufferedInputStream(new FileInputStream(divisionLkp));
            EmblCDROMIndexReader div = new DivisionLkpReader(bis);

            divRecordCount = div.readRecordCount();

            // The database name is the same in all the index headers
            name = div.readDBName();

            seqFiles = new HashMap((int) divRecordCount);

            // Store the file number->name mapping
            for (long i = 0; i < divRecordCount; i++)
            {
                Object [] divRecord = div.readRecord();

                Integer fileNumber = (Integer) divRecord[0];
                String    fileName = (String)  divRecord[1];

                seqFiles.put(fileNumber, fileName);
            }

            // Keep a Set view
            fileSet = new HashSet((int) divRecordCount);
            fileSet.addAll(seqFiles.values());
        }
        catch (FileNotFoundException fnfe)
        {
            System.err.println("Failed to find file "
                               + divisionLkp.getName());
            // Rethrow
            throw fnfe;
        }
        catch (IOException ioe)
        {
            System.err.println("Failed to read file "
                               + divisionLkp.getName());
            // Rethrow
            throw ioe;
        }
        finally
        {
            closeStream(bis, divisionLkp);
        }
    }

    /**
     * <code>readEntryNamIdxHeader</code> reads the record length and
     * record count from the sequence ID index.
     *
     * @exception IOException if the file cannot be found or read.
     */
    private void readEntryNamIdxHeader() throws IOException
    {
        BufferedInputStream bis = null;

        try
        {
            bis = new BufferedInputStream(new FileInputStream(entryNamIdx));
            EmblCDROMIndexReader ent = new EntryNamIdxReader(bis);

            entryRecordLength = ent.readRecordLength();
            entryRecordCount  = ent.readRecordCount();
        }
        catch (FileNotFoundException fnfe)
        {
            System.err.println("Failed to find file "
                               + entryNamIdx.getName());
            // Rethrow
            throw fnfe;
        }
        catch (IOException ioe)
        {
            System.err.println("Failed to read file "
                               + entryNamIdx.getName());
            // Rethrow
            throw ioe;
        }
        finally
        {
            closeStream(bis, entryNamIdx);
        }
    }

    /**
     * <code>closeStream</code> closes a stream if it was opened,
     * reporting (but not propagating) any failure to do so. It is
     * intended for use in <code>finally</code> clauses, where a
     * failure to close must not mask the original exception.
     *
     * @param bis a <code>BufferedInputStream</code>, which may be
     * null if the file was never opened.
     * @param source the <code>File</code> the stream was reading,
     * used only in the error message.
     */
    private static void closeStream(BufferedInputStream bis, File source)
    {
        // File was never opened, so don't try to close it
        if (bis == null)
            return;

        try
        {
            bis.close();
        }
        catch (IOException ioe)
        {
            System.err.println("Failed to close input stream from file "
                               + source.getName());
        }
    }
}