001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.seq.db; 023 024import java.io.BufferedReader; 025import java.io.File; 026import java.io.FileReader; 027import java.io.IOException; 028import java.io.RandomAccessFile; 029import java.io.Serializable; 030import java.util.Iterator; 031import java.util.Set; 032import java.util.Vector; 033 034import org.biojava.bio.BioException; 035import org.biojava.bio.seq.Sequence; 036import org.biojava.bio.seq.SequenceIterator; 037import org.biojava.bio.seq.io.SequenceBuilder; 038import org.biojava.bio.seq.io.SequenceBuilderFactory; 039import org.biojava.bio.seq.io.SequenceBuilderFilter; 040import org.biojava.bio.seq.io.SequenceFormat; 041import org.biojava.bio.seq.io.SymbolTokenization; 042import org.biojava.bio.symbol.Alphabet; 043import org.biojava.bio.symbol.Symbol; 044import org.biojava.utils.ChangeEvent; 045import org.biojava.utils.ChangeSupport; 046import org.biojava.utils.ChangeVetoException; 047import org.biojava.utils.ParseErrorListener; 048import org.biojava.utils.ParseErrorSource; 049import org.biojava.utils.io.CountedBufferedReader; 050import org.biojava.utils.io.RandomAccessReader; 051 052/** 053 * <p> 054 * This class implements SequenceDB on top of a set of sequence files 055 * and sequence offsets within these files. 056 * </p> 057 * 058 * <p> This class is primarily responsible for managing the sequence 059 * IO, such as calculating the sequence file offsets, and parsing 060 * individual sequences based upon file offsets. The actual persistant 061 * storage of all this information is delegated to an instance of 062 * <code>IndexStore</code>, such as TabIndexStore. 063 * </p> 064 * 065 * <pre> 066 * // create a new index store and populate it 067 * // this may take some time 068 * TabIndexStore indexStore = new TabIndexStore( 069 * storeFile, indexFile, dbName, 070 * format, sbFactory, symbolParser ); 071 * IndexedSequenceDB seqDB = new IndexedSequenceDB(indexStore); 072 * 073 * for(int i = 0; i < files; i++) { 074 * seqDB.addFile(files[i]); 075 * } 076 * 077 * // load an existing index store and fetch a sequence 078 * // this should be quite quick 079 * TabIndexStore indexStore = TabIndexStore.open(storeFile); 080 * SequenceDB seqDB = new IndexedSequenceDB(indexStore); 081 * Sequence seq = seqDB.getSequence(id); 082 * </pre> 083 * 084 * <p> 085 * Note: We may be able to improve the indexing speed further by 086 * discarding all feature creation & annotation requests during index 087 * parsing. 088 * </p> 089 * 090 * @author Matthew Pocock 091 * @author Thomas Down 092 * @author Keith James 093 * @see org.biojava.bio.seq.db.TabIndexStore 094 */ 095 096public final class IndexedSequenceDB extends AbstractSequenceDB 097 implements SequenceDB, Serializable 098{ 099 private final IDMaker idMaker; 100 private final IndexStore indexStore; 101 102 /** 103 * Create an IndexedSequenceDB by specifying both the IDMaker and 104 * IndexStore used. 105 * 106 * <p> 107 * The IDMaker will be used to calculate the ID for each 108 * Sequence. It will delegate the storage and retrieval of the 109 * sequence offsets to the IndexStore. 110 * 111 * @param idMaker the IDMaker used to calculate Sequence IDs 112 * @param indexStore the IndexStore delegate 113 */ 114 public IndexedSequenceDB(IDMaker idMaker, IndexStore indexStore) { 115 this.idMaker = idMaker; 116 this.indexStore = indexStore; 117 } 118 119 /** 120 * Create an IndexedSequenceDB by specifying IndexStore used. 121 * 122 * <p> 123 * IDMaker.byName will be used to calculate the ID for each 124 * Sequence. It will delegate the storage and retrieval of the 125 * sequence offsets to the IndexStore. 126 * 127 * @param indexStore the IndexStore delegate 128 */ 129 public IndexedSequenceDB(IndexStore indexStore) { 130 this(IDMaker.byName, indexStore); 131 } 132 133 /** 134 * Retrieve the IndexStore. 135 * 136 * @return the IndexStore delegate 137 */ 138 public IndexStore getIndexStore() { 139 return indexStore; 140 } 141 142 /** 143 * Add sequences from a file to the sequence database. This method 144 * works on an "all or nothing" principle. If it can successfully 145 * interpret the entire file, all the sequences will be read 146 * in. However, if it encounters any problems, it will abandon the 147 * whole file; an IOException will be thrown. Multiple files may 148 * be indexed into a single database. A BioException will be 149 * thrown if it has problems understanding the sequences. 150 * 151 * @param seqFile the file containing the sequence or set of sequences 152 * @throws BioException if for any reason the sequences can't be read 153 * correctly 154 * @throws ChangeVetoException if there is a listener that vetoes adding 155 * the files 156 */ 157 158 public void addFile(File seqFile) 159 throws IllegalIDException, BioException, ChangeVetoException 160 { 161 boolean completed = false; // initially assume that we will fail 162 try { 163 seqFile = seqFile.getAbsoluteFile(); 164 CountedBufferedReader bReader = new CountedBufferedReader(new FileReader(seqFile)); 165 SequenceFormat format = indexStore.getFormat(); 166 SymbolTokenization symParser = indexStore.getSymbolParser(); 167 SequenceBuilderFactory sbFact = indexStore.getSBFactory(); 168 long pos = bReader.getFilePointer(); 169 boolean hasNextSequence = true; 170 while(hasNextSequence) { 171 SequenceBuilder sb = new ElideSymbolsSequenceBuilder(sbFact.makeSequenceBuilder()); 172 hasNextSequence = format.readSequence(bReader, symParser, sb); 173 Sequence seq = sb.makeSequence(); 174 String id = idMaker.calcID(seq); 175 long oldPos = pos; 176 pos = bReader.getFilePointer(); 177 indexStore.store(new SimpleIndex(seqFile, oldPos, (int) (pos - oldPos), id)); 178 } 179 180 if(!hasListeners()) { 181 indexStore.commit(); 182 } else { 183 ChangeEvent ce = new ChangeEvent( 184 this, 185 SequenceDB.SEQUENCES 186 ); 187 ChangeSupport changeSupport = getChangeSupport(SequenceDB.SEQUENCES); 188 synchronized(changeSupport) { 189 changeSupport.firePreChangeEvent(ce); 190 indexStore.commit(); 191 changeSupport.firePostChangeEvent(ce); 192 } 193 } 194 completed = true; // we completed succesfuly 195 } catch (IOException ioe) { 196 throw new BioException("Failed to read sequence file",ioe); 197 } finally { 198 if(!completed) { // if there was a failure, discard changes 199 indexStore.rollback(); 200 } 201 } 202 } 203 204 /** 205 * SequenceBuilder implementation that explicitly discards the Symbols 206 * 207 * @author Thomas Down 208 * @author Matthew Pocock 209 * @author Mark Schreiber 210 */ 211 private static class ElideSymbolsSequenceBuilder 212 extends SequenceBuilderFilter 213 implements ParseErrorSource { 214 215 private Vector listeners = new Vector(); 216 217 public synchronized void removeParseErrorListener(ParseErrorListener p){ 218 if(listeners.contains(p)) 219 listeners.remove(p); 220 } 221 222 public synchronized void addParseErrorListener(ParseErrorListener p){ 223 if(! listeners.contains(p)){ 224 listeners.add(p); 225 } 226 } 227 228 public ElideSymbolsSequenceBuilder(SequenceBuilder delegate) { 229 super(delegate); 230 } 231 232 /** 233 * Just ignore the symbols 234 */ 235 public void addSymbols(Alphabet alpha, Symbol[] syms, int start, int length) { 236 } 237 } 238 239 /** 240 * Get the name of this sequence database. The name is retrieved 241 * from the IndexStore delegate. 242 * 243 * @return the name of the sequence database, which may be null. 244 */ 245 public String getName() { 246 return indexStore.getName(); 247 } 248 249 public Sequence getSequence(String id) 250 throws IllegalIDException, BioException 251 { 252 try 253 { 254 Index indx = indexStore.fetch(id); 255 256 RandomAccessReader rar = 257 new RandomAccessReader(new RandomAccessFile(indx.getFile(), "r")); 258 259 long toSkip = indx.getStart(); 260 if (toSkip > rar.length()) 261 throw new BioException("Reached end of file"); 262 rar.seek(toSkip); 263 264 SequenceBuilder sb = 265 indexStore.getSBFactory().makeSequenceBuilder(); 266 267 indexStore.getFormat().readSequence(new BufferedReader(rar), 268 indexStore.getSymbolParser(), 269 sb); 270 Sequence seq = sb.makeSequence(); 271 272 rar.close(); 273 return seq; 274 } 275 catch (IOException ioe) 276 { 277 throw new BioException("Couldn't grab region of file",ioe); 278 } 279 } 280 281 public SequenceIterator sequenceIterator() { 282 return new SequenceIterator() { 283 private Iterator idI = indexStore.getIDs().iterator(); 284 285 public boolean hasNext() { 286 return idI.hasNext(); 287 } 288 289 public Sequence nextSequence() throws BioException { 290 return getSequence((String) idI.next()); 291 } 292 }; 293 } 294 295 public Set ids() { 296 return indexStore.getIDs(); 297 } 298 299 300}