/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */

package org.biojava.bio.program.tagvalue;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.biojava.bio.program.indexdb.IndexStore;
import org.biojava.utils.ParserException;
import org.biojava.utils.SmallMap;
import org.biojava.utils.io.CountedBufferedReader;
import org.biojava.utils.io.RAF;

/**
 * <p>
 * Listens to tag-value events and passes on indexing events to an IndexStore.
 * </p>
 *
 * <p>
 * This class is provided to allow the indexing of arbitrary record-based text
 * files. Indexer objects are built for a single file and the indexes are
 * written to a single index store. To keep all of the reader offsets in sync
 * with one another, you will almost certainly wish to use the getReader()
 * method to retrieve a CountedBufferedReader instance if you want to read the
 * byte-offset between calls to Parser.read(). Below is an example of how to
 * index a file.
 * </p>
 *
 * <p><em>Note:</em> It is very important to configure the BioStoreFactory
 * instance with all the right keys beforehand.</p>
 *
 * <pre>
 * File fileToIndex; // get this from somewhere
 * BioStore store = bsf.createBioStore();
 * Indexer indexer = new Indexer(fileToIndex, store);
 * indexer.setPrimaryKeyName("foo");
 * indexer.addSecondaryKey("bar");
 * indexer.addSecondaryKey("baz");
 *
 * TagValueParser tvParser; // make this appropriate for your format
 * TagValueListener listener; // make this appropriate for your format
 *                            // and forward all events to indexer
 *
 * Parser parser = new Parser();
 * while(
 *   parser.read(indexer.getReader(), tvParser, listener)
 * ) {
 *   System.out.print(".");
 * }
 * </pre>
 *
 * @since 1.2
 * @author Matthew Pocock
 */
public class Indexer
implements TagValueListener {
  private final RAF file;
  private final CountedBufferedReader reader;
  private final IndexStore indexStore;
  // secondary key name -> List of the values seen for that key in the
  // record currently being processed
  private final Map secondaryKeys;
  private String primaryKeyName;
  private String primaryKey;
  private Object tag;
  private long offset;
  private int depth;

  /**
   * Build a new Indexer.
   *
   * @param file  the file to be processed
   * @param indexStore  the IndexStore to write to
   * @throws FileNotFoundException if the file can not be opened for reading
   */
  public Indexer(File file, IndexStore indexStore)
  throws FileNotFoundException {
    RAF raf = new RAF(file, "r");
    CountedBufferedReader countedReader;
    try {
      countedReader = new CountedBufferedReader(new FileReader(file));
    } catch (FileNotFoundException fnfe) {
      // The random-access handle is already open at this point; release it
      // so that a failed construction does not leak a file descriptor.
      try {
        raf.close();
      } catch (IOException ignored) {
        // best effort only - the original failure is the one worth reporting
      }
      throw fnfe;
    }

    // NOTE(review): record offsets are taken from the reader's position
    // counter and handed to the store together with the RAF; presumably this
    // only lines up with byte offsets for single-byte encodings - confirm.
    this.file = raf;
    this.reader = countedReader;
    this.indexStore = indexStore;
    this.secondaryKeys = new SmallMap();
    this.depth = 0;
  }

  /**
   * Retrieve the reader that can be safely used to index this file.
   *
   * @return the CountedBufferedReader that should be processed
   */
  public CountedBufferedReader getReader() {
    return reader;
  }

  /**
   * <p>
   * Set the tag to use as a primary key in the index.
   * </p>
   *
   * <p>
   * Whenever a value for the primary key tag is seen, this is passed to the
   * indexer as the primary key for indexing.
   * </p>
   *
   * <p>
   * Primary keys must be unique between entries, and each entry must provide
   * exactly one primary key value.
   * </p>
   *
   * @param primaryKeyName the tag to use as primary key
   */
  public void setPrimaryKeyName(String primaryKeyName) {
    this.primaryKeyName = primaryKeyName;
  }

  /**
   * Retrieve the tag currently used as primary key.
   *
   * @return a String representing the primary key name
   */
  public String getPrimaryKeyName() {
    return primaryKeyName;
  }

  /**
   * <p>
   * Add a secondary key.
   * </p>
   *
   * <p>
   * Secondary keys are potentially non-unique properties of the entries being
   * indexed. Multiple records can use the same secondary key values, and a
   * single record can have multiple values for a secondary key.
   * </p>
   *
   * @param secKeyName  the name of the secondary key to add
   */
  public void addSecondaryKey(String secKeyName) {
    secondaryKeys.put(secKeyName, new ArrayList());
  }

  /**
   * Remove a secondary key.
   *
   * @param secKeyName  the name of the secondary key to remove
   */
  public void removeSecondaryKey(String secKeyName) {
    secondaryKeys.remove(secKeyName);
  }

  /**
   * Note the start of a record.
   *
   * <p>
   * When this is the outermost record (nesting depth zero), the current
   * position of the reader is remembered as the start of the record, the
   * primary key is forgotten and the value list of every secondary key is
   * emptied. The nesting depth is then increased by one.
   * </p>
   */
  public void startRecord() {
    if(depth == 0) {
      offset = reader.getFilePointer();
      primaryKey = null;
      for(Iterator vi = secondaryKeys.values().iterator(); vi.hasNext(); ) {
        ((List) vi.next()).clear();
      }
    }

    depth++;
  }

  /**
   * Remember the tag that subsequent values belong to.
   *
   * @param tag  the tag that is starting
   */
  public void startTag(Object tag) {
    this.tag = tag;
  }

  /**
   * Record a value against the most recently started tag.
   *
   * <p>
   * If the current tag equals the primary key name, the string form of the
   * value becomes the primary key of the record. If the current tag has been
   * added as a secondary key, the string form of the value is appended to
   * the values collected for that key.
   * </p>
   *
   * @param ctxt  the parsing context (not used by this listener)
   * @param value  the value seen for the current tag
   */
  public void value(TagValueContext ctxt, Object value) {
    if(tag.equals(primaryKeyName)) {
      primaryKey = value.toString();
    }

    List values = (List) secondaryKeys.get(tag);
    if(values != null) {
      values.add(value.toString());
    }
  }

  /**
   * Does nothing; the current tag stays in place until the next startTag().
   */
  public void endTag() {}

  /**
   * Note the end of a record.
   *
   * <p>
   * The nesting depth is decreased by one. When the outermost record ends,
   * the span from the remembered start offset to the current position of the
   * reader is written to the IndexStore together with the primary key and
   * the collected secondary key values.
   * </p>
   *
   * @throws NullPointerException if the outermost record ended without a
   *         value for the primary key having been seen
   * @throws ParserException if the record is too long for its length to be
   *         represented as an int
   */
  public void endRecord()
  throws ParserException
  {
    depth--;
    if(depth != 0) {
      // still inside a nested record - nothing to write yet
      return;
    }

    if(primaryKey == null) {
      throw new NullPointerException("No primary key");
    }

    // Work in long and check the range explicitly: a bare (int) cast would
    // silently wrap for very large records and store a corrupt length.
    long recordLength = reader.getFilePointer() - offset;
    if(recordLength > Integer.MAX_VALUE) {
      throw new ParserException(
        "Record '" + primaryKey + "' starting at offset " + offset +
        " has length " + recordLength +
        " which is too large to index"
      );
    }

    indexStore.writeRecord(
      file,
      offset,
      (int) recordLength,
      primaryKey,
      secondaryKeys
    );
  }
}