001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.program.tagvalue; 023 024import java.io.File; 025import java.io.FileNotFoundException; 026import java.io.FileReader; 027import java.util.Iterator; 028import java.util.Map; 029import java.util.Set; 030import java.util.Stack; 031 032import org.biojava.bio.program.indexdb.IndexStore; 033import org.biojava.utils.ParserException; 034import org.biojava.utils.SmallMap; 035import org.biojava.utils.SmallSet; 036import org.biojava.utils.io.CountedBufferedReader; 037import org.biojava.utils.io.RAF; 038 039/** 040 * <p> 041 * Listens to tag-value events and passes on indexing events to an IndexStore. 042 * This is an update to Indexer that understands that indexed properties may 043 * not be at the top level. 044 * </p> 045 * 046 * <p> 047 * This class is provided to allow the indexing of arbitrary record-based text 048 * files. Indexer objects are built for a single file and the indexes are 049 * written to a single index store. To keep all of the reader offsets in sync 050 * with one another, you will almost certainly wish to use the getReader() 051 * method to retrieve a CountedBufferedReader instance if you want to read the 052 * byte-offset between calls to Parser.read(). Below is an example of how to 053 * index a file. 054 * </p> 055 * 056 * <pre> 057 * File fileToIndex; // get this from somewhere 058 * 059 * // don't forget to register all the apropreate keys to the factory first. 060 * BioIndexStore indexStore = bioIndxStrFact.createBioStore(); 061 * 062 * Indexer indexer = new Indexer(fileToIndex, indexStore); 063 * indexer.setPrimaryKeyName("foo", new String[] { "foo" }); 064 * indexer.addSecondaryKey("bar", new String[] { "x", "y", "bar"}); 065 * indexer.addSecondaryKey("baz", new String[] { "z" }); 066 * 067 * TagValueParser tvParser; // make this appropriate for your format 068 * TagValueListener listener; // make this appropriate for your format 069 * // and forward all events to indexer 070 * 071 * Parser parser = new Parser(); 072 * while( 073 * parser.read(indexer.getReader(), tvParser, listener) 074 * ) { 075 * System.out.print("."); 076 * } 077 * </pre> 078 * 079 * @since 1.2 080 * @author Matthew Pocock 081 */ 082public class Indexer2 083implements TagValueListener { 084 private final String primaryKeyName; 085 private final RAF file; 086 private final CountedBufferedReader reader; 087 private final IndexStore indexStore; 088 private final Map keys; 089 private final Map keyValues; 090 private Object tag; 091 private long offset; 092 private int depth; 093 private Stack stack; 094 095 /** 096 * Build a new Indexer. 097 * 098 * @param file the file to be processed 099 * @param indexStore the IndexStore to write to 100 */ 101 public Indexer2(File file, IndexStore indexStore, Index2Model model) 102 throws FileNotFoundException { 103 this.file = new RAF(file, "r"); 104 this.reader = new CountedBufferedReader(new FileReader(file)); 105 this.indexStore = indexStore; 106 this.keyValues = new SmallMap(); 107 this.depth = 0; 108 this.stack = new Stack(); 109 110 this.keys = new SmallMap(); 111 for(Iterator i = model.getKeys().iterator(); i.hasNext(); ) { 112 String key = (String) i.next(); 113 Object val = model.getKeyPath(key); 114 115 keys.put(val, key); 116 } 117 this.primaryKeyName = model.getPrimaryKeyName(); 118 } 119 120 /** 121 * Retrieve the reader that can be safely used to index this file. 122 * 123 * @return the CountedBufferedReader that should be processed 124 */ 125 public CountedBufferedReader getReader() { 126 return reader; 127 } 128 129 public void startRecord() { 130 if(depth == 0) { 131 offset = reader.getFilePointer(); 132 133 Frame frame = new Frame(); 134 135 for(Iterator ki = keys.keySet().iterator(); ki.hasNext(); ) { 136 Object[] keyPath = (Object[]) ki.next(); 137 if(keyPath.length == 1) { 138 frame.addKey(keyPath); 139 } else { 140 frame.paths.add(keyPath); 141 } 142 } 143 144 stack.push(frame); 145 } else { 146 Frame top = (Frame) stack.peek(); 147 Frame frame = new Frame(); 148 149 //System.out.println("Tag: " + tag); 150 //System.out.println("Deth: " + depth); 151 //System.out.println("Top: " + top); 152 153 for(Iterator ki = top.paths.iterator(); ki.hasNext(); ) { 154 Object[] keyPath = (Object[]) ki.next(); 155 if(keyPath[depth-1].equals(tag)) { 156 if((keyPath.length-1) == depth) { 157 frame.addKey(keyPath); 158 } else { 159 frame.paths.add(keyPath); 160 } 161 } 162 } 163 //System.out.println("Pushing new stack frame: " + top + " <- " + frame); 164 stack.push(frame); 165 } 166 167 depth++; 168 } 169 170 public void startTag(Object tag) { 171 this.tag = tag; 172 //if(depth >= 2) System.out.println("tag: " + tag); 173 } 174 175 public void value(TagValueContext ctxt, Object value) { 176 Frame frame = (Frame) stack.peek(); 177 Object[] keyPath = (Object []) frame.getKeyPath(tag); 178 if(keyPath != null) { 179 //if(depth >= 2) System.out.println("Interested in: " + tag + " -> " + value); 180 KeyState ks = (KeyState) keyValues.get(keyPath); 181 if(ks == null) { 182 //if(depth >= 2) System.out.println("Allocating stoorage"); 183 keyValues.put(keyPath, ks = new KeyState(keys.get(keyPath).toString())); 184 } 185 ks.values.add(value); 186 //if(depth >= 2) System.out.println(keyValues); 187 } 188 } 189 190 public void endTag() {} 191 192 public void endRecord() 193 throws ParserException 194 { 195 depth--; 196 if(depth == 0) { 197 int length = (int) (reader.getFilePointer() - offset); 198 199 //System.out.println("keyValues: " + keyValues); 200 String primaryKeyValue = null; 201 Map secKeys = new SmallMap(); 202 for(Iterator i = keyValues.keySet().iterator(); i.hasNext(); ) { 203 Object key = i.next(); 204 KeyState ks = (KeyState) keyValues.get(key); 205 if(ks.keyName.equals(primaryKeyName)) { 206 if(ks.values.size() != 1) { 207 throw new ParserException( 208 "There must be exactly one value for the primary key: " + 209 primaryKeyName + " - " + ks.values 210 ); 211 } 212 primaryKeyValue = ks.values.iterator().next().toString(); 213 } else { 214 secKeys.put(ks.keyName, ks.values); 215 } 216 } 217 218 if(primaryKeyValue == null) { 219 throw new NullPointerException("No primary key"); 220 } 221 222 //System.out.println("Primary: " + primaryKeyValue); 223 //System.out.println("Secondaries: " + secKeys); 224 225 indexStore.writeRecord( 226 file, 227 offset, 228 length, 229 primaryKeyValue, 230 secKeys 231 ); 232 233 stack.clear(); 234 for(Iterator i = keyValues.values().iterator(); i.hasNext(); ) { 235 KeyState ks = (KeyState) i.next(); 236 ks.values.clear(); 237 } 238 } else { 239 stack.pop(); 240 } 241 } 242 243 private static class Frame { 244 public final Map keys = new SmallMap(); 245 public final Set paths = new SmallSet(); 246 247 public void addKey(Object[] keyPath) { 248 keys.put(keyPath[keyPath.length - 1], keyPath); 249 } 250 251 252 public Object[] getKeyPath(Object tag) { 253 return (Object []) keys.get(tag); 254 } 255 256 public String toString() { 257 return this.getClass() + ": (" + keys + "\t" + paths + " )"; 258 } 259 } 260 261 private static class KeyState { 262 public final String keyName; 263 public final Set values = new SmallSet(); 264 265 public KeyState(String keyName) { 266 this.keyName = keyName; 267 } 268 269 public String toString() { 270 return this.getClass() + ": (" + keyName + " " + values + ")"; 271 } 272 } 273} 274