001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022 023package org.biojava.stats.svm; 024 025import java.io.Serializable; 026import java.util.Arrays; 027import java.util.Iterator; 028import java.util.List; 029 030/** 031 * An implementation of a sparse vector. 032 * <p> 033 * Memory is only allocated for dimensions that have non-zero values. 034 * 035 * @author Thomas Down 036 * @author Matthew Pocock 037 */ 038public class SparseVector implements Serializable { 039 int size; 040 int[] keys; 041 double[] values; 042 043 public SparseVector() { 044 this(1); 045 } 046 047 public SparseVector(int capacity) { 048 keys = new int[capacity]; 049 values = new double[capacity]; 050 Arrays.fill(keys, 0, capacity, Integer.MAX_VALUE); 051 size = 0; 052 } 053 054 /** 055 * The number of used dimensions. 056 * <p> 057 * This is the total number of non-zero dimensions. It is not equal to the 058 * number of the highest indexed dimension. 059 * 060 * @return the number of non-zero dimensions 061 */ 062 public int size() { 063 return size; 064 } 065 066 /** 067 * Set the value at a particular dimension. 068 * <p> 069 * This silently handles the case when previously there was no value at dim. 070 * It does not contract the sparse array if you set a value to zero. 071 * 072 * @param dim the dimension to alter 073 * @param value the new value 074 */ 075 public void put(int dim, double value) { 076 // find index of key nearest dim 077 int indx = Arrays.binarySearch(keys, dim); 078 079 if(indx >= 0) { // found entry for dim 080 values[indx] = value; 081 } else { // need to create entry for dim 082 indx = -(indx + 1); 083 084 if ((size + 1) >= keys.length) { // growing arrays 085 int[] nKeys = new int[keys.length * 2]; 086 Arrays.fill(nKeys, size, nKeys.length, Integer.MAX_VALUE); 087 System.arraycopy(keys, 0, nKeys, 0, indx); 088 System.arraycopy(keys, indx, nKeys, indx+1, size-indx); 089 keys = nKeys; 090 091 double[] nValues = new double[values.length * 2]; 092 Arrays.fill(nValues, size, nValues.length, Double.NaN); 093 System.arraycopy(values, 0, nValues, 0, indx); 094 System.arraycopy(values, indx, nValues, indx+1, size-indx); 095 values = nValues; 096 } else { 097 try { 098 System.arraycopy(keys, indx, keys, indx+1, size-indx); 099 System.arraycopy(values, indx, values, indx+1, size-indx); 100 } catch (ArrayIndexOutOfBoundsException ae) { 101 // paranoya check - they were out to get me once 102 // - they may try it again 103 System.out.println("dim = " + dim); 104 System.out.println("value = " + value); 105 System.out.println("keys.length = " + keys.length); 106 System.out.println("size = " + size); 107 System.out.println("indx = " + indx); 108 System.out.println("indx+1 = " + (indx+1)); 109 System.out.println("size-indx = " + (size-indx)); 110 System.out.println("size+1 = " + (size+1)); 111 throw ae; 112 } 113 } 114 115 keys[indx] = dim; 116 values[indx] = value; 117 ++size; 118 } 119 } 120 121 /** 122 * Retrieve the value at dimension dim. 123 * 124 * @param dim the dimension to retrieve a value for 125 * @return the value at that dimension 126 */ 127 public double get(int dim) { 128 int pos = Arrays.binarySearch(keys, dim); 129 if (pos >= 0) { 130 return values[pos]; 131 } 132 return 0.0; 133 } 134 135 /** 136 * Retrieve the dimension at a specific index. 137 * <p> 138 * E.g., if the sparse vector had dimensions 5, 11, 12, 155 set to non-zero 139 * values then <code>sv.getDimAtIndex(2)</code> would return 12. 140 * 141 * @param indx the index 142 * @return the dimension stoored at that index 143 * @throws ArrayIndexOutOfBoundsException if index >= size. 144 */ 145 public int getDimAtIndex(int indx) { 146 if(indx >= size) { 147 throw new ArrayIndexOutOfBoundsException( 148 "Attempted to read item " + indx + " of an array with " + size + "elements" 149 ); 150 } 151 return keys[indx]; 152 } 153 154 /** 155 * Retrieve the value at a specific index. 156 * <p> 157 * E.g., if the sparse vector contained the data 5->0.1, 11->100, 12->8.5, 155->-10 158 * then <code>sv.geValueAtIndex(2)</code> would return 8.5. 159 * 160 * @param indx the index 161 * @return the value stoored at that index 162 * @throws ArrayIndexOutOfBoundsException if index >= size. 163 */ 164 public double getValueAtIndex(int indx) { 165 if(indx >= size) { 166 throw new ArrayIndexOutOfBoundsException( 167 "Attempted to read item " + indx + " of an array with " + size + "elements" 168 ); 169 } 170 return values[indx]; 171 } 172 173 public int maxIndex() { 174 return keys[size - 1]; 175 } 176 177 public static SparseVector normalLengthVector(SparseVector v, double length) { 178 SparseVector n = new SparseVector(v.size); 179 180 double oldLength = 0; 181 for (int i = 0; i < v.size; ++i) { 182 oldLength += v.values[i] * v.values[i]; 183 } 184 oldLength = Math.sqrt(oldLength); 185 186 for (int i = 0; i < v.size; ++i) { 187 n.put(v.keys[i], v.values[i] * length / oldLength); 188 } 189 190 return n; 191 } 192 193 public static final SVMKernel kernel = new SparseVectorKernel(); 194 195 196 197 private static class SparseVectorKernel implements SVMKernel, Serializable { 198 public double evaluate(Object o1, Object o2) { 199 SparseVector a = (SparseVector) o1; 200 SparseVector b = (SparseVector) o2; 201 202 int ai=0, bi=0; 203 double total = 0.0; 204 205 while (ai < a.size && bi < b.size) { 206 if (a.keys[ai] > b.keys[bi]) { 207 ++bi; 208 } else if (a.keys[ai] < b.keys[bi]) { 209 ++ai; 210 } else { 211 total += a.values[ai++] * b.values[bi++]; 212 } 213 } 214 return total; 215 } 216 217 public String toString() { 218 return "SparseVector kernel K(x, y) = sum_i ( x_i * y_i )."; 219 } 220 }; 221 222 /** 223 * A version of the standard dot-product kernel that scales each column 224 * independently. 225 * 226 * @author Matthew Pocock 227 */ 228 public static class NormalizingKernel implements SVMKernel, Serializable { 229 /** 230 * The sparse vector that performes the normalization. 231 */ 232 private SparseVector s; 233 234 /** 235 * Retrive the current normalizing vector. 236 * 237 * @return the normalizing vector 238 */ 239 public SparseVector getNormalizingVector() { 240 return s; 241 } 242 243 /** 244 * Set the normalizing vector. 245 * 246 * @param nv the new normalizing vector 247 */ 248 public void setNormalizingVector(SparseVector nv) { 249 s = nv; 250 } 251 252 /** 253 * Evaluate the kernel function between two SparseVectors. 254 * <p> 255 * This function is equivalent to: 256 * <br> 257 * <code>k(a, b) = sum_i ( a_i * b_i * nv_i )</code> 258 * <br> 259 * where nv_i is the value of the normalizing vector at index i. This can 260 * be thought of as scaling each vector at index i by 261 * <code>sqrt(nv_i)</code>. 262 */ 263 public double evaluate(Object o1, Object o2) { 264 SparseVector a = (SparseVector) o1; 265 SparseVector b = (SparseVector) o2; 266 267 int ai=0, bi=0, si=0; 268 double total = 0.0; 269 270 while (ai < a.size && bi < b.size && si < s.size) { 271 if ( (a.keys[ai] < b.keys[bi]) || (a.keys[ai] < s.keys[si])) { 272 ++ai; 273 } else if ((b.keys[bi] < a.keys[ai]) || (b.keys[bi] < s.keys[si])) { 274 ++bi; 275 } else if ((s.keys[si] < a.keys[ai]) || (s.keys[si] < b.keys[bi])) { 276 ++si; 277 } else if( (a.keys[ai] == s.keys[si]) && (b.keys[bi] == s.keys[si]) ) { 278 total += a.values[ai++] * b.values[bi++] * a.keys[ai-1]; //s.values[si++]; 279 } else { 280 System.out.println("ai = " + ai); 281 System.out.println("bi = " + bi); 282 System.out.println("si = " + si); 283 System.exit(1); 284 } 285 } 286 return total; 287 } 288 289 /** 290 * Generate a normalizing kernel with the normalizing vector s. 291 * 292 * @param s the SparseVector to normalize by 293 */ 294 public NormalizingKernel(SparseVector s) { 295 this.s = s; 296 } 297 298 /** 299 * Generate a normalizing kernel defined by the SparseVectors in vectors. 300 * <p> 301 * It will set up a normalizing vector that has weight that will scale 302 * each element so that the average score is 1. 303 */ 304 public NormalizingKernel(List vectors) { 305 this.s = new SparseVector(); 306 307 for(Iterator i = vectors.iterator(); i.hasNext(); ) { 308 SparseVector v = (SparseVector) i.next(); 309 for(int j = 0; j < v.size(); j++) { 310 s.put(v.keys[j], s.get(v.keys[j]) + v.values[j]); 311 } 312 } 313 314 for(int j = 0; j < s.size(); j++) { 315 s.values[j] = (double) vectors.size() / s.values[j]; 316 s.values[j] *= s.values[j]; 317 } 318 } 319 320 public String toString() { 321 return "SparseVector.NormalizingKernel K(x, y | s) = " + 322 "sum_i ( x_i * y_i * s_i )."; 323 } 324 325 } 326}