001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022
023package org.biojava.stats.svm;
024
025import java.io.Serializable;
026import java.util.Arrays;
027import java.util.Iterator;
028import java.util.List;
029
030/**
031 * An implementation of a sparse vector.
032 * <p>
033 * Memory is only allocated for dimensions that have non-zero values.
034 *
035 * @author Thomas Down
036 * @author Matthew Pocock
037 */
038public class SparseVector implements Serializable {
039    int size;
040    int[] keys;
041    double[] values;
042
043    public SparseVector() {
044      this(1);
045    }
046
047    public SparseVector(int capacity) {
048      keys = new int[capacity];
049      values = new double[capacity];
050      Arrays.fill(keys, 0, capacity, Integer.MAX_VALUE);
051      size = 0;
052    }
053
054    /**
055     * The number of used dimensions.
056     * <p>
057     * This is the total number of non-zero dimensions. It is not equal to the
058     * number of the highest indexed dimension.
059     *
060     * @return the number of non-zero dimensions
061     */
062    public int size() {
063      return size;
064    }
065    
066    /**
067     * Set the value at a particular dimension.
068     * <p>
069     * This silently handles the case when previously there was no value at dim.
070     * It does not contract the sparse array if you set a value to zero.
071     *
072     * @param dim  the dimension to alter
073     * @param value the new value
074     */
075    public void put(int dim, double value) {
076      // find index of key nearest dim
077      int indx = Arrays.binarySearch(keys, dim);
078      
079      if(indx >= 0) { // found entry for dim
080        values[indx] = value;
081      } else { // need to create entry for dim
082        indx = -(indx + 1);
083
084        if ((size + 1) >= keys.length) { // growing arrays
085          int[] nKeys = new int[keys.length * 2];
086          Arrays.fill(nKeys, size, nKeys.length, Integer.MAX_VALUE);
087          System.arraycopy(keys, 0, nKeys, 0, indx);
088          System.arraycopy(keys, indx, nKeys, indx+1, size-indx);
089          keys = nKeys;
090          
091          double[] nValues = new double[values.length * 2];
092          Arrays.fill(nValues, size, nValues.length, Double.NaN);
093          System.arraycopy(values, 0, nValues, 0, indx);
094          System.arraycopy(values, indx, nValues, indx+1, size-indx);
095          values = nValues;
096        } else {
097          try {
098            System.arraycopy(keys, indx, keys, indx+1, size-indx);
099            System.arraycopy(values, indx, values, indx+1, size-indx);
100          } catch (ArrayIndexOutOfBoundsException ae) {
101            // paranoya check - they were out to get me once
102            //                - they may try it again
103            System.out.println("dim         = " + dim);
104            System.out.println("value       = " + value);
105            System.out.println("keys.length = " + keys.length);
106            System.out.println("size        = " + size);
107            System.out.println("indx        = " + indx);
108            System.out.println("indx+1      = " + (indx+1));
109            System.out.println("size-indx   = " + (size-indx));
110            System.out.println("size+1      = " + (size+1));
111            throw ae;
112          }
113        }
114
115        keys[indx] = dim;
116        values[indx] = value;
117        ++size;
118      }
119    }
120
121    /**
122     * Retrieve the value at dimension dim.
123     *
124     * @param dim the dimension to retrieve a value for
125     * @return the value at that dimension
126     */
127    public double get(int dim) {
128      int pos = Arrays.binarySearch(keys, dim);
129      if (pos >= 0) {
130        return values[pos];
131      }
132      return 0.0;
133    }
134
135    /**
136     * Retrieve the dimension at a specific index.
137     * <p>
138     * E.g., if the sparse vector had dimensions 5, 11, 12, 155 set to non-zero
139     * values then <code>sv.getDimAtIndex(2)</code> would return 12.
140     *
141     * @param indx  the index
142     * @return the dimension stoored at that index
143     * @throws ArrayIndexOutOfBoundsException if index >= size.
144     */
145    public int getDimAtIndex(int indx) {
146      if(indx >= size) {
147        throw new ArrayIndexOutOfBoundsException(
148          "Attempted to read item " + indx + " of an array with " + size + "elements"
149        );
150      }
151      return keys[indx];
152    }
153    
154    /**
155     * Retrieve the value at a specific index.
156     * <p>
157     * E.g., if the sparse vector contained the data 5->0.1, 11->100, 12->8.5, 155->-10
158     * then <code>sv.geValueAtIndex(2)</code> would return 8.5.
159     *
160     * @param indx  the index
161     * @return the value stoored at that index
162     * @throws ArrayIndexOutOfBoundsException if index >= size.
163     */
164    public double getValueAtIndex(int indx) {
165      if(indx >= size) {
166        throw new ArrayIndexOutOfBoundsException(
167          "Attempted to read item " + indx + " of an array with " + size + "elements"
168        );
169      }
170      return values[indx];
171    }
172    
173    public int maxIndex() {
174      return keys[size - 1];
175    }
176
177    public static SparseVector normalLengthVector(SparseVector v, double length) {
178      SparseVector n = new SparseVector(v.size);
179
180      double oldLength = 0;
181      for (int i = 0; i < v.size; ++i) {
182        oldLength += v.values[i] * v.values[i];
183      }
184      oldLength = Math.sqrt(oldLength);
185        
186        for (int i = 0; i < v.size; ++i) {
187        n.put(v.keys[i], v.values[i] * length / oldLength);
188      }
189
190      return n;
191    }
192
193    public static final SVMKernel kernel = new SparseVectorKernel();
194    
195
196    
197    private static class SparseVectorKernel implements SVMKernel, Serializable {
198      public double evaluate(Object o1, Object o2) {
199        SparseVector a = (SparseVector) o1;
200        SparseVector b = (SparseVector) o2;
201        
202        int ai=0, bi=0;
203        double total = 0.0;
204            
205            while (ai < a.size && bi < b.size) {
206          if (a.keys[ai] > b.keys[bi]) {
207            ++bi;
208          } else if (a.keys[ai] < b.keys[bi]) {
209            ++ai;
210          } else {
211            total += a.values[ai++] * b.values[bi++];
212          }
213        }
214        return total;
215      }
216      
217      public String toString() {
218        return "SparseVector kernel K(x, y) = sum_i ( x_i * y_i ).";
219      }
220    };
221    
222    /**
223     * A version of the standard dot-product kernel that scales each column
224     * independently.
225     *
226     * @author Matthew Pocock
227     */
228    public static class NormalizingKernel implements SVMKernel, Serializable {
229      /**
230       * The sparse vector that performes the normalization.
231       */
232      private SparseVector s;
233      
234      /**
235       * Retrive the current normalizing vector.
236       *
237       * @return the normalizing vector
238       */
239      public SparseVector getNormalizingVector() {
240        return s;
241      }
242      
243      /**
244       * Set the normalizing vector.
245       *
246       * @param nv the new normalizing vector
247       */
248      public void setNormalizingVector(SparseVector nv) {
249        s = nv;
250      }
251      
252      /**
253       * Evaluate the kernel function between two SparseVectors.
254       * <p>
255       * This function is equivalent to:
256       * <br>
257       * <code>k(a, b) = sum_i ( a_i * b_i * nv_i )</code>
258       * <br>
259       * where nv_i is the value of the normalizing vector at index i. This can
260       * be thought of as scaling each vector at index i by
261       * <code>sqrt(nv_i)</code>.
262       */
263      public double evaluate(Object o1, Object o2) {
264        SparseVector a = (SparseVector) o1;
265        SparseVector b = (SparseVector) o2;
266        
267        int ai=0, bi=0, si=0;
268        double total = 0.0;
269            
270            while (ai < a.size && bi < b.size && si < s.size) {
271          if (       (a.keys[ai] < b.keys[bi]) || (a.keys[ai] < s.keys[si])) {
272            ++ai;
273          } else if ((b.keys[bi] < a.keys[ai]) || (b.keys[bi] < s.keys[si])) {
274            ++bi;
275          } else if ((s.keys[si] < a.keys[ai]) || (s.keys[si] < b.keys[bi])) {
276            ++si;
277          } else if( (a.keys[ai] == s.keys[si]) && (b.keys[bi] == s.keys[si]) ) {
278            total += a.values[ai++] * b.values[bi++] * a.keys[ai-1]; //s.values[si++];
279          } else {
280            System.out.println("ai = " + ai);
281            System.out.println("bi = " + bi);
282            System.out.println("si = " + si);
283            System.exit(1);
284          }
285        }
286        return total;
287      }
288      
289      /**
290       * Generate a normalizing kernel with the normalizing vector s.
291       *
292       * @param s the SparseVector to normalize by
293       */
294      public NormalizingKernel(SparseVector s) {
295        this.s = s;
296      }
297      
298      /**
299       * Generate a normalizing kernel defined by the SparseVectors in vectors.
300       * <p>
301       * It will set up a normalizing vector that has weight that will scale
302       * each element so that the average score is 1.
303       */
304      public NormalizingKernel(List vectors) {
305        this.s = new SparseVector();
306        
307        for(Iterator i = vectors.iterator(); i.hasNext(); ) {
308          SparseVector v = (SparseVector) i.next();
309          for(int j = 0; j < v.size(); j++) {
310            s.put(v.keys[j], s.get(v.keys[j]) + v.values[j]);
311          }
312        }
313        
314        for(int j = 0; j < s.size(); j++) {
315          s.values[j] = (double) vectors.size() / s.values[j];
316          s.values[j] *= s.values[j];
317        }
318      }
319
320      public String toString() {
321        return "SparseVector.NormalizingKernel K(x, y | s) = " +
322               "sum_i ( x_i * y_i * s_i ).";
323      }
324
325    }
326}