0.5 merging

2005-02-16 22:37:24 +00:00
parent 9646ac2911
commit 7ef9ce8cc6
16 changed files with 1139 additions and 1213 deletions
--- a/core/java/src/org/xlattice/crypto/filters/BloomSHA1.java
+++ b/core/java/src/org/xlattice/crypto/filters/BloomSHA1.java
@@ -0,0 +1,224 @@
+/* BloomSHA1.java */
+package org.xlattice.crypto.filters;
+
+/**
+ * A Bloom filter for sets of SHA1 digests.  A Bloom filter uses a set
+ * of k hash functions to determine set membership.  Each hash function
+ * produces a value in the range 0..M-1.  The filter is of size M.  To
+ * add a member to the set, apply each function to the new member and 
+ * set the corresponding bit in the filter.  For M very large relative
+ * to k, this will normally set k bits in the filter.  To check whether
+ * x is a member of the set, apply each of the k hash functions to x
+ * and check whether the corresponding bits are set in the filter.  If
+ * any are not set, x is definitely not a member.  If all are set, x 
+ * may be a member.  The probability of error (the false positive rate)
+ * is f = (1 - e^(-kN/M))^k, where N is the number of set members.
+ *
+ * This class takes advantage of the fact that SHA1 digests are good-
+ * quality pseudo-random numbers.  The k hash functions are the values
+ * of distinct sets of bits taken from the 20-byte SHA1 hash.  The
+ * number of bits in the filter, M, is constrained to be a power of 
+ * 2; M == 2^m.  The number of bits in each hash function may not 
+ * exceed floor(m/k).
+ *
+ * This class is designed to be thread-safe, but this has not been
+ * exhaustively tested.
+ *
+ * @author <A HREF="mailto:jddixon@users.sourceforge.net">Jim Dixon</A>
+ * 
+ * BloomSHA1.java and KeySelector.java are BSD licensed from the xlattice
+ * app - http://xlattice.sourceforge.net/
+ * 
+ * minor tweaks by jrandom, exposing unsynchronized access and 
+ * allowing larger M and K.  changes released into the public domain.
+ */
+
+public class BloomSHA1 {
+    protected final int m;
+    protected final int k;
+    protected int count;
+   
+    protected final int[] filter;
+    protected KeySelector ks;
+    protected final int[] wordOffset;
+    protected final int[] bitOffset;
+    
+    // convenience variables
+    protected final int filterBits;
+    protected final int filterWords;
+    
+    /**
+     * Creates a filter with 2^m bits and k 'hash functions', where
+     * each hash function is a portion of the 160-bit SHA1 hash.
+     *
+     * No range checking is currently performed on either argument (the
+     * original checks are commented out below), so the caller must supply
+     * sensible values.  In particular the filter size is computed as
+     * 1 << m in int arithmetic.
+     *
+     * @param m determines number of bits in filter (2^m); the no-argument
+     *          constructor uses 20
+     * @param k number of hash functions; the shorter constructors use 8
+     */
+    public BloomSHA1( int m, int k) {
+        // XXX need to devise more reasonable set of checks
+        //if ( m < 2 || m > 20) {
+        //    throw new IllegalArgumentException("m out of range");
+        //}
+        //if ( k < 1 || ( k * m > 160 )) {
+        //    throw new IllegalArgumentException( 
+        //        "too many hash functions for filter size");
+        //}
+        this.m = m;
+        this.k = k;
+        count = 0;
+        filterBits = 1 << m;
+        // the bits are packed 32 to an int
+        filterWords = (filterBits + 31)/32;     // round up 
+        filter = new int[filterWords];
+        doClear();
+        // offsets into the filter; these two arrays are handed to the
+        // KeySelector and re-read after every ks.getOffsets() call
+        wordOffset = new int[k];
+        bitOffset  = new int[k];
+        ks = new KeySelector(m, k, bitOffset, wordOffset);
+
+        // DEBUG
+        //System.out.println("Bloom constructor: m = " + m + ", k = " + k
+        //    + "\n    filterBits = " + filterBits
+        //    + ", filterWords = " + filterWords);
+        // END
+    }
+
+    /**
+     * Creates a filter of 2^m bits, with the number of 'hash functions'
+     * k defaulting to 8.  Delegates to the two-argument constructor.
+     *
+     * @param m determines size of filter (2^m bits)
+     */
+    public BloomSHA1 (int m) {
+        this(m, 8);
+    }
+
+    /**
+     * Creates a filter of 2^20 bits with k defaulting to 8.  Delegates to
+     * the two-argument constructor with m = 20 and k = 8.
+     */
+    public BloomSHA1 () {
+        this (20, 8);
+    }
+    /**
+     * Clear the filter, unsynchronized.  Zeroes every word of the bit
+     * array and resets the inserted-key counter, so that size() and
+     * falsePositives() describe the now-empty set rather than the keys
+     * that were present before the clear.  Callers needing mutual
+     * exclusion should use clear() or hold the lock themselves.
+     */
+    protected void doClear() {
+        for (int i = 0; i < filterWords; i++) {
+            filter[i] = 0;
+        }
+        // an emptied filter has no members
+        count = 0;
+    }
+    /** Clears the filter while holding this object's monitor. */
+    public synchronized void clear() {
+        doClear();
+    }
+    /**
+     * Reports how many insertions have been performed.  BloomSHA1 makes
+     * no attempt to detect duplicates: inserting the same key N times
+     * raises the reported value by N.  The counter is read while holding
+     * this object's monitor.
+     *
+     * @return number of set members
+     */
+    public final synchronized int size() {
+        return count;
+    }
+    /**
+     * Returns the size of the filter in bits, as computed by the
+     * constructor (1 << m).  This is not the number of keys inserted;
+     * see size() for that.
+     *
+     * @return number of bits in filter
+     */
+    public final int capacity () {
+        return filterBits;
+    }
+
+    /**
+     * Adds a key to the set represented by the filter, holding this
+     * object's monitor for the duration of the update.
+     *
+     * XXX This is not a counting Bloom filter: no 4-bit counters are
+     * maintained.
+     *
+     * @param b byte array representing a key (SHA1 digest)
+     */
+    public synchronized void insert (byte[]b) {
+        locked_insert(b);
+    }
+
+    /**
+     * Unsynchronized insert: computes the key's offsets into the shared
+     * offset arrays, sets the selected bit in each selected word, and
+     * bumps the insertion counter.  insert() calls this while holding the
+     * monitor.
+     *
+     * @param b byte array representing a key (SHA1 digest)
+     */
+    public final void locked_insert(byte[]b) {
+        ks.getOffsets(b);
+        for (int hash = 0; hash < k; hash++) {
+            final int flag = 1 << bitOffset[hash];
+            filter[wordOffset[hash]] |= flag;
+        }
+        count++;
+    }
+    
+    /**
+     * Tests whether every filter bit selected by the key is set.  As a
+     * side effect the shared bit and word offset arrays are refilled for
+     * this key.  Not synchronized.
+     *
+     * @param b byte array representing a key (SHA1 digest)
+     * @return false as soon as one selected bit is found clear, true if
+     *         all k selected bits are set
+     */
+    protected final boolean isMember(byte[] b) {
+        ks.getOffsets(b);
+        for (int hash = 0; hash < k; hash++) {
+            final int word = filter[wordOffset[hash]];
+            final int flag = 1 << bitOffset[hash];
+            if ((word & flag) == 0) {
+                return false;
+            }
+        }
+        return true;
+    }
+    
+    /**
+     * Unsynchronized membership test; simply delegates to isMember(),
+     * which overwrites the shared offset arrays.
+     * NOTE(review): presumably the caller is expected to already hold this
+     * object's lock, as the name suggests -- confirm against callers.
+     *
+     * @param b byte array representing a key (SHA1 digest)
+     * @return true if all bits selected by b are set in the filter
+     */
+    public final boolean locked_member(byte[]b) { return isMember(b); }
+    
+    /**
+     * Membership test for external callers; the check runs while holding
+     * this object's monitor.
+     *
+     * @param b byte array representing a key (SHA1 digest)
+     * @return true if b is in the filter
+     */
+    public final synchronized boolean member(byte[]b) {
+        return isMember(b);
+    }
+
+    /**
+     * Estimates the false positive rate of this filter when it holds n
+     * members, using f = (1 - e^(-kN/M))^k with M the number of bits in
+     * the filter and k the number of hash functions.  The result lies
+     * between 0 (empty filter) and 1 (saturated filter).
+     *
+     * @param n number of set members
+     * @return approximate false positive rate
+     */
+    public final double falsePositives(int n) {
+        // The exponent must be negative: e^(-kN/M) is the probability that
+        // a given bit is still clear after N insertions.  The product is
+        // formed in double arithmetic so k * n cannot overflow.
+        double exponent = -(((double) k) * n) / filterBits;
+        return java.lang.Math.pow(1.0 - java.lang.Math.exp(exponent), k);
+    }
+
+    /**
+     * Convenience overload that evaluates falsePositives(int) for the
+     * current insertion count.  The count is read without synchronization.
+     *
+     * @return approximate false positive rate for the keys inserted so far
+     */
+    public final double falsePositives() {
+        return falsePositives(count);
+    }
+    // DEBUG METHODS
+    /**
+     * Debug helper: formats a key as dot-separated hexadecimal byte
+     * values.  Every byte, including the first, is rendered as an
+     * unsigned value in base 16 (the same 0xff masking btoh() applies),
+     * so { 0x10, (byte) 0xff, 0 } becomes "10.ff.0".
+     *
+     * @param key bytes to format; may be empty
+     * @return dotted hex string, or the empty string for a zero-length key
+     */
+    public static String keyToString(byte[] key) {
+        StringBuffer sb = new StringBuffer();
+        for (int i = 0; i < key.length; i++) {
+            if (i > 0) {
+                sb.append(".");
+            }
+            // mask to avoid sign extension turning 0xff into "-1"
+            sb.append(Integer.toString(0xff & key[i], 16));
+        }
+        return sb.toString();
+    }
+    /**
+     * Formats a 64-bit integer as '#' followed by its signed base-16
+     * representation.
+     */
+    public static String ltoh (long i) {
+        final String digits = Long.toString(i, 16);
+        return "#" + digits;
+    }
+
+    /**
+     * Formats a 32-bit integer as '#' followed by its signed base-16
+     * representation.
+     */
+    public static String itoh (int i) {
+        final String digits = Integer.toString(i, 16);
+        return "#" + digits;
+    }
+    /**
+     * Formats a single byte, treated as an unsigned value 0..255, in the
+     * same '#'-prefixed base-16 form produced by itoh().
+     */
+    public static String btoh (byte b) {
+        return itoh(0xff & b);
+    }
+}
+
--- a/core/java/src/org/xlattice/crypto/filters/KeySelector.java
+++ b/core/java/src/org/xlattice/crypto/filters/KeySelector.java
@@ -0,0 +1,245 @@
+/* KeySelector.java */
+package org.xlattice.crypto.filters;
+
+/**
+ * Given a key, populates arrays determining word and bit offsets into
+ * a Bloom filter.
+ * 
+ * @author <A HREF="mailto:jddixon@users.sourceforge.net">Jim Dixon</A>
+ *
+ * BloomSHA1.java and KeySelector.java are BSD licensed from the xlattice
+ * app - http://xlattice.sourceforge.net/
+ * 
+ * minor tweaks by jrandom, exposing unsynchronized access and 
+ * allowing larger M and K.  changes released into the public domain.
+ */
+public class KeySelector {
+   
+    private int m;
+    private int k;
+    private byte[] b;
+    private int[] bitOffset;
+    private int[] wordOffset;
+    private BitSelector  bitSel;
+    private WordSelector wordSel;
+    
+    /**
+     * Strategy for filling the bit offset array from the current key.
+     * The method takes no arguments and returns nothing; the one
+     * implementation in this file reads the enclosing KeySelector's key
+     * and writes its bitOffset array.
+     */
+    public interface BitSelector {
+        public void getBitSelectors();
+    }
+    /**
+     * Strategy for filling the word offset array from the current key.
+     * The method takes no arguments and returns nothing; the one
+     * implementation in this file reads the enclosing KeySelector's key
+     * and writes its wordOffset array.
+     */
+    public interface WordSelector {
+        public void getWordSelectors();
+    }
+    /** AND with byte to expose index-many bits */
+    public final static int[] UNMASK = { 
+ // 0  1  2  3   4   5   6    7    8   9     10   11     12    13     14     15
+    0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767};
+    /** AND with byte to zero out index-many bits */
+    public final static int[] MASK   = {
+    ~0,~1,~3,~7,~15,~31,~63,~127,~255,~511,~1023,~2047,~4095,~8191,~16383,~32767};
+
+    public final static int TWO_UP_15 = 32 * 1024;
+
+    /** 
+     * Creates a key selector for a Bloom filter.  When a key is presented
+     * to the getOffsets() method, the k 'hash function' values are 
+     * extracted and used to populate bitOffset and wordOffset arrays which
+     * specify the k flags to be set or examined in the filter.  
+     *
+     * The two arrays are stored by reference, not copied, so the caller
+     * sees the results of each getOffsets() call in the arrays it passed
+     * in.  No argument validation is performed at present (the original
+     * checks are commented out below).
+     *
+     * @param m    size of the filter as a power of 2
+     * @param k    number of 'hash functions'
+     * @param bitOffset array of k bit offsets (offset of flag bit in word)
+     * @param wordOffset array of k word offsets (offset of word flag is in)
+     */
+    public KeySelector (int m, int k, int[] bitOffset, int [] wordOffset) {
+        //if ( (m < 2) || (m > 20)|| (k < 1) 
+        //             || (bitOffset == null) || (wordOffset == null)) {
+        //    throw new IllegalArgumentException();
+        //}
+        this.m = m;
+        this.k = k;
+        this.bitOffset = bitOffset;
+        this.wordOffset = wordOffset;
+        // only the general-purpose selectors exist in this file
+        bitSel  = new GenericBitSelector();
+        wordSel = new GenericWordSelector();
+    }
+    
+    /** 
+     * Extracts the k bit offsets from a key, suitable for general values 
+     * of m and k.
+     *
+     * Each bit offset is a 5-bit value (0..31) built from the next 5 bits
+     * of the key, starting at bit 0 of byte 0 and consuming each byte
+     * from its most significant bit downwards.  The first k * 5 bits of
+     * the key are used in total.
+     */
+    public class GenericBitSelector implements BitSelector {
+        /**
+         * Do the extraction: reads the enclosing selector's current key b
+         * and stores one 5-bit value per hash function in bitOffset[].
+         */
+        public void getBitSelectors() {
+            int curBit = 0; 
+            int curByte;
+            for (int j = 0; j < k; j++) {
+                curByte = curBit / 8;
+                int bitsUnused = ((curByte + 1) * 8) - curBit;    // left in byte
+
+//              // DEBUG
+//              System.out.println (
+//                  "this byte = " + btoh(b[curByte])
+//                  + ", next byte = " + btoh(b[curByte + 1])
+//                  + "; curBit=" + curBit + ", curByte= " + curByte
+//                  + ", bitsUnused=" + bitsUnused);
+//              // END
+                if (bitsUnused > 5) {
+                    // all 5 bits are in this byte with some left over:
+                    // shift the leftovers out and keep the low 5 bits
+                    bitOffset[j] = ((0xff & b[curByte])
+                                        >> (bitsUnused - 5)) & UNMASK[5];
+//                  // DEBUG
+//                  System.out.println(
+//                      "    before shifting: " + btoh(b[curByte])
+//                  + "\n    after shifting:  " 
+//                          + itoh( (0xff & b[curByte]) >> (bitsUnused - 5))
+//                  + "\n    mask:            " + itoh(UNMASK[5]) );
+//                  // END
+                } else if (bitsUnused == 5) {
+                    // exactly the low 5 bits of this byte remain
+                    bitOffset[j] = b[curByte] & UNMASK[5];
+                } else {
+                    // value straddles two bytes: the remaining low bits of
+                    // this byte supply the low-order bits of the result,
+                    // and the top (5 - bitsUnused) bits of the next byte
+                    // supply the bits above them
+                    bitOffset[j] = (b[curByte]          & UNMASK[bitsUnused])
+                              | (((0xff & b[curByte + 1]) >> 3) 
+                                                        &   MASK[bitsUnused]);
+//                  // DEBUG
+//                  System.out.println(
+//                    "    contribution from first byte:  "
+//                    + itoh(b[curByte] & UNMASK[bitsUnused])
+//                + "\n    second byte: " + btoh(b[curByte + 1])
+//                + "\n    shifted:     " + itoh((0xff & b[curByte + 1]) >> 3)
+//                + "\n    mask:        " + itoh(MASK[bitsUnused])
+//                + "\n    contribution from second byte: "
+//                    + itoh((0xff & b[curByte + 1] >> 3) & MASK[bitsUnused]));
+//                  // END
+                }
+//              // DEBUG
+//              System.out.println ("    bitOffset[j] = " + bitOffset[j]);
+//              // END
+                curBit += 5;
+            }
+        } 
+    }
+    /** 
+     * Extracts the k word offsets from a key.  Suitable for general
+     * values of m and k.
+     *
+     * Each word offset is (m - 5) bits wide.  Extraction starts at key
+     * bit k * 5, i.e. immediately after the bits consumed for the bit
+     * offsets, and advances by (m - 5) bits per hash function.
+     */
+    public class GenericWordSelector implements WordSelector {
+        /**
+         * Extract the k offsets into the word offset array.  Reads the
+         * enclosing selector's current key b; a single value may draw on
+         * up to three consecutive key bytes.
+         */
+        public void getWordSelectors() {
+            // 5 of the m bits select the bit within a word; the rest
+            // select the word
+            int stride = m - 5;
+            //assert true: stride<16;
+            int curBit = k * 5; 
+            int curByte;
+            for (int j = 0; j < k; j++) {
+                curByte = curBit / 8;
+                int bitsUnused = ((curByte + 1) * 8) - curBit;    // left in byte
+
+//              // DEBUG
+//              System.out.println (
+//                  "curr 3 bytes: " + btoh(b[curByte]) 
+//                  + (curByte < 19 ?
+//                      " " + btoh(b[curByte + 1]) : "") 
+//                  + (curByte < 18 ?
+//                      " " + btoh(b[curByte + 2]) : "")
+//                  + "; curBit=" + curBit + ", curByte= " + curByte
+//                  + ", bitsUnused=" + bitsUnused);
+//              // END
+
+                if (bitsUnused > stride) {
+                    // the value is entirely within the current byte
+                    wordOffset[j] = ((0xff & b[curByte]) 
+                                        >> (bitsUnused - stride)) 
+                                                & UNMASK[stride];
+                } else if (bitsUnused == stride) {
+                    // the value fills the current byte
+                    wordOffset[j] = b[curByte] & UNMASK[stride];
+                } else {    // bitsUnused < stride
+                    // value occupies more than one byte
+                    // bits from first byte, right-aligned in result
+                    wordOffset[j] = b[curByte] & UNMASK[bitsUnused];
+//                  // DEBUG
+//                  System.out.println("    first byte contributes "
+//                          + itoh(wordOffset[j]));
+//                  // END
+                    // bits from second byte
+                    int bitsToGet = stride - bitsUnused;
+                    if (bitsToGet >= 8) {
+                        // 8 bits from second byte, placed directly above
+                        // the bits taken from the first byte
+                        wordOffset[j] |= (0xff & b[curByte + 1]) << bitsUnused;
+//                      // DEBUG
+//                      System.out.println("    second byte contributes "
+//                          + itoh(
+//                          (0xff & b[curByte + 1]) << bitsUnused
+//                      ));
+//                      // END
+                        
+                        // bits from third byte
+                        bitsToGet -= 8;
+                        // NOTE(review): this takes the top bitsToGet bits
+                        // of the third byte; if bitsToGet could exceed 8
+                        // here the shift count below would go negative --
+                        // presumably such wide strides are not expected,
+                        // confirm against the supported range of m
+                        if (bitsToGet > 0) {
+                            wordOffset[j] |= 
+                                ((0xff & b[curByte + 2]) >> (8 - bitsToGet))
+                                                    << (stride - bitsToGet) ;
+//                          // DEBUG
+//                          System.out.println("    third byte contributes " 
+//                              + itoh(
+//                              (((0xff & b[curByte + 2]) >> (8 - bitsToGet))
+//                                                  << (stride - bitsToGet))
+//                              ));
+//                          // END
+                        }
+                    } else {
+                        // all remaining bits are within second byte; the
+                        // UNMASK also discards any sign-extension bits
+                        // introduced by shifting the signed byte
+                        wordOffset[j] |= ((b[curByte + 1] >> (8 - bitsToGet))
+                                            & UNMASK[bitsToGet])
+                                                << bitsUnused;
+//                      // DEBUG
+//                      System.out.println("    second byte contributes "
+//                          + itoh(
+//                          ((b[curByte + 1] >> (8 - bitsToGet))
+//                              & UNMASK[bitsToGet])
+//                                      << bitsUnused
+//                          ));
+//                      // END
+                    }
+                }
+//              // DEBUG
+//              System.out.println (
+//                  "    wordOffset[" + j + "] = " + wordOffset[j]
+//                  + ", "                     + itoh(wordOffset[j])
+//              );
+//              // END
+                curBit += stride;
+            }
+        } 
+    }
+    /**
+     * Fills the word and bit offset arrays (k entries each) from the
+     * supplied key.  The key is retained by reference until the next call.
+     * 
+     * @param key cryptographic key used in populating the arrays
+     * @throws IllegalArgumentException if key is null or shorter than
+     *         20 bytes
+     */
+    public void getOffsets (byte[] key) {
+        if (key == null)
+            throw new IllegalArgumentException("null key");
+        if (key.length < 20)
+            throw new IllegalArgumentException("key must be at least 20 bytes long");
+
+        this.b = key;
+        // bit offsets first, then word offsets
+        this.bitSel.getBitSelectors();
+        this.wordSel.getWordSelectors();
+    }
+
+    // DEBUG METHODS ////////////////////////////////////////////////
+    /** Debug helper: delegates to BloomSHA1.itoh(int). */
+    String itoh(int i) {
+        return BloomSHA1.itoh(i);
+    }
+    /** Debug helper: delegates to BloomSHA1.btoh(byte). */
+    String btoh(byte b) {
+        return BloomSHA1.btoh(b);
+    }
+}
+
+