package com.search; import java.io.Serializable; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.function.Function; /** * A simple implementation of Bloom filter. *

* Bloom filter have a chance of being wrong. *

* The Bloom filter assert that elements that do not exist must not exist, * if assert an element exists, but not necessarily. *

* The accuracy rate depends on capacity and hash functions. * */ public class BloomFilter implements Serializable { private static final long serialVersionUID = -4466610350741278658L; private static final int LONG_SHIFT = 6; private static final int LONG_MASK = 63; /** * hash functions */ private final List> hashFunctions; private final long[] table; private final int tableMask; private int size; /** * @param capacity the filter capacity * @param hashFunctions hash functions * @see Builder must be build by {@link Builder}. */ private BloomFilter(int capacity, List> hashFunctions) { this.hashFunctions = hashFunctions; int cap = nextPowerOf2(capacity); tableMask = (cap << LONG_SHIFT) - 1; table = new long[cap]; } public static Builder builder(int capacity) { if (capacity < 1) { throw new IllegalStateException("capacity must be > 0"); } return new Builder(capacity); } /** * Add an element to the Bloom filter. */ public void add(String element) { checkNotNull(element, "element"); for (Function hashFunction : hashFunctions) { int key = hashFunction.apply(element) & tableMask; table[key >>> LONG_SHIFT] |= (1 << (key & LONG_MASK)); } size++; } /** * @return true if the element exists, otherwise false. */ public boolean contains(String element) { if (element == null) { return false; } for (Function hashFunction : hashFunctions) { int key = hashFunction.apply(element) & tableMask; if ((table[key >>> LONG_SHIFT] & (1 << (key & LONG_MASK))) == 0) { return false; } } return true; } public List> getHashFunctions() { return hashFunctions; } public int size() { return size; } private static void checkNotNull(String element, String msg) { if (element == null) { throw new NullPointerException(msg + " must be not null"); } } private static int nextPowerOf2(int i) { int n = i - 1; n |= n >>> 1; n |= n >>> 2; n |= n >>> 4; n |= n >>> 8; n |= n >>> 16; return (n < 0) ? 1 : (n >= 0x40000000) ? 0x40000000 : n + 1; } /** * We need a list of unmodifiable hash functions. */ public static class Builder { private int capacity; private List> hashFunctions = new ArrayList<>(); private Builder(int capacity) { this.capacity = capacity; } public Builder addHashFunction(Function function) { hashFunctions.add(function); return this; } public BloomFilter build() { if (hashFunctions.isEmpty()) { addDefaultHashFunction(); } return new BloomFilter(capacity, Collections.unmodifiableList(hashFunctions)); } /** * I provides several default hash functions */ private void addDefaultHashFunction() { // Java String Hash Function hashFunctions.add(String::hashCode); // SDBM Hash Function hashFunctions.add(key -> { if (key == null || key.isEmpty()) { return 0; } int hash = 0; for (int i = 0; i < key.length(); i++) { hash = key.charAt(i) + (hash << 6) + (hash << 16) - hash; } hash &= 0x7ffffff; return hash; }); // Robert Sedgwicks Hash Function hashFunctions.add(key -> { if (key == null || key.isEmpty()) { return 0; } int hash = 0; int magic = 63689; for (int i = 0; i < key.length(); i++) { hash = hash * magic + key.charAt(i); magic *= 378551; } return hash; }); // Arash Partow Hash Function hashFunctions.add(key -> { if (key == null || key.isEmpty()) { return 0; } int hash = 0; for (int i = 0; i < key.length(); i++) { char ch = key.charAt(i); if ((i & 1) == 0) { hash ^= ((hash << 7) ^ ch ^ (hash >> 3)); } else { hash ^= (~((hash << 11) ^ ch ^ (hash >> 5))); } } return hash; }); } } }