首页 > 程序开发 > 软件开发 > C++ >

一个用于白名单服务的布隆过滤器(bloom filter)

2014-11-27

bloom filter这种数据结构用于判断一个元素是否在集合内,当然,这种功能也可以由HashMap来实现。bloom filter与HashMap的区别在于,HashMap会储存代表这个元素的key自身(如key为 "IKnow7 ",那么HashMap将存储

bloom filter这种数据结构用于判断一个元素是否在集合内,当然,这种功能也可以由HashMap来实现。bloom filter与HashMap的区别在于,HashMap会储存代表这个元素的key自身(如key为"IKnow7",那么HashMap将存储"IKnow7"这12个字节(java),其实还需要包括引用大小,但java中相同string只存一份),而bloom filter在底层只会使用几个bit来代表这个元素。在速度上,bloom filter对比与HashMap相差不大,底层同样是hash+随机访问。由于bloom filter对空间节省的特性,bloom filter适合判断一个元素是否在海量数据集合中。

bloom filter的一些概念

bloom filter并非十全十美。bloom filter在添加元素时,会将对象hash到底层位图数组的k个位上,对这些位,bloom filter会将其值设为1。由于hash函数特性以及位图数组长度有限,不同的对象可能在某些位上有重叠。bloom filter在检查元素是否存在时,会检查该对象所对应的k个位是否为1,如果全部都为1表示存在,这里就出现问题了,这些位上的1未必是该元素之前设置的,有可能是别的元素所设置的,所以会造成一些误判,即原本不在bloom filter中的一些元素也被判别在bloom filter中。bloom filter的这种误判被称为"积极的误判",即存在的元素的一定会通过,不存在的元素也有可能通过,而不会造成对存在的元素结果为否的判定。 \ \ 可以简单猜测,误判的概率与hash的选择、位图数组的大小、当前元素的数量以及K(映射位的个数)有关。一般来说,hash值越平均、位图数组越大、元素数量越少那么误判的概率就越低。 这是一个大牛写的关于bloom filter设计与误判率的理论分析,大伙可以去看看:https://www.cnblogs.com/allensun/archive/2011/02/16/1956532.html。

bloom filter在web上的应用<喎"https://www.2cto.com/kf/ware/vc/" target="_blank" class="keylink">vc3Ryb25nPjwvaDE+CiAgICAg1Np3ZWLTptPD1tDO0sPHvq2zo9Do0qrKudPDsNfD+7WlwLS5/cLL0rvQqcfrx/OjrNPD0tSx3MPi0rvQqc7e0Ke1xMr9vt2/4rfDzsq78tXftvHS4rmlu/eho7bU09rUytDt0rvQqc7zxdDCysfStObU2rqjwb/K/b7dtcSw18P7taXAtMu1o6zKudPDYmxvb20gZmlsdGVyyseyu7b+tcTRodTxoaMKPGJyPgoKPGgxPjxzdHJvbmc+yrnTw2Jsb29tIGZpbHRlcsq1z9bSu7j21qez1tT2wb/H68fztcSw18P7taU8L3N0cm9uZz48L2gxPgogICAgILDXw/u1pc2os6PKx9Do0qq4/NDCtcSjrLj80MK1xLe9yr3Su7Dj09DIq8G/us3U9sG/uPzQwqGjyKvBv7K7sdjLtaOs1tjQwrao0uW49mJsb29tIGZpbHRlcr2rtbHHsMv509DK/b7dt8XI68bk1tC8tL/JoaPU9sG/uPzQwrXEu7CjrNK7sOO74czhuanSu7bOyrG85MTa0MLU9rrNyb6z/bXEyv2+3aOsy/nS1NDo0qrU2rDXw/u1pdbQvavK/b7dvfjQ0LrPsqKjrLjDzO2807XEzO2806OsuMPJvrP9tcTJvrP9oaMKICAgICC/ycrHLi4uLi4uINStyfq1xGJsb29tIGZpbHRlcrKisrvWp7PW1KrL2LXEyb6z/bLZ1/ejrNLyzqrEs9K7zru/ycTczqq24Lj21KrL2Mv508Oho9K71tayu8fQyrW8yrXEz+u3qMrHzqpibG9vbSBmaWx0ZXK1xMO/0rvOu8no1sPSu7j20v3Tw7zGyv2jrMO/yb6z/dK7uPbUqsvYvPUxoaMKICAgICDSu9bWv8nQ0LXE1/a3qMrHo6zB7c3iyrnTw9K7uPZtYXDAtLGjtObS0cm+s/21xNSqy9ijrNTaxdC2z9Sqy9jKx7fxtObU2sqxz8jF0LbP1Nq4w2RlbGV0ZW1hcNbQyse38bTm1NqjrMjnufu05tTao6zWsb3TZmFsc2Who8jnufuyu7Tm1NqjrNTZzai5/WJsb29tIGZpbHRlcr340NDF0LbPoaPU2tDCzO2809Sqy9jKsaOsyOe5+2RlbGV0ZW1hcNbQtObU2qOsyb6z/bjDZGVsZXRlbWFw1tC1xLjD1KrL2KOs1NnM7bzTtb1ibG9vbSBmaWx0ZXLW0KGj1NrKtbzK06bTw9bQo6zKudPDsNfD+7WltcSzob6w0OjSqsm+s/21xNSqy9jSu7Djyse9z8nZtcSjrMv50tTV4tbWt73KvbTT0KfCysrHv8nQ0LXEoaPV4tbWt73KvbTm1NrSu7j2zsrM4qOstbFkZWxldGVtYXDW0NSqy9i5/bbgyrGjrMrGsdi74dTss8libG9vbQogZmlsdGVytcTO88XQwsrJz8n9o6zS8s6qxLPQqdStsb6xu8m+s/3UqsvYyejWw86qMbXEzruyosO709Cxu7npMKGjuMPOyszitcS94r72tOvKqcrHo6y1sWRlbGV0ZW1hcLXEyN3Bv7W9tO+1xNK7uPa958/fyrGjrMq508PIq8G/zayyvbj80MK4w2Jsb29tIGZpbHRlcqGjCjxicj4KCjxoMT48c3Ryb25nPrDXw/u1pWJsb29tIGZpbHRlcrXEyrXP1jwvc3Ryb25nPjwvaDE+CiAgICAg1eLA4Lm5vP64tNPD0NS63Me/o6y/ydLUx+HLybXEvK+zybW9z9bT0LXEtPrC69auyc+ho8/Cw+bWsb3TzPmz9sC0o7oKPHByZSBjbGFzcz0="brush:java;">public class BloomFilter implements Serializable { private static final long serialVersionUID = 3507830443935243576L; private long timestamp;//用于时间戳更新机制 private HashMap deleteMap ; //储存已删除元素 private BitSet bitset;//位图存储 private int bitSetSize; // expected (maximum) number of elements to be added private int expectedNumberOfFilterElements; // number of elements actually added to the Bloom filter private int numberOfAddedElements; private int k; //每一个元素对应k个位 // encoding used for storing hash values as strings static Charset charset = Charset.forName("UTF-8"); // MD5 gives good enough accuracy in most circumstances. // Change to SHA1 if it&#39;s needed static String hashName = "MD5"; static final MessageDigest digestFunction; static { // The digest method is reused between instances to provide higher entropy. MessageDigest tmp; try { tmp = java.security.MessageDigest.getInstance(hashName); } catch (NoSuchAlgorithmException e) { tmp = null; } digestFunction = tmp; } /** * Constructs an empty Bloom filter. * * @param bitSetSize defines how many bits should be used for the filter. * @param expectedNumberOfFilterElements defines the maximum * number of elements the filter is expected to contain. */ public BloomFilter(int bitSetSize, int expectedNumberOfFilterElements) { this.expectedNumberOfFilterElements = expectedNumberOfFilterElements; this.k = (int) Math.round( (bitSetSize / expectedNumberOfFilterElements) * Math.log(2.0)); bitset = new BitSet(bitSetSize); deleteMap = new HashMap(); this.bitSetSize = bitSetSize; numberOfAddedElements = 0; } /** * Generates a digest based on the contents of a String. * * @param val specifies the input data. * @param charset specifies the encoding of the input data. * @return digest as long. */ public static long createHash(String val, Charset charset) { try { return createHash(val.getBytes(charset.name())); } catch (UnsupportedEncodingException e) { e.printStackTrace(); // Ingore } return -1; } /** * Generates a digest based on the contents of a String. * * @param val specifies the input data. The encoding is expected to be UTF-8. * @return digest as long. */ public static long createHash(String val) { return createHash(val, charset); } /** * Generates a digest based on the contents of an array of bytes. * * @param data specifies input data. * @return digest as long. */ public static long createHash(byte[] data) { long h = 0; byte[] res; synchronized (digestFunction) { res = digestFunction.digest(data); } for (int i = 0; i < 4; i++) { h <<= 8; h |= ((int) res[i]) & 0xFF; } return h; } /** * Compares the contents of two instances to see if they are equal. * * @param obj is the object to compare to. * @return True if the contents of the objects are equal. */ @SuppressWarnings("unchecked") @Override public boolean equals(Object obj) { if (obj == null) { return false; } if (getClass() != obj.getClass()) { return false; } final BloomFilter other = (BloomFilter) obj; if (this.expectedNumberOfFilterElements != other.expectedNumberOfFilterElements) { return false; } if (this.k != other.k) { return false; } if (this.bitSetSize != other.bitSetSize) { return false; } if (this.bitset != other.bitset && (this.bitset == null || !this.bitset.equals(other.bitset))) { return false; } return true; } /** * Calculates a hash code for this class. * @return hash code representing the contents of an instance of this class. */ @Override public int hashCode() { int hash = 7; hash = 61 * hash + (this.bitset != null ? this.bitset.hashCode() : 0); hash = 61 * hash + this.expectedNumberOfFilterElements; hash = 61 * hash + this.bitSetSize; hash = 61 * hash + this.k; return hash; } /** * Calculates the expected probability of false positives based on * the number of expected filter elements and the size of the Bloom filter. *

* The value returned by this method is the expected rate of false * positives, assuming the number of inserted elements equals the number of * expected elements. If the number of elements in the Bloom filter is less * than the expected value, the true probability of false positives will be lower. * * @return expected probability of false positives. */ public double expectedFalsePositiveProbability() { return getFalsePositiveProbability(expectedNumberOfFilterElements); } /** * Calculate the probability of a false positive given the specified * number of inserted elements. * * @param numberOfElements number of inserted elements. * @return probability of a false positive. */ public double getFalsePositiveProbability(double numberOfElements) { // (1 - e^(-k * n / m)) ^ k return Math.pow((1 - Math.exp(-k * (double) numberOfElements / (double) bitSetSize)), k); } /** * Get the current probability of a false positive. The probability is calculated from * the size of the Bloom filter and the current number of elements added to it. * * @return probability of false positives. */ public double getFalsePositiveProbability() { return getFalsePositiveProbability(numberOfAddedElements); } /** * Returns the value chosen for K.
*
* K is the optimal number of hash functions based on the size * of the Bloom filter and the expected number of inserted elements. * * @return optimal k. */ public int getK() { return k; } /** * Sets all bits to false in the Bloom filter. */ public void clear() { bitset.clear(); numberOfAddedElements = 0; } /** * Adds an object to the Bloom filter. The output from the object&#39;s * toString() method is used as input to the hash functions. * * @param element is an element to register in the Bloom filter. */ public void add(E element) { deleteMap.remove(element); long hash; String valString = element.toString(); for (int x = 0; x < k; x++) { hash = createHash(valString + Integer.toString(x)); hash = hash % (long)bitSetSize; bitset.set(Math.abs((int)hash), true); } numberOfAddedElements ++; } /** * Remove all elements from a Collection to the Bloom filter. * @param c Collection of elements. */ public void removeAll(Collection c) { for (E element : c) remove(element); } public void remove(E element) { deleteMap.put(element, Boolean.TRUE); } public int getDeleteMapSize(){ return deleteMap.size(); } /** * Adds all elements from a Collection to the Bloom filter. * @param c Collection of elements. */ public void addAll(Collection c) { for (E element : c) { if (element != null) add(element); } } /** * Returns true if the element could have been inserted into the Bloom filter. * Use getFalsePositiveProbability() to calculate the probability of this * being correct. * * @param element element to check. * @return true if the element could have been inserted into the Bloom filter. */ public boolean contains(E element) { Boolean contains = deleteMap.get(element); if (contains != null && contains) return false; long hash; String valString = element.toString(); for (int x = 0; x < k; x++) { hash = createHash(valString + Integer.toString(x)); hash = hash % (long) bitSetSize; if (!bitset.get(Math.abs((int) hash))) return false; } return true; } /** * Returns true if all the elements of a Collection could have been inserted * into the Bloom filter. Use getFalsePositiveProbability() to calculate the * probability of this being correct. * @param c elements to check. * @return true if all the elements in c could have been inserted into the Bloom filter. */ public boolean containsAll(Collection c) { for (E element : c) if (!contains(element)) return false; return true; } /** * Read a single bit from the Bloom filter. * @param bit the bit to read. * @return true if the bit is set, false if it is not. */ public boolean getBit(int bit) { return bitset.get(bit); } /** * Set a single bit in the Bloom filter. * @param bit is the bit to set. * @param value If true, the bit is set. If false, the bit is cleared. */ public void setBit(int bit, boolean value) { bitset.set(bit, value); } /** * Return the bit set used to store the Bloom filter. * @return bit set representing the Bloom filter. */ public BitSet getBitSet() { return bitset; } /** * Returns the number of bits in the Bloom filter. Use count() to retrieve * the number of inserted elements. * * @return the size of the bitset used by the Bloom filter. */ public int size() { return this.bitSetSize; } /** * Returns the number of elements added to the Bloom filter after it * was constructed or after clear() was called. * * @return number of elements added to the Bloom filter. */ public int count() { return this.numberOfAddedElements; } /** * Returns the expected number of elements to be inserted into the filter. * This value is the same value as the one passed to the constructor. * * @return expected number of elements. */ public int getExpectedNumberOfElements() { return expectedNumberOfFilterElements; } /** * 返回更新的时间戳机制 * @return */ public long getTimestamp() { return timestamp; } /** * 设置跟新的时间戳 * @param timestamp */ public void setTimestamp(long timestamp) { this.timestamp = timestamp; } @Override public String toString() { return "BloomFilter [timestamp=" + timestamp + ", bitSetSize=" + bitSetSize + ", expectedNumberOfFilterElements=" + expectedNumberOfFilterElements + ", numberOfAddedElements=" + numberOfAddedElements + ", k=" + k +",deleteMapSize=" +getDeleteMapSize()+"]"; } }


热点推荐