/* * Copyright (C) 2011, 2015 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BloomFilter_h #define BloomFilter_h #include #include namespace WTF { // Bloom filter with k=2. Uses 2^keyBits/8 bytes of memory. // False positive rate is approximately (1-e^(-2n/m))^2, where n is the number of unique // keys and m is the table size (==2^keyBits). // See http://en.wikipedia.org/wiki/Bloom_filter template class BloomFilter { WTF_MAKE_FAST_ALLOCATED; public: static const size_t tableSize = 1 << keyBits; BloomFilter(); void add(unsigned hash); // For example SHA1::Digest. template void add(const std::array&); void add(const BloomFilter&); // The filter may give false positives (claim it may contain a key it doesn't) // but never false negatives (claim it doesn't contain a key it does). bool mayContain(unsigned hash) const; template bool mayContain(const std::array&) const; void clear(); void add(const AtomicString& string) { add(string.impl()->existingHash()); } void add(const String& string) { add(string.impl()->hash()); } bool mayContain(const AtomicString& string) const { return mayContain(string.impl()->existingHash()); } bool mayContain(const String& string) const { return mayContain(string.impl()->hash()); } private: static const unsigned bitsPerPosition = 8 * sizeof(unsigned); static const unsigned keyMask = (1 << keyBits) - 1; static unsigned arrayIndex(unsigned key) { return key / bitsPerPosition; } static unsigned bitMask(unsigned key) { return 1 << (key % bitsPerPosition); } template static std::pair keysFromHash(const std::array&); bool isBitSet(unsigned key) const; void setBit(unsigned key); std::array m_bitArray; }; template inline BloomFilter::BloomFilter() : m_bitArray() { } template inline bool BloomFilter::mayContain(unsigned hash) const { // The top and bottom bits of the incoming hash are treated as independent bloom filter hash functions. // This works well as long as the filter size is not much above 2^16. return isBitSet(hash) && isBitSet(hash >> 16); } template inline void BloomFilter::add(unsigned hash) { setBit(hash); setBit(hash >> 16); } template template inline std::pair BloomFilter::keysFromHash(const std::array& hash) { // We could use larger k value than 2 for long hashes. static_assert(hashSize >= 2 * sizeof(unsigned), "Hash array too short"); return { *reinterpret_cast(hash.data()), *reinterpret_cast(hash.data() + sizeof(unsigned)) }; } template template inline bool BloomFilter::mayContain(const std::array& hash) const { auto keys = keysFromHash(hash); return isBitSet(keys.first) && isBitSet(keys.second); } template template inline void BloomFilter::add(const std::array& hash) { auto keys = keysFromHash(hash); setBit(keys.first); setBit(keys.second); } template inline void BloomFilter::add(const BloomFilter& other) { for (size_t i = 0; i < m_bitArray.size(); ++i) m_bitArray[i] |= other.m_bitArray[i]; } template bool BloomFilter::isBitSet(unsigned key) const { unsigned maskedKey = key & keyMask; ASSERT(arrayIndex(maskedKey) < m_bitArray.size()); return m_bitArray[arrayIndex(maskedKey)] & bitMask(maskedKey); } template void BloomFilter::setBit(unsigned key) { unsigned maskedKey = key & keyMask; ASSERT(arrayIndex(maskedKey) < m_bitArray.size()); m_bitArray[arrayIndex(maskedKey)] |= bitMask(maskedKey); } template inline void BloomFilter::clear() { m_bitArray.fill(0); } // Counting bloom filter with 8 bit counters. Uses 2^keyBits bytes of memory. Error rates as above. // See http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters template class CountingBloomFilter { WTF_MAKE_FAST_ALLOCATED; public: static const size_t tableSize = 1 << keyBits; static unsigned maximumCount() { return std::numeric_limits::max(); } CountingBloomFilter(); void add(unsigned hash); void remove(unsigned hash); // The filter may give false positives (claim it may contain a key it doesn't) // but never false negatives (claim it doesn't contain a key it does). bool mayContain(unsigned hash) const { return firstBucket(hash) && secondBucket(hash); } // The filter must be cleared before reuse even if all keys are removed. // Otherwise overflowed keys will stick around. void clear(); void add(const AtomicString& string) { add(string.impl()->existingHash()); } void add(const String& string) { add(string.impl()->hash()); } void remove(const AtomicString& string) { remove(string.impl()->existingHash()); } void remove(const String& string) { remove(string.impl()->hash()); } bool mayContain(const AtomicString& string) const { return mayContain(string.impl()->existingHash()); } bool mayContain(const String& string) const { return mayContain(string.impl()->hash()); } #if !ASSERT_DISABLED // Slow. bool likelyEmpty() const; bool isClear() const; #endif private: static const unsigned keyMask = (1 << keyBits) - 1; uint8_t& firstBucket(unsigned hash) { return m_buckets[hash & keyMask]; } uint8_t& secondBucket(unsigned hash) { return m_buckets[(hash >> 16) & keyMask]; } const uint8_t& firstBucket(unsigned hash) const { return m_buckets[hash & keyMask]; } const uint8_t& secondBucket(unsigned hash) const { return m_buckets[(hash >> 16) & keyMask]; } std::array m_buckets; }; template inline CountingBloomFilter::CountingBloomFilter() : m_buckets() { } template inline void CountingBloomFilter::add(unsigned hash) { auto& first = firstBucket(hash); auto& second = secondBucket(hash); if (LIKELY(first < maximumCount())) ++first; if (LIKELY(second < maximumCount())) ++second; } template inline void CountingBloomFilter::remove(unsigned hash) { auto& first = firstBucket(hash); auto& second = secondBucket(hash); ASSERT(first); ASSERT(second); // In case of an overflow, the bucket sticks in the table until clear(). if (LIKELY(first < maximumCount())) --first; if (LIKELY(second < maximumCount())) --second; } template inline void CountingBloomFilter::clear() { m_buckets.fill(0); } #if !ASSERT_DISABLED template bool CountingBloomFilter::likelyEmpty() const { for (auto& bucket : m_buckets) { if (bucket && bucket != maximumCount()) return false; } return true; } template bool CountingBloomFilter::isClear() const { for (auto& bucket : m_buckets) { if (bucket) return false; } return true; } #endif } using WTF::BloomFilter; using WTF::CountingBloomFilter; #endif