1/*
2 * Copyright (C) 2011, 2015 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23 * THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#pragma once
27
28#include <array>
29#include <wtf/text/AtomString.h>
30
31namespace WTF {
32
33// Bloom filter with k=2. Uses 2^keyBits/8 bytes of memory.
34// False positive rate is approximately (1-e^(-2n/m))^2, where n is the number of unique
35// keys and m is the table size (==2^keyBits).
36// See http://en.wikipedia.org/wiki/Bloom_filter
37template <unsigned keyBits>
38class BloomFilter {
39 WTF_MAKE_FAST_ALLOCATED;
40public:
41 static const size_t tableSize = 1 << keyBits;
42
43 BloomFilter();
44
45 void add(unsigned hash);
46 // For example SHA1::Digest.
47 template <size_t hashSize> void add(const std::array<uint8_t, hashSize>&);
48
49 void add(const BloomFilter&);
50
51 // The filter may give false positives (claim it may contain a key it doesn't)
52 // but never false negatives (claim it doesn't contain a key it does).
53 bool mayContain(unsigned hash) const;
54 template <size_t hashSize> bool mayContain(const std::array<uint8_t, hashSize>&) const;
55
56 void clear();
57
58 void add(const AtomString& string) { add(string.impl()->existingHash()); }
59 void add(const String& string) { add(string.impl()->hash()); }
60 bool mayContain(const AtomString& string) const { return mayContain(string.impl()->existingHash()); }
61 bool mayContain(const String& string) const { return mayContain(string.impl()->hash()); }
62
63private:
64 static const unsigned bitsPerPosition = 8 * sizeof(unsigned);
65 static const unsigned keyMask = (1 << keyBits) - 1;
66 static unsigned arrayIndex(unsigned key) { return key / bitsPerPosition; }
67 static unsigned bitMask(unsigned key) { return 1 << (key % bitsPerPosition); }
68 template <size_t hashSize> static std::pair<unsigned, unsigned> keysFromHash(const std::array<uint8_t, hashSize>&);
69
70 bool isBitSet(unsigned key) const;
71 void setBit(unsigned key);
72
73 std::array<unsigned, tableSize / bitsPerPosition> m_bitArray;
74};
75
76template <unsigned keyBits>
77inline BloomFilter<keyBits>::BloomFilter()
78 : m_bitArray()
79{
80}
81
82template <unsigned keyBits>
83inline bool BloomFilter<keyBits>::mayContain(unsigned hash) const
84{
85 // The top and bottom bits of the incoming hash are treated as independent bloom filter hash functions.
86 // This works well as long as the filter size is not much above 2^16.
87 return isBitSet(hash) && isBitSet(hash >> 16);
88}
89
90template <unsigned keyBits>
91inline void BloomFilter<keyBits>::add(unsigned hash)
92{
93 setBit(hash);
94 setBit(hash >> 16);
95}
96
97template <unsigned keyBits>
98template <size_t hashSize>
99inline std::pair<unsigned, unsigned> BloomFilter<keyBits>::keysFromHash(const std::array<uint8_t, hashSize>& hash)
100{
101 // We could use larger k value than 2 for long hashes.
102 static_assert(hashSize >= 2 * sizeof(unsigned), "Hash array too short");
103 return {
104 *reinterpret_cast<const unsigned*>(hash.data()),
105 *reinterpret_cast<const unsigned*>(hash.data() + sizeof(unsigned))
106 };
107}
108
109template <unsigned keyBits>
110template <size_t hashSize>
111inline bool BloomFilter<keyBits>::mayContain(const std::array<uint8_t, hashSize>& hash) const
112{
113 auto keys = keysFromHash(hash);
114 return isBitSet(keys.first) && isBitSet(keys.second);
115}
116
117template <unsigned keyBits>
118template <size_t hashSize>
119inline void BloomFilter<keyBits>::add(const std::array<uint8_t, hashSize>& hash)
120{
121 auto keys = keysFromHash(hash);
122 setBit(keys.first);
123 setBit(keys.second);
124}
125
126template <unsigned keyBits>
127inline void BloomFilter<keyBits>::add(const BloomFilter& other)
128{
129 for (size_t i = 0; i < m_bitArray.size(); ++i)
130 m_bitArray[i] |= other.m_bitArray[i];
131}
132
133template <unsigned keyBits>
134bool BloomFilter<keyBits>::isBitSet(unsigned key) const
135{
136 unsigned maskedKey = key & keyMask;
137 ASSERT(arrayIndex(maskedKey) < m_bitArray.size());
138 return m_bitArray[arrayIndex(maskedKey)] & bitMask(maskedKey);
139}
140
141template <unsigned keyBits>
142void BloomFilter<keyBits>::setBit(unsigned key)
143{
144 unsigned maskedKey = key & keyMask;
145 ASSERT(arrayIndex(maskedKey) < m_bitArray.size());
146 m_bitArray[arrayIndex(maskedKey)] |= bitMask(maskedKey);
147}
148
149template <unsigned keyBits>
150inline void BloomFilter<keyBits>::clear()
151{
152 m_bitArray.fill(0);
153}
154
155// Counting bloom filter with 8 bit counters. Uses 2^keyBits bytes of memory. Error rates as above.
156// See http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters
157template <unsigned keyBits>
158class CountingBloomFilter {
159 WTF_MAKE_FAST_ALLOCATED;
160public:
161 static const size_t tableSize = 1 << keyBits;
162 static unsigned maximumCount() { return std::numeric_limits<uint8_t>::max(); }
163
164 CountingBloomFilter();
165
166 void add(unsigned hash);
167 void remove(unsigned hash);
168
169 // The filter may give false positives (claim it may contain a key it doesn't)
170 // but never false negatives (claim it doesn't contain a key it does).
171 bool mayContain(unsigned hash) const { return firstBucket(hash) && secondBucket(hash); }
172
173 // The filter must be cleared before reuse even if all keys are removed.
174 // Otherwise overflowed keys will stick around.
175 void clear();
176
177 void add(const AtomString& string) { add(string.impl()->existingHash()); }
178 void add(const String& string) { add(string.impl()->hash()); }
179 void remove(const AtomString& string) { remove(string.impl()->existingHash()); }
180 void remove(const String& string) { remove(string.impl()->hash()); }
181
182 bool mayContain(const AtomString& string) const { return mayContain(string.impl()->existingHash()); }
183 bool mayContain(const String& string) const { return mayContain(string.impl()->hash()); }
184
185#if !ASSERT_DISABLED
186 // Slow.
187 bool likelyEmpty() const;
188 bool isClear() const;
189#endif
190
191private:
192 static const unsigned keyMask = (1 << keyBits) - 1;
193
194 uint8_t& firstBucket(unsigned hash) { return m_buckets[hash & keyMask]; }
195 uint8_t& secondBucket(unsigned hash) { return m_buckets[(hash >> 16) & keyMask]; }
196 const uint8_t& firstBucket(unsigned hash) const { return m_buckets[hash & keyMask]; }
197 const uint8_t& secondBucket(unsigned hash) const { return m_buckets[(hash >> 16) & keyMask]; }
198
199 std::array<uint8_t, tableSize> m_buckets;
200};
201
202template <unsigned keyBits>
203inline CountingBloomFilter<keyBits>::CountingBloomFilter()
204 : m_buckets()
205{
206}
207
208template <unsigned keyBits>
209inline void CountingBloomFilter<keyBits>::add(unsigned hash)
210{
211 auto& first = firstBucket(hash);
212 auto& second = secondBucket(hash);
213 if (LIKELY(first < maximumCount()))
214 ++first;
215 if (LIKELY(second < maximumCount()))
216 ++second;
217}
218
219template <unsigned keyBits>
220inline void CountingBloomFilter<keyBits>::remove(unsigned hash)
221{
222 auto& first = firstBucket(hash);
223 auto& second = secondBucket(hash);
224 ASSERT(first);
225 ASSERT(second);
226 // In case of an overflow, the bucket sticks in the table until clear().
227 if (LIKELY(first < maximumCount()))
228 --first;
229 if (LIKELY(second < maximumCount()))
230 --second;
231}
232
233template <unsigned keyBits>
234inline void CountingBloomFilter<keyBits>::clear()
235{
236 m_buckets.fill(0);
237}
238
239#if !ASSERT_DISABLED
240template <unsigned keyBits>
241bool CountingBloomFilter<keyBits>::likelyEmpty() const
242{
243 for (auto& bucket : m_buckets) {
244 if (bucket && bucket != maximumCount())
245 return false;
246 }
247 return true;
248}
249
250template <unsigned keyBits>
251bool CountingBloomFilter<keyBits>::isClear() const
252{
253 for (auto& bucket : m_buckets) {
254 if (bucket)
255 return false;
256 }
257 return true;
258}
259#endif
260
261}
262
263using WTF::BloomFilter;
264using WTF::CountingBloomFilter;
265