AI 24/25 Project Software
Documentation for the AI 24/25 course programming project software
Loading...
Searching...
No Matches
int_hash_set.h
1#ifndef ALGORITHMS_INT_HASH_SET_H
2#define ALGORITHMS_INT_HASH_SET_H
3
4#include "downward/utils/collections.h"
5#include "downward/utils/logging.h"
6#include "downward/utils/system.h"
7
8#include <algorithm>
9#include <cassert>
10#include <iostream>
11#include <limits>
12#include <utility>
13#include <vector>
14
15namespace int_hash_set {
16/*
17 Hash set for storing non-negative integer keys.
18
19 Compared to unordered_set<int> in the standard library, this
20 implementation is much more memory-efficient. It requires 8 bytes
21 per bucket, so roughly 12-16 bytes per entry with typical load
22 factors.
23
24 Usage:
25
26 IntHashSet<MyHasher, MyEqualityTester> s;
27 pair<int, bool> result1 = s.insert(3);
28 assert(result1 == make_pair(3, true));
29 pair<int, bool> result2 = s.insert(3);
30 assert(result2 == make_pair(3, false));
31
32 Limitations:
33
34 We use 32-bit (signed and unsigned) integers instead of larger data
35 types for keys and hashes to save memory.
36
37 Consequently, the range of valid keys is [0, 2^31 - 1]. This range
38 could be extended to [0, 2^32 - 2] without using more memory by
39 using unsigned integers for the keys and a different designated
40 value for empty buckets (currently we use -1 for this).
41
42 The maximum capacity (i.e., number of buckets) is 2^30 because we
43 use a signed integer to store it, we grow the hash set by doubling
44 its capacity, and the next larger power of 2 (2^31) is too big for
45 an int. The maximum capacity could be increased by storing the
46 number of buckets in a larger data type.
47
48 Note on hash functions:
49
50 Because the hash table uses open addressing, it is important to use
51 hash functions that distribute the values roughly uniformly. Hash
52 implementations intended for use with C++ standard containers often
53 do not satisfy this requirement, for example hashing ints or
54 pointers to themselves, which is not problematic for hash
55 implementations based on chaining but can be catastrophic for this
56 implementation.
57
58 Implementation:
59
60 All data and hashes are stored in a single vector, using open
61 addressing. We use ideas from hopscotch hashing
62 (https://en.wikipedia.org/wiki/Hopscotch_hashing) to ensure that
63 each key is at most "max_distance" buckets away from its ideal
64 bucket. This ensures constant lookup times since we always need to
65 check at most "max_distance" buckets. Since all buckets we need to
66 check for a given key are aligned in memory, the lookup has good
67 cache locality.
68
69*/
70
71using KeyType = int;
72using HashType = unsigned int;
73
74static_assert(sizeof(KeyType) == 4, "KeyType does not use 4 bytes");
75static_assert(sizeof(HashType) == 4, "HashType does not use 4 bytes");
76
77template <typename Hasher, typename Equal>
78class IntHashSet {
79 // Max distance from the ideal bucket to the actual bucket for each key.
80 static const int MAX_DISTANCE = 32;
81 static const unsigned int MAX_BUCKETS =
82 std::numeric_limits<unsigned int>::max();
83
84 struct Bucket {
85 KeyType key;
86 HashType hash;
87
88 static const KeyType empty_bucket_key = -1;
89
90 Bucket()
91 : key(empty_bucket_key)
92 , hash(0)
93 {
94 }
95
96 Bucket(KeyType key, HashType hash)
97 : key(key)
98 , hash(hash)
99 {
100 }
101
102 bool full() const { return key != empty_bucket_key; }
103 };
104
105 Hasher hasher;
106 Equal equal;
107 std::vector<Bucket> buckets;
108 int num_entries;
109 int num_resizes;
110
111 int capacity() const { return buckets.size(); }
112
113 void rehash(int new_capacity)
114 {
115 assert(new_capacity >= 1);
116 int num_entries_before = num_entries;
117 std::vector<Bucket> old_buckets = std::move(buckets);
118 assert(buckets.empty());
119 num_entries = 0;
120 buckets.resize(new_capacity);
121 for (const Bucket& bucket : old_buckets) {
122 if (bucket.full()) {
123 insert(bucket.key, bucket.hash);
124 }
125 }
126 (void)num_entries_before;
127 assert(num_entries == num_entries_before);
128 ++num_resizes;
129 }
130
131 void enlarge()
132 {
133 unsigned int num_buckets = buckets.size();
134 // Verify that the number of buckets is a power of 2.
135 assert((num_buckets & (num_buckets - 1)) == 0);
136 if (num_buckets > MAX_BUCKETS / 2) {
137 std::cerr << "IntHashSet surpassed maximum capacity. This means"
138 " you either use IntHashSet for high-memory"
139 " applications for which it was not designed, or there"
140 " is an unexpectedly high number of hash collisions"
141 " that should be investigated. Aborting."
142 << std::endl;
143 utils::exit_with(utils::ExitCode::SEARCH_CRITICAL_ERROR);
144 }
145 rehash(num_buckets * 2);
146 }
147
148 int get_bucket(HashType hash) const
149 {
150 assert(!buckets.empty());
151 unsigned int num_buckets = buckets.size();
152 // Verify that the number of buckets is a power of 2.
153 assert((num_buckets & (num_buckets - 1)) == 0);
154 /* We want to return hash % num_buckets. The following line does this
155 because we know that num_buckets is a power of 2. */
156 return hash & (num_buckets - 1);
157 }
158
159 /*
160 Return distance from index1 to index2, only moving right and wrapping
161 from the last to the first bucket.
162 */
163 int get_distance(int index1, int index2) const
164 {
165 assert(utils::in_bounds(index1, buckets));
166 assert(utils::in_bounds(index2, buckets));
167 if (index2 >= index1) {
168 return index2 - index1;
169 } else {
170 return capacity() + index2 - index1;
171 }
172 }
173
174 int find_next_free_bucket_index(int index) const
175 {
176 assert(num_entries < capacity());
177 assert(utils::in_bounds(index, buckets));
178 while (buckets[index].full()) {
179 index = get_bucket(index + 1);
180 }
181 return index;
182 }
183
184 KeyType find_equal_key(KeyType key, HashType hash) const
185 {
186 assert(hasher(key) == hash);
187 int ideal_index = get_bucket(hash);
188 for (int i = 0; i < MAX_DISTANCE; ++i) {
189 int index = get_bucket(ideal_index + i);
190 const Bucket& bucket = buckets[index];
191 if (bucket.full() && bucket.hash == hash &&
192 equal(bucket.key, key)) {
193 return bucket.key;
194 }
195 }
196 return Bucket::empty_bucket_key;
197 }
198
199 /*
200 Private method that inserts a key and its corresponding hash into the
201 hash set.
202
203 The method ensures that each key is at most "max_distance" buckets away
204 from its ideal bucket by moving the closest free bucket towards the ideal
205 bucket. If this can't be achieved, we resize the vector, reinsert the old
206 keys and try inserting the new key again.
207
208 For the return type, see the public insert() method.
209
210 Note that the private insert() may call enlarge() and therefore rehash(),
211 which itself calls the private insert() again.
212 */
213 std::pair<KeyType, bool> insert(KeyType key, HashType hash)
214 {
215 assert(hasher(key) == hash);
216
217 /* If the hash set already contains the key, return the key and a
218 Boolean indicating that no new key has been inserted. */
219 KeyType equal_key = find_equal_key(key, hash);
220 if (equal_key != Bucket::empty_bucket_key) {
221 return std::make_pair(equal_key, false);
222 }
223
224 assert(num_entries <= capacity());
225 if (num_entries == capacity()) {
226 enlarge();
227 }
228 assert(num_entries < capacity());
229
230 // Compute ideal bucket.
231 int ideal_index = get_bucket(hash);
232
233 // Find first free bucket left of the ideal bucket.
234 int free_index = find_next_free_bucket_index(ideal_index);
235
236 /*
237 While the free bucket is too far from the ideal bucket, move the free
238 bucket towards the ideal bucket by swapping a suitable third bucket
239 with the free bucket. A full and an empty bucket can be swapped if
240 the swap doesn't move the full bucket too far from its ideal
241 position.
242 */
243 while (get_distance(ideal_index, free_index) >= MAX_DISTANCE) {
244 bool swapped = false;
245 int num_buckets = capacity();
246 int max_offset = std::min(MAX_DISTANCE, num_buckets) - 1;
247 for (int offset = max_offset; offset >= 1; --offset) {
248 assert(offset < num_buckets);
249 int candidate_index = free_index + num_buckets - offset;
250 assert(candidate_index >= 0);
251 candidate_index = get_bucket(candidate_index);
252 HashType candidate_hash = buckets[candidate_index].hash;
253 int candidate_ideal_index = get_bucket(candidate_hash);
254 if (get_distance(candidate_ideal_index, free_index) <
255 MAX_DISTANCE) {
256 // Candidate can be swapped.
257 std::swap(buckets[candidate_index], buckets[free_index]);
258 free_index = candidate_index;
259 swapped = true;
260 break;
261 }
262 }
263 if (!swapped) {
264 /* Free bucket could not be moved close enough to ideal bucket.
265 -> Enlarge and try inserting again. */
266 enlarge();
267 return insert(key, hash);
268 }
269 }
270 assert(utils::in_bounds(free_index, buckets));
271 assert(!buckets[free_index].full());
272 buckets[free_index] = Bucket(key, hash);
273 ++num_entries;
274 return std::make_pair(key, true);
275 }
276
277public:
278 IntHashSet(const Hasher& hasher, const Equal& equal)
279 : hasher(hasher)
280 , equal(equal)
281 , buckets(1)
282 , num_entries(0)
283 , num_resizes(0)
284 {
285 }
286
287 int size() const { return num_entries; }
288
289 /*
290 Insert a key into the hash set.
291
292 Return a pair whose first item is the given key, or an equivalent key
293 already contained in the hash set. The second item in the pair is a bool
294 indicating whether a new key was inserted into the hash set.
295 */
296 std::pair<KeyType, bool> insert(KeyType key)
297 {
298 assert(key >= 0);
299 return insert(key, hasher(key));
300 }
301
302 void dump(utils::LogProxy& log) const
303 {
304 int num_buckets = capacity();
305 log << "[";
306 for (int i = 0; i < num_buckets; ++i) {
307 const Bucket& bucket = buckets[i];
308 if (bucket.full()) {
309 log << bucket.key;
310 } else {
311 log << "_";
312 }
313 if (i < num_buckets - 1) {
314 log << ", ";
315 }
316 }
317 log << "]" << std::endl;
318 }
319
320 void print_statistics(utils::LogProxy& log) const
321 {
322 assert(!buckets.empty());
323 int num_buckets = capacity();
324 assert(num_buckets != 0);
325 log << "Int hash set load factor: " << num_entries << "/" << num_buckets
326 << " = " << static_cast<double>(num_entries) / num_buckets
327 << std::endl;
328 log << "Int hash set resizes: " << num_resizes << std::endl;
329 }
330};
331
332template <typename Hasher, typename Equal>
333const int IntHashSet<Hasher, Equal>::MAX_DISTANCE;
334
335template <typename Hasher, typename Equal>
336const unsigned int IntHashSet<Hasher, Equal>::MAX_BUCKETS;
337} // namespace int_hash_set
338
339#endif