// kloessner/probfd/int__hash__set_8h_source.html

#ifndef ALGORITHMS_INT_HASH_SET_H

#define ALGORITHMS_INT_HASH_SET_H


#include "downward/utils/collections.h"

#include "downward/utils/logging.h"

#include "downward/utils/system.h"


#include <algorithm>

#include <cassert>

#include <iostream>

#include <limits>

#include <utility>

#include <vector>


namespace int_hash_set {

/*

  Hash set for storing non-negative integer keys.


  Compared to unordered_set<int> in the standard library, this

  implementation is much more memory-efficient. It requires 8 bytes

  per bucket, so roughly 12-16 bytes per entry with typical load

  factors.


  Usage:


  IntHashSet<MyHasher, MyEqualityTester> s(my_hasher, my_equality_tester);

  pair<int, bool> result1 = s.insert(3);

  assert(result1 == make_pair(3, true));

  pair<int, bool> result2 = s.insert(3);

  assert(result2 == make_pair(3, false));


  Limitations:


  We use 32-bit (signed and unsigned) integers instead of larger data

  types for keys and hashes to save memory.


  Consequently, the range of valid keys is [0, 2^31 - 1]. This range

  could be extended to [0, 2^32 - 2] without using more memory by

  using unsigned integers for the keys and a different designated

  value for empty buckets (currently we use -1 for this).


  The maximum capacity (i.e., number of buckets) is 2^30 because we

  use a signed integer to store it, we grow the hash set by doubling

  its capacity, and the next larger power of 2 (2^31) is too big for

  an int. The maximum capacity could be increased by storing the

  number of buckets in a larger data type.


  Note on hash functions:


  Because the hash table uses open addressing, it is important to use

  hash functions that distribute the values roughly uniformly. Hash

  implementations intended for use with C++ standard containers often

  do not satisfy this requirement, for example hashing ints or

  pointers to themselves, which is not problematic for hash

  implementations based on chaining but can be catastrophic for this

  implementation.


  Implementation:


  All data and hashes are stored in a single vector, using open

  addressing. We use ideas from hopscotch hashing

  (https://en.wikipedia.org/wiki/Hopscotch_hashing) to ensure that

  each key is at most "max_distance" buckets away from its ideal

  bucket. This ensures constant lookup times since we always need to

  check at most "max_distance" buckets. Since all buckets we need to

  check for a given key are aligned in memory, the lookup has good

  cache locality.


*/


// Type of the stored keys. It is signed because the value -1 is reserved
// to mark empty buckets (see IntHashSet::Bucket::empty_bucket_key).
using KeyType = int;
static_assert(sizeof(KeyType) == 4, "KeyType does not use 4 bytes");

// Type of the hash values stored next to each key. Together with the
// 4-byte key this gives the 8 bytes per bucket mentioned above.
using HashType = unsigned int;
static_assert(sizeof(HashType) == 4, "HashType does not use 4 bytes");


/*
  Open-addressing hash set for non-negative int keys.

  Template parameters:
    Hasher  callable as hasher(key); the result is stored as HashType.
    Equal   callable as equal(key1, key2); the result is used as a bool and
            decides whether two keys count as the same entry.

  The number of buckets is always a power of 2 (starting at 1 and only ever
  doubled), which lets get_bucket() replace the modulo by a bit mask.
*/
template <typename Hasher, typename Equal>
class IntHashSet {
    // Max distance from the ideal bucket to the actual bucket for each key.
    static const int MAX_DISTANCE = 32;

    /*
      Limit used by enlarge() to decide whether the capacity may be doubled
      once more.

      Capacities are handled as (signed) int in capacity() and rehash(), so
      the limit has to be derived from the int range: with this value,
      enlarge() refuses to double a table that already has 2^30 buckets.
      Deriving it from the unsigned int range would let enlarge() request
      2^31 buckets, a value that does not fit into an int.
    */
    static const unsigned int MAX_BUCKETS =
        std::numeric_limits<int>::max();

    static_assert(
        MAX_BUCKETS <=
            static_cast<unsigned int>(std::numeric_limits<int>::max()),
        "MAX_BUCKETS must fit into an int");

    /*
      A single slot of the table: the key and its cached hash. An empty slot
      is marked by the reserved key value empty_bucket_key.
    */
    struct Bucket {
        KeyType key;
        HashType hash;

        static const KeyType empty_bucket_key = -1;

        // Create an empty bucket.
        Bucket()
            : key(empty_bucket_key)
            , hash(0)
        {
        }

        // Create a bucket holding the given key and its hash.
        Bucket(KeyType key, HashType hash)
            : key(key)
            , hash(hash)
        {
        }

        // Return true iff the bucket holds a key.
        bool full() const { return key != empty_bucket_key; }
    };

    Hasher hasher;
    Equal equal;
    std::vector<Bucket> buckets;
    int num_entries;
    int num_resizes;

    // Number of buckets (full and empty) in the table.
    int capacity() const { return static_cast<int>(buckets.size()); }

    /*
      Replace the bucket vector by a fresh one with new_capacity buckets and
      reinsert all stored keys, reusing their cached hashes.
    */
    void rehash(int new_capacity)
    {
        assert(new_capacity >= 1);
        int num_entries_before = num_entries;
        std::vector<Bucket> old_buckets = std::move(buckets);
        assert(buckets.empty());
        num_entries = 0;
        buckets.resize(new_capacity);
        for (const Bucket& bucket : old_buckets) {
            if (bucket.full()) {
                insert(bucket.key, bucket.hash);
            }
        }
        // Only used by the assertion below; silence unused-variable
        // warnings when assertions are compiled out.
        (void)num_entries_before;
        assert(num_entries == num_entries_before);
        ++num_resizes;
    }

    /*
      Double the number of buckets. If doubling would exceed MAX_BUCKETS,
      print an error message and call utils::exit_with() instead.
    */
    void enlarge()
    {
        unsigned int num_buckets = buckets.size();
        // Verify that the number of buckets is a power of 2.
        assert((num_buckets & (num_buckets - 1)) == 0);
        if (num_buckets > MAX_BUCKETS / 2) {
            std::cerr << "IntHashSet surpassed maximum capacity. This means"
                         " you either use IntHashSet for high-memory"
                         " applications for which it was not designed, or there"
                         " is an unexpectedly high number of hash collisions"
                         " that should be investigated. Aborting."
                      << std::endl;
            utils::exit_with(utils::ExitCode::SEARCH_CRITICAL_ERROR);
        }
        // The check above guarantees that the doubled value fits into an int.
        rehash(static_cast<int>(num_buckets * 2));
    }

    /*
      Map a hash value (or an index that may have run past the last bucket)
      to a valid bucket index.
    */
    int get_bucket(HashType hash) const
    {
        assert(!buckets.empty());
        unsigned int num_buckets = buckets.size();
        // Verify that the number of buckets is a power of 2.
        assert((num_buckets & (num_buckets - 1)) == 0);
        /* We want to return hash % num_buckets. The following line does this
           because we know that num_buckets is a power of 2. */
        return static_cast<int>(hash & (num_buckets - 1));
    }

    /*
      Return distance from index1 to index2, only moving right and wrapping
      from the last to the first bucket.
    */
    int get_distance(int index1, int index2) const
    {
        assert(utils::in_bounds(index1, buckets));
        assert(utils::in_bounds(index2, buckets));
        if (index2 >= index1) {
            return index2 - index1;
        } else {
            return capacity() + index2 - index1;
        }
    }

    /*
      Return the index of the first empty bucket found when starting at
      index and moving right (wrapping around). Requires at least one empty
      bucket, otherwise the loop would not terminate.
    */
    int find_next_free_bucket_index(int index) const
    {
        assert(num_entries < capacity());
        assert(utils::in_bounds(index, buckets));
        while (buckets[index].full()) {
            index = get_bucket(index + 1);
        }
        return index;
    }

    /*
      Look for a stored key that has the given hash and compares equal to
      key. Only the MAX_DISTANCE buckets starting at the ideal bucket are
      inspected. Return the stored key, or Bucket::empty_bucket_key if no
      such key is found.
    */
    KeyType find_equal_key(KeyType key, HashType hash) const
    {
        assert(hasher(key) == hash);
        int ideal_index = get_bucket(hash);
        for (int i = 0; i < MAX_DISTANCE; ++i) {
            int index = get_bucket(ideal_index + i);
            const Bucket& bucket = buckets[index];
            // Compare the cached hashes first so that equal() is only
            // called for buckets whose hash matches.
            if (bucket.full() && bucket.hash == hash &&
                equal(bucket.key, key)) {
                return bucket.key;
            }
        }
        return Bucket::empty_bucket_key;
    }

    /*
      Private method that inserts a key and its corresponding hash into the
      hash set.

      The method ensures that each key ends up less than MAX_DISTANCE buckets
      to the right of its ideal bucket by moving the closest free bucket
      towards the ideal bucket. If this can't be achieved, we resize the
      vector, reinsert the old keys and try inserting the new key again.

      For the return type, see the public insert() method.

      Note that the private insert() may call enlarge() and therefore rehash(),
      which itself calls the private insert() again.
    */
    std::pair<KeyType, bool> insert(KeyType key, HashType hash)
    {
        assert(hasher(key) == hash);

        /* If the hash set already contains the key, return the key and a
           Boolean indicating that no new key has been inserted. */
        KeyType equal_key = find_equal_key(key, hash);
        if (equal_key != Bucket::empty_bucket_key) {
            return std::make_pair(equal_key, false);
        }

        assert(num_entries <= capacity());
        if (num_entries == capacity()) {
            enlarge();
        }
        assert(num_entries < capacity());

        // Compute ideal bucket.
        int ideal_index = get_bucket(hash);

        // Find the first free bucket at or to the right of the ideal bucket
        // (wrapping around at the end of the table).
        int free_index = find_next_free_bucket_index(ideal_index);

        /*
          While the free bucket is too far from the ideal bucket, move the free
          bucket towards the ideal bucket by swapping a suitable third bucket
          with the free bucket. A full and an empty bucket can be swapped if
          the swap doesn't move the full bucket too far from its ideal
          position.
        */
        while (get_distance(ideal_index, free_index) >= MAX_DISTANCE) {
            bool swapped = false;
            int num_buckets = capacity();
            int max_offset = std::min(MAX_DISTANCE, num_buckets) - 1;
            // Try the candidates farthest to the left of the free bucket
            // first, so that a successful swap moves the free bucket as far
            // as possible.
            for (int offset = max_offset; offset >= 1; --offset) {
                assert(offset < num_buckets);
                // Adding num_buckets keeps the value non-negative before it
                // is wrapped by get_bucket().
                int candidate_index = free_index + num_buckets - offset;
                assert(candidate_index >= 0);
                candidate_index = get_bucket(candidate_index);
                HashType candidate_hash = buckets[candidate_index].hash;
                int candidate_ideal_index = get_bucket(candidate_hash);
                if (get_distance(candidate_ideal_index, free_index) <
                    MAX_DISTANCE) {
                    // Candidate can be swapped.
                    std::swap(buckets[candidate_index], buckets[free_index]);
                    free_index = candidate_index;
                    swapped = true;
                    break;
                }
            }
            if (!swapped) {
                /* Free bucket could not be moved close enough to ideal bucket.
                   -> Enlarge and try inserting again. */
                enlarge();
                return insert(key, hash);
            }
        }
        assert(utils::in_bounds(free_index, buckets));
        assert(!buckets[free_index].full());
        buckets[free_index] = Bucket(key, hash);
        ++num_entries;
        return std::make_pair(key, true);
    }

public:
    /*
      Create an empty hash set with a single bucket that uses copies of the
      given hasher and equality tester.
    */
    IntHashSet(const Hasher& hasher, const Equal& equal)
        : hasher(hasher)
        , equal(equal)
        , buckets(1)
        , num_entries(0)
        , num_resizes(0)
    {
    }

    // Number of keys stored in the hash set.
    int size() const { return num_entries; }

    /*
      Insert a key into the hash set.

      Return a pair whose first item is the given key, or an equivalent key
      already contained in the hash set. The second item in the pair is a bool
      indicating whether a new key was inserted into the hash set.

      The key must be non-negative (-1 is reserved for empty buckets).
    */
    std::pair<KeyType, bool> insert(KeyType key)
    {
        assert(key >= 0);
        return insert(key, hasher(key));
    }

    /*
      Write the table to the log in bucket order as "[a, _, b, ...]", where
      "_" stands for an empty bucket.
    */
    void dump(utils::LogProxy& log) const
    {
        int num_buckets = capacity();
        log << "[";
        for (int i = 0; i < num_buckets; ++i) {
            const Bucket& bucket = buckets[i];
            if (bucket.full()) {
                log << bucket.key;
            } else {
                log << "_";
            }
            if (i < num_buckets - 1) {
                log << ", ";
            }
        }
        log << "]" << std::endl;
    }

    // Write the load factor and the number of resizes to the log.
    void print_statistics(utils::LogProxy& log) const
    {
        assert(!buckets.empty());
        int num_buckets = capacity();
        assert(num_buckets != 0);
        log << "Int hash set load factor: " << num_entries << "/" << num_buckets
            << " = " << static_cast<double>(num_entries) / num_buckets
            << std::endl;
        log << "Int hash set resizes: " << num_resizes << std::endl;
    }
};


/*
  Out-of-class definitions of the static constants declared (and
  initialized) inside IntHashSet. They give the constants storage for uses
  that bind them to a reference, such as passing MAX_DISTANCE to std::min.
*/
template <typename Hasher, typename Equal>
const unsigned int IntHashSet<Hasher, Equal>::MAX_BUCKETS;

template <typename Hasher, typename Equal>
const int IntHashSet<Hasher, Equal>::MAX_DISTANCE;

} // namespace int_hash_set


#endif