https://github.com/lorenzhs/BuRR
Raw File
Tip revision: 1c62832ad7d6eab5b337f386955868c3ce9a54ea authored by Lorenz Hübschle-Schneider on 11 September 2021, 12:56:53 UTC
README: paper link, bibtex
Tip revision: 1c62832
hasher.hpp
//  Copyright (c) Lorenz Hübschle-Schneider
//  Copyright (c) Facebook, Inc. and its affiliates.
//  All Rights Reserved.  This source code is licensed under the Apache 2.0
//  License (found in the LICENSE file in the root directory).

#pragma once

#include "config.hpp"
#include "pcg-cpp/include/pcg_random.hpp"
#include "rocksdb/coding.h"
#include "rocksdb/fastrange.h"

#include <tlx/math/integer_log2.hpp>

#include <utility>

namespace ribbon {

namespace {
// for std::conditional_t<cond, ActualThing, DummyData>
struct DummyData {};
} // namespace

// Based on Peter Dillinger / Facebook's rocksdb::ribbon::StandardHasher
template <typename Config>
class Hasher {
public:
    IMPORT_RIBBON_CONFIG(Config);
    static constexpr bool log = Config::log; // export for BandingAddRange...
    static_assert(sizeof(Hash) == 8 || sizeof(Hash) == 4,
                  "Unsupported hash size");
    static_assert(sizeof(CoeffRow) <= 16,
                  "More than 64 coefficient bits not supported atm");

    Hasher() = default;
    explicit Hasher(uint64_t seed) {
        Seed(seed);
    }

    void Seed(uint64_t seed) {
        seed_ = seed;
        if constexpr (kUseMultiplyShiftHash) {
            pcg32 random(seed);
            for (auto v : {&multiply_, &add_}) {
                *v = random();
                for (int i = 1; i <= 4; ++i) {
                    *v = *v << 32;
                    *v |= random();
                }
            }
        }
    }

    inline Hash GetHash(const Key& key) const {
        if constexpr (kUseMultiplyShiftHash && std::is_arithmetic_v<Key>) {
            return (add_ + multiply_ * static_cast<decltype(multiply_)>(key)) >> 64;
        } else {
            return Config::HashFn(key, seed_);
        }
    }

    inline Hash GetHash(const std::pair<Key, ResultRow>& in) const {
        return GetHash(in.first);
    }

    // this has to be fast
    inline Index GetStart(Hash h, Index num_starts) const {
        return rocksdb::FastRangeGeneric(h, num_starts);
    }

    inline CoeffRow GetCoeffs(Hash h) const {
        Hash a = h * kCoeffAndResultFactor + kCARFadd;
        CoeffRow cr;

        // expand hash to 64 bits if needed
        if constexpr (sizeof(CoeffRow) > sizeof(Hash)) {
            uint64_t b = a;
            // Almost-trivial hash expansion, favoring roughly
            // equal number of 1's and 0's in result
            b = (b << 32) ^ b ^ kCoeffXor32;

            if constexpr (sizeof(CoeffRow) == 16) {
                __uint128_t c = b;
                c = (c << 64) ^ c ^ kCoeffXor64;
                cr = static_cast<CoeffRow>(c);
            } else {
                static_assert(sizeof(Hash) < 8);
                cr = static_cast<CoeffRow>(b);
            }
        } else {
            cr = static_cast<CoeffRow>(a);
        }

        // Now ensure the value is non-zero
        if constexpr (kFirstCoeffAlwaysOne) {
            cr |= 1;
        } else if constexpr (sizeof(CoeffRow) <= sizeof(Hash)) {
            // Still have to ensure some bit is non-zero
            cr |= (cr == 0) ? 1 : 0;
        } else {
            // We did trivial expansion with constant xor, which ensures
            // some bits are non-zero.
            assert(cr != 0);
        }
        return cr;
    }

    inline ResultRow GetResultRowFromHash(Hash h) const {
        // it makes no sense to call this for a retrieval configuration
        assert(Config::kIsFilter);
        Hash a = h * kCoeffAndResultFactor + kCARFadd;
        // The bits here that are *most* independent of Start are the highest
        // order bits (as in Knuth multiplicative hash). To make those the
        // most preferred for use in the result row, we do a bswap here.
        auto rr = static_cast<ResultRow>(rocksdb::EndianSwapValue(a));
        return rr & kResultRowMask;
    }

    // makes no sense for filters
    inline ResultRow GetResultRowFromInput(const Key&) const {
        assert(false);
        return 0;
    }

    inline ResultRow GetResultRowFromInput(const std::pair<Key, ResultRow>& in) const {
        // simple extraction
        return in.second;
    }

protected:
    uint64_t seed_;
    std::conditional_t<kUseMultiplyShiftHash, __uint128_t, DummyData> multiply_,
        add_;
    // For expanding hash: large random prime, congruent 1 modulo 4
    static constexpr Hash kCoeffAndResultFactor =
        static_cast<Hash>(0xc28f82822b650bedULL);
    // additional odd constant to add to the product
    static constexpr Hash kCARFadd = static_cast<Hash>(0xe6165a994ca52647);
    // random-ish data
    static constexpr uint32_t kCoeffXor32 = 0xa6293635U;
    static constexpr uint64_t kCoeffXor64 = 0xc367844a6e52731dU;

    static constexpr ResultRow kResultRowMask = (1ul << kResultBits) - 1;
};


namespace {
// some large random primes for Knuth multiplicative hashing - more than should
// ever be needed (if you need 10 recursive ribbons, you're doing it wrong)
static constexpr std::array<uint64_t, 9> kHashFactors = {
    0x805A57654F4304C3, 0x8E54A33A76D524A9, 0x6F2CC7AB2F1EC959,
    0x9E34124D0877F827, 0x9BE5DFCD6933E6E1, 0xE000C949B34E31FD,
    0xB96A37021B8A394D, 0xFB9B6D45A37C6055, 0xEC63A51B4B89C7BF};
} // namespace

template <typename Config>
class MHCHasher /* not derived */ {
public:
    IMPORT_RIBBON_CONFIG(Config);
    static constexpr bool log = Config::log; // export for BandingAddRange...
    using mhc_t = uint64_t;
    using hash_t = uint64_t;
    // Hash should not be used but make sure is set to something sane
    static_assert(std::is_same_v<Hash, hash_t>, "Hash must be uint64_t");

    MHCHasher() = default;
    explicit MHCHasher(uint64_t seed, unsigned idx) {
        Seed(seed, idx);
    }

    void Seed(uint64_t seed, unsigned idx) {
        seed_ = seed;
        assert(idx < kHashFactors.size());
        fct_ = kHashFactors[idx];

        if constexpr (kUseMultiplyShiftHash) {
            pcg32 random(seed);
            for (auto v : {&multiply_, &add_}) {
                *v = random();
                for (int i = 1; i <= 4; ++i) {
                    *v <<= 32;
                    *v |= random();
                }
            }
        }
    }

    inline hash_t GetHash(mhc_t mhc) const {
        // return middle bits of the product (Knuth Multiplicative Hashing)
        // XXX why middle? why not most significant?
        return static_cast<hash_t>(
            (static_cast<__uint128_t>(mhc) * static_cast<__uint128_t>(fct_)) >> 32);
    }

    inline hash_t GetHash(const std::pair<mhc_t, ResultRow>& p) const {
        assert(!kIsFilter);
        return GetHash(p.first);
    }

    // Get MHC
    inline mhc_t GetMHC(const Key& key) const {
        if constexpr (kUseMultiplyShiftHash && std::is_arithmetic_v<Key>) {
            return (add_ + multiply_ * static_cast<decltype(multiply_)>(key)) >> 64;
        } else {
            return Config::HashFn(key, seed_);
        }
    }

    inline Index GetStart(hash_t h, Index num_starts) const {
        return rocksdb::FastRange64(h, num_starts);
    }

    inline CoeffRow GetCoeffs(hash_t h) const {
        // intentional cast to hash_t
        hash_t a = h * kCoeffAndResultFactor + kCARFadd;
        CoeffRow cr;

        // Expand hash to 64 bits. Almost-trivial hash expansion, favoring
        // roughly equal number of 1's and 0's in result
        uint64_t b = a;
        // b = (b << 32) ^ b ^ kCoeffXor32;

        if constexpr (sizeof(CoeffRow) == 16) {
            __uint128_t c = b;
            c = (c << 64) ^ c ^ kCoeffXor64;
            cr = static_cast<CoeffRow>(c);
        } else {
            cr = static_cast<CoeffRow>(b);
        }

        // Now ensure the value is non-zero
        if constexpr (kFirstCoeffAlwaysOne) {
            cr |= 1;
        } else if constexpr (sizeof(CoeffRow) <= sizeof(hash_t)) {
            // Still have to ensure some bit is non-zero
            cr |= (cr == 0) ? 1 : 0;
        } else {
            // We did trivial expansion with constant xor, which ensures
            // some bits are non-zero.
            assert(cr != 0);
        }
        return cr;
    }

    inline ResultRow GetResultRowFromHash(hash_t h) const {
        // it makes no sense to call this for a retrieval configuration
        assert(Config::kIsFilter);
        uint64_t a = static_cast<mhc_t>(h) * kCoeffAndResultFactor + kCARFadd;
        // The bits here that are *most* independent of Start are the highest
        // order bits (as in Knuth multiplicative hash). To make those the
        // most preferred for use in the result row, we do a bswap here.
        auto rr = static_cast<ResultRow>(rocksdb::EndianSwapValue(a));
        return rr & kResultRowMask;
    }

    // makes no sense for filters
    inline ResultRow GetResultRowFromInput(const Key&) const {
        assert(false);
        return 0;
    }

    inline ResultRow GetResultRowFromInput(const std::pair<Key, ResultRow>& in) const {
        // simple extraction
        return in.second;
    }

    uint64_t GetFactor() const {
        return fct_;
    }

protected:
    uint64_t seed_;
    uint64_t fct_;
    std::conditional_t<kUseMultiplyShiftHash, __uint128_t, DummyData> multiply_,
        add_;
    // For expanding hash: large random prime, congruent 1 modulo 4
    static constexpr Hash kCoeffAndResultFactor =
        static_cast<Hash>(0xc28f82822b650bedULL);
    // additional odd constant to add to the product
    static constexpr Hash kCARFadd = static_cast<Hash>(0xe6165a994ca52647);
    // random-ish data
    static constexpr uint32_t kCoeffXor32 = 0xa6293635U;
    static constexpr uint64_t kCoeffXor64 = 0xc367844a6e52731dU;

    static constexpr ResultRow kResultRowMask = (1ul << kResultBits) - 1;
};

namespace {
template <bool mhc, typename Config>
struct BaseHasherChooser;

template <typename Config>
struct BaseHasherChooser<true, Config> {
    using type = MHCHasher<Config>;
};

template <typename Config>
struct BaseHasherChooser<false, Config> {
    using type = Hasher<Config>;
};

template <typename Config>
using ChooseBaseHasher =
    typename BaseHasherChooser<Config::kUseMHC, Config>::type;
} // namespace

template <typename Config>
class SparseHasher : public ChooseBaseHasher<Config> {
public:
    IMPORT_RIBBON_CONFIG(Config);
    using Super = ChooseBaseHasher<Config>;
    static constexpr bool debug = false;

    // when changing set_bits_, also update thresh_meta_bits in config.hpp!
    static constexpr unsigned set_bits_ = (kCoeffBits == 128)
                                              ? 16
                                              : (kCoeffBits == 16 ? 4 : 8),
                              step_ = kCoeffBits / set_bits_, mask_ = step_ - 1,
                              shift_ = tlx::integer_log2_ceil(step_);

    SparseHasher() : Super() {
        sLOGC(Config::log) << "Sparse Hasher with" << set_bits_ << "set bits, step"
                           << step_ << "mask" << mask_ << "shift" << shift_;
    }
    explicit SparseHasher(uint64_t seed) : Super(seed) {
        sLOGC(Config::log) << "Sparse Hasher with" << set_bits_ << "set bits, step"
                           << step_ << "mask" << mask_ << "shift" << shift_;
    }

    // this has to be fast
    inline Index GetStart(Hash h, Index num_starts) const {
        // byte-aligned
        return rocksdb::FastRangeGeneric(h, num_starts >> shift_) << shift_;
    }

    uint32_t GetCompactHash(Hash h) const {
        Hash a = h * Super::kCoeffAndResultFactor;

        // coeff positions
        const unsigned cbits = set_bits_ * shift_;
        uint32_t result = a & ((1u << cbits) - 1);
        // result
        ResultRow r = GetResultRowFromHash(h);
        assert(r < (1u << (32 - cbits)));
        result |= (r << cbits);

        // check
        assert(GetResultRowFromHash(result) == r);
        assert(GetCoeffs(result) == GetCoeffs(h));

        return result;
    }

    using Super::GetResultRowFromHash;
    inline ResultRow GetResultRowFromHash(uint32_t compact_hash) const {
        LOG0 << "Getting result from compact" << compact_hash;
        return compact_hash >> (set_bits_ * shift_);
    }

    inline CoeffRow GetCoeffs(uint32_t compact_hash) const {
        LOG0 << "Unpacking compact" << compact_hash;
        CoeffRow cr = 0;
        for (unsigned i = 0; i < set_bits_; i++) {
            unsigned pos = (compact_hash & mask_) + step_ * i;
            cr |= static_cast<CoeffRow>(1) << pos;
            compact_hash >>= shift_;
        }
        // compact_hash now contains the ResultRow

        // Now ensure the value is non-zero
        if constexpr (kFirstCoeffAlwaysOne) {
            cr |= 1;
        } else {
            // Still have to ensure some bit is non-zero
            cr |= (cr == 0) ? 1 : 0;
        }
        return cr;
    }

    inline CoeffRow GetCoeffs(Hash h) const {
        Hash a = h * Super::kCoeffAndResultFactor;
        CoeffRow cr = 0;

        for (unsigned i = 0; i < set_bits_; i++) {
            unsigned pos = (a & mask_) + step_ * i;
            cr |= static_cast<CoeffRow>(1) << pos;
            a >>= shift_;
        }
        // LOG << "GetCoeffs(0x" << std::hex << h << ") = 0x" << (uint64_t)cr
        //     << std::dec;

        // Now ensure the value is non-zero
        if constexpr (kFirstCoeffAlwaysOne) {
            cr |= 1;
        } else {
            // Still have to ensure some bit is non-zero
            cr |= (cr == 0) ? 1 : 0;
        }
        return cr;
    }
};

namespace {
template <bool, typename>
struct HasherChooser;

template <typename Config>
struct HasherChooser<true, Config> {
    using type = SparseHasher<Config>;
};

template <typename Config>
struct HasherChooser<false, Config> {
    using type = ChooseBaseHasher<Config>;
};

} // namespace

template <typename Config>
using ChooseHasher = typename HasherChooser<Config::kSparseCoeffs, Config>::type;


// can't do std::conditional_t<kUseMHC, typename Hasher::mhc_t, xyz> because
// mhc_t isn't defined unless kUseMHC is set so need to work around that
template <typename Hasher, typename = void>
struct HashTraits {
    using mhc_or_key_t = typename Hasher::Key;
    using hash_t = typename Hasher::Hash;
};

template <typename Hasher>
struct HashTraits<Hasher, std::void_t<typename Hasher::mhc_t>> {
    using mhc_or_key_t = typename Hasher::mhc_t;
    using hash_t = typename Hasher::hash_t;
};

} // namespace ribbon
back to top