diff --git a/_codeql_detected_source_root b/_codeql_detected_source_root
new file mode 120000
index 00000000..945c9b46
--- /dev/null
+++ b/_codeql_detected_source_root
@@ -0,0 +1 @@
+.
\ No newline at end of file
diff --git a/include/boost/openmethod/policies/minimal_perfect_hash.hpp b/include/boost/openmethod/policies/minimal_perfect_hash.hpp
new file mode 100644
--- /dev/null
+++ b/include/boost/openmethod/policies/minimal_perfect_hash.hpp
@@ -0,0 +1,455 @@
+// Copyright (c) 2018-2025 Jean-Louis Leroy
+// Distributed under the Boost Software License, Version 1.0.
+// See accompanying file LICENSE_1_0.txt
+// or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef BOOST_OPENMETHOD_POLICY_MINIMAL_PERFECT_HASH_HPP
+#define BOOST_OPENMETHOD_POLICY_MINIMAL_PERFECT_HASH_HPP
+
+#include <boost/openmethod/preamble.hpp>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <limits>
+#include <numeric>
+#include <random>
+#include <tuple>
+#include <utility>
+#include <variant>
+#include <vector>
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4702) // unreachable code
+#endif
+
+namespace boost::openmethod {
+
+namespace detail {
+
+// Integer helpers private to this header. They live in a dedicated namespace
+// (rather than an anonymous one) so that including the header does not inject
+// names into the global namespace of every translation unit.
+namespace minimal_perfect_hash_impl {
+
+#if defined(UINTPTR_MAX)
+using uintptr = std::uintptr_t;
+constexpr uintptr uintptr_max = UINTPTR_MAX;
+#else
+static_assert(
+    sizeof(std::size_t) == sizeof(void*),
+    "This implementation requires that size_t and void* have the same size.");
+using uintptr = std::size_t;
+constexpr uintptr uintptr_max = (std::numeric_limits<std::size_t>::max)();
+#endif
+
+} // namespace minimal_perfect_hash_impl
+
+// Slot -> type_id table; filled by `initialize` and read by the runtime checks.
+template<class Registry>
+std::vector<type_id> minimal_perfect_hash_control;
+
+// One displacement per group; indexed by the group hash of a type_id.
+template<class Registry>
+std::vector<std::size_t> minimal_perfect_hash_displacements;
+
+} // namespace detail
+
+namespace policies {
+
+//! Hash type ids using a compact perfect hash function.
+//!
+//! `minimal_perfect_hash` implements the @ref type_hash policy with a
+//! hash-and-displace scheme inspired by the PtHash algorithm. For a type id
+//! `x` and a table of `T` slots:
+//!
+//! - `pilot(x) = (M * x) >> S`
+//! - `group(x) = (GM * x) >> GS`
+//! - `H(x) = (pilot(x) + displacement[group(x)]) % T`
+//!
+//! `initialize` searches for multipliers `M` and `GM`, and for one
+//! displacement per group, such that `H` is collision-free on the set of
+//! registered type ids.
+//!
+//! Unlike @ref fast_perfect_hash, which uses a hash table of size 2^k
+//! (typically larger than needed), this policy uses `T = ceil(1.1 * n)` slots
+//! for `n` type ids, i.e. hash values lie in the dense range `[0, T-1]` and
+//! about 10% of the slots are unused. Each call to `hash` costs two
+//! multiplications, one table lookup and one modulo.
+struct minimal_perfect_hash : type_hash {
+
+    //! Cannot find hash factors
+    struct search_error : openmethod_error {
+        //! Number of attempts to find hash factors
+        std::size_t attempts;
+        //! Number of buckets used in the last attempt
+        std::size_t buckets;
+
+        //! Write a short description to an output stream
+        //! @param os The output stream
+        //! @tparam Registry The registry
+        //! @tparam Stream A @ref LightweightOutputStream
+        template<class Registry, class Stream>
+        auto write(Stream& os) const -> void;
+    };
+
+    using errors = std::variant<search_error>;
+
+    //! A TypeHashFn metafunction.
+    //!
+    //! @tparam Registry The registry containing this policy
+    template<class Registry>
+    class fn {
+        using uintptr = detail::minimal_perfect_hash_impl::uintptr;
+
+        static std::size_t mult;        // M: pilot hash multiplier (odd)
+        static std::size_t shift;       // S: pilot hash shift
+        static std::size_t table_size;  // T: number of slots
+        static std::size_t num_groups;  // number of groups (a power of two)
+        static std::size_t group_mult;  // GM: group hash multiplier (odd)
+        static std::size_t group_shift; // GS: group hash shift
+
+        static void check(std::size_t index, type_id type);
+
+        template<class InitializeContext, class... Options>
+        static void initialize(
+            const InitializeContext& ctx, std::vector<type_id>& buckets,
+            const std::tuple<Options...>& options);
+
+      public:
+        //! Find the hash factors
+        //!
+        //! Searches for the pilot and group multipliers, and for one
+        //! displacement per group, that make `hash` collision-free on the
+        //! type ids of the classes in `ctx`.
+        //!
+        //! If no suitable values are found, calls the error handler with
+        //! a @ref search_error object then calls `abort`.
+        //!
+        //! @tparam Context An @ref InitializeContext.
+        //! @tparam Options... Zero or more option types, deduced from the
+        //! function arguments.
+        //! @param ctx A Context object.
+        //! @param options Zero or more option objects.
+        //! @return A pair containing the minimum (0) and maximum
+        //! (table size - 1) hash values.
+        template<class Context, class... Options>
+        static auto
+        initialize(const Context& ctx, const std::tuple<Options...>& options) {
+            if constexpr (Registry::has_runtime_checks) {
+                initialize(
+                    ctx, detail::minimal_perfect_hash_control<Registry>,
+                    options);
+            } else {
+                std::vector<type_id> buckets;
+                initialize(ctx, buckets, options);
+            }
+
+            // An empty registry has no slots; do not let `0 - 1` wrap around.
+            const std::size_t max_hash = table_size == 0 ? 0 : table_size - 1;
+
+            return std::pair{std::size_t(0), max_hash};
+        }
+
+        //! Hash a type id
+        //!
+        //! Computes `(pilot(x) + displacement[group(x)]) % T`, where
+        //! `pilot(x) = (M * x) >> S` and `group(x) = (GM * x) >> GS`.
+        //! `initialize` must have been called, with at least one class.
+        //!
+        //! If `Registry` contains the @ref runtime_checks policy, checks that
+        //! the type id is valid, i.e. if it was present in the set passed to
+        //! @ref initialize. Its absence indicates that a class involved in a
+        //! method definition, method overrider, or method call was not
+        //! registered. In this case, signal a @ref missing_class using
+        //! the registry's @ref error_handler if present; then calls `abort`.
+        //!
+        //! @param type The type_id to hash
+        //! @return The hash value
+        BOOST_FORCEINLINE
+        static auto hash(type_id type) -> std::size_t {
+            const auto key = reinterpret_cast<uintptr>(type);
+            const auto pilot = (mult * key) >> shift;
+            // `num_groups` is a power of two and `group_shift` keeps exactly
+            // log2(num_groups) bits, so `group` is always a valid index.
+            const auto group = (group_mult * key) >> group_shift;
+            const std::size_t index =
+                (pilot +
+                 detail::minimal_perfect_hash_displacements<Registry>[group]) %
+                table_size;
+
+            if constexpr (Registry::has_runtime_checks) {
+                check(index, type);
+            }
+
+            return index;
+        }
+
+        //! Discards the tables built by `initialize`.
+        //!
+        //! @tparam Options... Zero or more option types, deduced from the function
+        //! arguments.
+        //! @param options Zero or more option objects.
+        template<class... Options>
+        static auto finalize(const std::tuple<Options...>&) -> void {
+            detail::minimal_perfect_hash_control<Registry>.clear();
+            detail::minimal_perfect_hash_displacements<Registry>.clear();
+        }
+    };
+};
+
+template<class Registry>
+std::size_t minimal_perfect_hash::fn<Registry>::mult;
+
+template<class Registry>
+std::size_t minimal_perfect_hash::fn<Registry>::shift;
+
+template<class Registry>
+std::size_t minimal_perfect_hash::fn<Registry>::table_size;
+
+template<class Registry>
+std::size_t minimal_perfect_hash::fn<Registry>::num_groups;
+
+template<class Registry>
+std::size_t minimal_perfect_hash::fn<Registry>::group_mult;
+
+template<class Registry>
+std::size_t minimal_perfect_hash::fn<Registry>::group_shift;
+
+// Builds the hash function for the type ids of the classes in `ctx`.
+//
+// `buckets` receives the slot -> type_id table; unused slots hold
+// `type_id(uintptr_max)`. On failure, reports a `search_error` to the
+// registry's error handler (if any) and aborts.
+template<class Registry>
+template<class InitializeContext, class... Options>
+void minimal_perfect_hash::fn<Registry>::initialize(
+    const InitializeContext& ctx, std::vector<type_id>& buckets,
+    const std::tuple<Options...>& options) {
+    (void)options;
+
+    using detail::minimal_perfect_hash_impl::uintptr_max;
+    auto& displacements = detail::minimal_perfect_hash_displacements<Registry>;
+
+    // Tuning constants.
+    constexpr std::size_t WASTE_FACTOR_NUMERATOR = 11; // 1.1 = 11/10
+    constexpr std::size_t WASTE_FACTOR_DENOMINATOR = 10;
+    constexpr std::size_t ROUNDING_ADJUSTMENT = 9; // for ceiling division
+    constexpr std::size_t DEFAULT_RANDOM_SEED = 13081963;
+    constexpr std::size_t MAX_PASSES = 10;
+    constexpr std::size_t MAX_ATTEMPTS = 100000;
+    constexpr std::size_t DEFAULT_GROUP_DIVISOR = 4; // about T/4 groups
+    constexpr std::size_t DISTRIBUTION_FACTOR = 2;   // pilot range >= 2*T
+    constexpr std::size_t bits_per_type_id = 8 * sizeof(type_id);
+
+    // Collect all type ids. Each class exposes a range of type ids, so the
+    // table must be sized from the number of keys, not the number of classes.
+    std::vector<type_id> keys;
+    for (auto iter = ctx.classes_begin(); iter != ctx.classes_end(); ++iter) {
+        for (auto type_iter = iter->type_id_begin();
+             type_iter != iter->type_id_end(); ++type_iter) {
+            keys.push_back(*type_iter);
+        }
+    }
+
+    if constexpr (InitializeContext::template has_option<trace>) {
+        ctx.tr << "Finding minimal perfect hash using PtHash for "
+               << keys.size() << " types\n";
+    }
+
+    if (keys.empty()) {
+        // Nothing to hash; `hash` must not be called in this state.
+        table_size = 0;
+        mult = 1;
+        shift = 0;
+        num_groups = 0;
+        group_mult = 1;
+        group_shift = 0;
+        displacements.clear();
+        buckets.clear();
+        return;
+    }
+
+    // T = ceil(1.1 * n): allow about 10% of unused slots, which makes the
+    // search much easier than for a strictly minimal table. T >= 2 here.
+    table_size = (keys.size() * WASTE_FACTOR_NUMERATOR + ROUNDING_ADJUSTMENT) /
+        WASTE_FACTOR_DENOMINATOR;
+
+    // The pilot hash keeps the top `pilot_bits` bits of the product, enough
+    // to cover [0, 2*T). `pilot_bits` is at least 2 because T >= 2, so the
+    // shift count is always smaller than the width of the operand.
+    std::size_t pilot_bits = 2;
+    while (pilot_bits + 1 < bits_per_type_id &&
+           (std::size_t(1) << pilot_bits) < table_size * DISTRIBUTION_FACTOR) {
+        ++pilot_bits;
+    }
+    shift = bits_per_type_id - pilot_bits;
+
+    // Use a power-of-two number of groups (at least two), so that the group
+    // hash `(GM * x) >> GS` is a valid index as is: `hash` needs neither a
+    // modulo nor a shift by the full width of the operand (which is undefined).
+    std::size_t group_bits = 1;
+    while (group_bits + 1 < bits_per_type_id &&
+           (std::size_t(1) << group_bits) < table_size / DEFAULT_GROUP_DIVISOR) {
+        ++group_bits;
+    }
+    num_groups = std::size_t(1) << group_bits;
+    group_shift = bits_per_type_id - group_bits;
+
+    if constexpr (InitializeContext::template has_option<trace>) {
+        ctx.tr << "  Using " << num_groups << " groups for " << keys.size()
+               << " keys in " << table_size << " slots\n";
+    }
+
+    std::default_random_engine rnd(DEFAULT_RANDOM_SEED);
+    std::uniform_int_distribution<std::size_t> uniform_dist;
+    std::size_t total_attempts = 0;
+
+    // Scratch buffers, reused across passes.
+    std::vector<std::vector<type_id>> groups(num_groups);
+    std::vector<std::size_t> group_order(num_groups);
+    std::vector<std::size_t> positions;
+    std::vector<char> used;
+
+    for (std::size_t pass = 0;
+         pass < MAX_PASSES && total_attempts < MAX_ATTEMPTS; ++pass) {
+        // Odd multipliers are invertible modulo 2^w. Both multipliers use the
+        // full word: the hashes keep the *top* bits of the product, and a
+        // multiplier narrower than the word cannot move the low bits of the
+        // type ids (where registered types typically differ) up there.
+        mult = uniform_dist(rnd) | 1;
+        group_mult = uniform_dist(rnd) | 1;
+
+        // Partition the keys into groups.
+        for (auto& group : groups) {
+            group.clear();
+        }
+        for (auto key : keys) {
+            const auto g =
+                (group_mult * reinterpret_cast<uintptr>(key)) >> group_shift;
+            groups[g].push_back(key);
+        }
+
+        displacements.assign(num_groups, 0);
+        buckets.assign(table_size, type_id(uintptr_max));
+        used.assign(table_size, 0);
+
+        // Place the largest groups first, while the table is still sparse.
+        std::iota(group_order.begin(), group_order.end(), std::size_t(0));
+        std::stable_sort(
+            group_order.begin(), group_order.end(),
+            [&groups](std::size_t a, std::size_t b) {
+                return groups[a].size() > groups[b].size();
+            });
+
+        bool success = true;
+
+        for (auto g : group_order) {
+            const auto& members = groups[g];
+
+            if (members.empty()) {
+                break; // sorted by decreasing size: only empty groups remain
+            }
+
+            bool found = false;
+
+            // `(pilot + disp) % T` is periodic in `disp`, so displacements
+            // beyond T - 1 would only repeat earlier attempts.
+            for (std::size_t disp = 0; disp < table_size && !found; ++disp) {
+                if (++total_attempts > MAX_ATTEMPTS) {
+                    break;
+                }
+
+                positions.clear();
+                bool valid = true;
+
+                for (auto key : members) {
+                    const auto pilot =
+                        (mult * reinterpret_cast<uintptr>(key)) >> shift;
+                    const std::size_t pos = (pilot + disp) % table_size;
+
+                    if (used[pos]) {
+                        valid = false;
+                        break;
+                    }
+
+                    // Claim the slot immediately, so that two keys of the
+                    // same group landing on the same slot are detected too.
+                    used[pos] = 1;
+                    positions.push_back(pos);
+                }
+
+                if (!valid) {
+                    // Roll back the slots claimed during this attempt.
+                    for (auto pos : positions) {
+                        used[pos] = 0;
+                    }
+                    continue;
+                }
+
+                displacements[g] = disp;
+                for (std::size_t i = 0; i < members.size(); ++i) {
+                    buckets[positions[i]] = members[i];
+                }
+                found = true;
+            }
+
+            if (!found) {
+                success = false;
+                break;
+            }
+        }
+
+        if (success) {
+            if constexpr (InitializeContext::template has_option<trace>) {
+                ctx.tr << "  Found minimal perfect hash after "
+                       << total_attempts << " attempts; " << keys.size()
+                       << "/" << table_size << " slots used\n";
+            }
+            return;
+        }
+    }
+
+    // Failed to find a perfect hash within the attempt budget.
+    search_error error;
+    error.attempts = total_attempts;
+    error.buckets = table_size;
+
+    if constexpr (Registry::has_error_handler) {
+        Registry::error_handler::error(error);
+    }
+
+    std::abort();
+}
+
+template<class Registry>
+void minimal_perfect_hash::fn<Registry>::check(std::size_t index, type_id type) {
+    if (index >= table_size ||
+        detail::minimal_perfect_hash_control<Registry>[index] != type) {
+
+        if constexpr (Registry::has_error_handler) {
+            missing_class error;
+            error.type = type;
+            Registry::error_handler::error(error);
+        }
+
+        std::abort();
+    }
+}
+
+template<class Registry, class Stream>
+auto minimal_perfect_hash::search_error::write(Stream& os) const -> void {
+    os << "could not find minimal perfect hash factors after " << attempts
+       << " attempts using " << buckets << " buckets\n";
+}
+
+} // namespace policies
+} // namespace boost::openmethod
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#endif
diff --git a/test/test_minimal_perfect_hash.cpp b/test/test_minimal_perfect_hash.cpp
new file mode 100644
--- /dev/null
+++ b/test/test_minimal_perfect_hash.cpp
@@ -0,0 +1,252 @@
+// Copyright (c) 2018-2025 Jean-Louis Leroy
+// Distributed under the Boost Software License, Version 1.0.
+// See accompanying file LICENSE_1_0.txt
+// or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <memory>
+#include <string>
+
+#define BOOST_TEST_MODULE minimal_perfect_hash
+#include <boost/test/unit_test.hpp>
+
+#include <boost/openmethod.hpp>
+#include <boost/openmethod/initialize.hpp>
+#include <boost/openmethod/policies/default_error_handler.hpp>
+#include <boost/openmethod/policies/minimal_perfect_hash.hpp>
+#include <boost/openmethod/policies/stderr_output.hpp>
+#include <boost/openmethod/policies/std_rtti.hpp>
+#include <boost/openmethod/policies/vptr_vector.hpp>
+
+#include "test_util.hpp"
+
+using namespace boost::openmethod;
+using namespace boost::openmethod::policies;
+
+// Test registry with minimal_perfect_hash
+struct minimal_hash_registry
+    : registry<
+          std_rtti, vptr_vector, minimal_perfect_hash,
+          default_error_handler, stderr_output> {
+};
+
+// Test registry with runtime checks
+struct minimal_hash_registry_with_checks
+    : registry<
+          std_rtti, vptr_vector, minimal_perfect_hash,
+          default_error_handler, stderr_output, runtime_checks> {
+};
+
+namespace test_basic {
+
+struct Animal {
+    virtual ~Animal() {}
+};
+
+struct Dog : Animal {};
+struct Cat : Animal {};
+struct Bird : Animal {};
+
+BOOST_OPENMETHOD_CLASSES(Animal, Dog, Cat, Bird, minimal_hash_registry);
+
+BOOST_OPENMETHOD(get_sound, (virtual_<const Animal&>), std::string, minimal_hash_registry);
+
+BOOST_OPENMETHOD_OVERRIDE(get_sound, (const Dog&), std::string) {
+    return "woof";
+}
+
+BOOST_OPENMETHOD_OVERRIDE(get_sound, (const Cat&), std::string) {
+    return "meow";
+}
+
+BOOST_OPENMETHOD_OVERRIDE(get_sound, (const Bird&), std::string) {
+    return "chirp";
+}
+
+BOOST_AUTO_TEST_CASE(basic_functionality) {
+    initialize<minimal_hash_registry>();
+
+    Dog dog;
+    Cat cat;
+    Bird bird;
+
+    BOOST_TEST(get_sound(dog) == "woof");
+    BOOST_TEST(get_sound(cat) == "meow");
+    BOOST_TEST(get_sound(bird) == "chirp");
+}
+
+} // namespace test_basic
+
+namespace test_hash_properties {
+
+struct Base {
+    virtual ~Base() {}
+};
+
+struct D1 : Base {};
+struct D2 : Base {};
+struct D3 : Base {};
+struct D4 : Base {};
+struct D5 : Base {};
+
+BOOST_OPENMETHOD_CLASSES(Base, D1, D2, D3, D4, D5, minimal_hash_registry);
+
+BOOST_OPENMETHOD(get_id, (virtual_<const Base&>), int, minimal_hash_registry);
+
+BOOST_OPENMETHOD_OVERRIDE(get_id, (const D1&), int) {
+    return 1;
+}
+
+BOOST_OPENMETHOD_OVERRIDE(get_id, (const D2&), int) {
+    return 2;
+}
+
+BOOST_OPENMETHOD_OVERRIDE(get_id, (const D3&), int) {
+    return 3;
+}
+
+BOOST_OPENMETHOD_OVERRIDE(get_id, (const D4&), int) {
+    return 4;
+}
+
+BOOST_OPENMETHOD_OVERRIDE(get_id, (const D5&), int) {
+    return 5;
+}
+
+BOOST_AUTO_TEST_CASE(minimal_hash_properties) {
+    initialize<minimal_hash_registry>();
+
+    // Test that all classes are correctly hashed
+    D1 d1;
+    D2 d2;
+    D3 d3;
+    D4 d4;
+    D5 d5;
+
+    BOOST_TEST(get_id(d1) == 1);
+    BOOST_TEST(get_id(d2) == 2);
+    BOOST_TEST(get_id(d3) == 3);
+    BOOST_TEST(get_id(d4) == 4);
+    BOOST_TEST(get_id(d5) == 5);
+
+    // Verify that the hash function produces a minimal perfect hash
+    // (This is implicit - if it didn't, initialization would fail or we'd get wrong results)
+}
+
+} // namespace test_hash_properties
+
+namespace test_with_runtime_checks {
+
+struct Vehicle {
+    virtual ~Vehicle() {}
+};
+
+struct Car : Vehicle {};
+struct Bike : Vehicle {};
+
+BOOST_OPENMETHOD_CLASSES(Vehicle, Car, Bike, minimal_hash_registry_with_checks);
+
+BOOST_OPENMETHOD(get_wheels, (virtual_<const Vehicle&>), int, minimal_hash_registry_with_checks);
+
+BOOST_OPENMETHOD_OVERRIDE(get_wheels, (const Car&), int) {
+    return 4;
+}
+
+BOOST_OPENMETHOD_OVERRIDE(get_wheels, (const Bike&), int) {
+    return 2;
+}
+
+BOOST_AUTO_TEST_CASE(runtime_checks) {
+    initialize<minimal_hash_registry_with_checks>();
+
+    Car car;
+    Bike bike;
+
+    BOOST_TEST(get_wheels(car) == 4);
+    BOOST_TEST(get_wheels(bike) == 2);
+}
+
+} // namespace test_with_runtime_checks
+
+namespace test_empty {
+
+struct Empty {
+    virtual ~Empty() {}
+};
+
+BOOST_OPENMETHOD_CLASSES(Empty, minimal_hash_registry);
+
+BOOST_OPENMETHOD(process, (virtual_<const Empty&>), int, minimal_hash_registry);
+
+BOOST_OPENMETHOD_OVERRIDE(process, (const Empty&), int) {
+    return 42;
+}
+
+BOOST_AUTO_TEST_CASE(single_class) {
+    initialize<minimal_hash_registry>();
+
+    Empty e;
+    BOOST_TEST(process(e) == 42);
+}
+
+} // namespace test_empty
+
+namespace test_large_hierarchy {
+
+struct Root {
+    virtual ~Root() {}
+};
+
+struct L1_1 : Root {};
+struct L1_2 : Root {};
+struct L1_3 : Root {};
+struct L1_4 : Root {};
+struct L1_5 : Root {};
+struct L1_6 : Root {};
+struct L1_7 : Root {};
+struct L1_8 : Root {};
+struct L1_9 : Root {};
+struct L1_10 : Root {};
+
+BOOST_OPENMETHOD_CLASSES(Root, L1_1, L1_2, L1_3, L1_4, L1_5, L1_6, L1_7, L1_8, L1_9, L1_10, minimal_hash_registry);
+
+BOOST_OPENMETHOD(classify, (virtual_<const Root&>), int, minimal_hash_registry);
+
+BOOST_OPENMETHOD_OVERRIDE(classify, (const L1_1&), int) { return 1; }
+BOOST_OPENMETHOD_OVERRIDE(classify, (const L1_2&), int) { return 2; }
+BOOST_OPENMETHOD_OVERRIDE(classify, (const L1_3&), int) { return 3; }
+BOOST_OPENMETHOD_OVERRIDE(classify, (const L1_4&), int) { return 4; }
+BOOST_OPENMETHOD_OVERRIDE(classify, (const L1_5&), int) { return 5; }
+BOOST_OPENMETHOD_OVERRIDE(classify, (const L1_6&), int) { return 6; }
+BOOST_OPENMETHOD_OVERRIDE(classify, (const L1_7&), int) { return 7; }
+BOOST_OPENMETHOD_OVERRIDE(classify, (const L1_8&), int) { return 8; }
+BOOST_OPENMETHOD_OVERRIDE(classify, (const L1_9&), int) { return 9; }
+BOOST_OPENMETHOD_OVERRIDE(classify, (const L1_10&), int) { return 10; }
+
+BOOST_AUTO_TEST_CASE(larger_hierarchy) {
+    initialize<minimal_hash_registry>();
+
+    L1_1 o1;
+    L1_2 o2;
+    L1_3 o3;
+    L1_4 o4;
+    L1_5 o5;
+    L1_6 o6;
+    L1_7 o7;
+    L1_8 o8;
+    L1_9 o9;
+    L1_10 o10;
+
+    BOOST_TEST(classify(o1) == 1);
+    BOOST_TEST(classify(o2) == 2);
+    BOOST_TEST(classify(o3) == 3);
+    BOOST_TEST(classify(o4) == 4);
+    BOOST_TEST(classify(o5) == 5);
+    BOOST_TEST(classify(o6) == 6);
+    BOOST_TEST(classify(o7) == 7);
+    BOOST_TEST(classify(o8) == 8);
+    BOOST_TEST(classify(o9) == 9);
+    BOOST_TEST(classify(o10) == 10);
+}
+
+} // namespace test_large_hierarchy