From b42015791672c4ec73e4d19747591f260a8e2772 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 04:37:38 +0000 Subject: [PATCH 01/15] Initial plan From 530de74f2787939ff270255b3f75daa87333f1fb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 04:43:21 +0000 Subject: [PATCH 02/15] Add minimal_perfect_hash policy and test suite Co-authored-by: jll63 <5083077+jll63@users.noreply.github.com> --- .../policies/minimal_perfect_hash.hpp | 290 ++++++++++++++++++ test/test_minimal_perfect_hash.cpp | 252 +++++++++++++++ 2 files changed, 542 insertions(+) create mode 100644 include/boost/openmethod/policies/minimal_perfect_hash.hpp create mode 100644 test/test_minimal_perfect_hash.cpp diff --git a/include/boost/openmethod/policies/minimal_perfect_hash.hpp b/include/boost/openmethod/policies/minimal_perfect_hash.hpp new file mode 100644 index 00000000..67f93d84 --- /dev/null +++ b/include/boost/openmethod/policies/minimal_perfect_hash.hpp @@ -0,0 +1,290 @@ +// Copyright (c) 2018-2025 Jean-Louis Leroy +// Distributed under the Boost Software License, Version 1.0. +// See accompanying file LICENSE_1_0.txt +// or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef BOOST_OPENMETHOD_POLICY_MINIMAL_PERFECT_HASH_HPP +#define BOOST_OPENMETHOD_POLICY_MINIMAL_PERFECT_HASH_HPP + +#include + +#include +#include +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4702) // unreachable code +#endif + +namespace boost::openmethod { + +namespace detail { + +template +std::vector minimal_perfect_hash_control; + +} // namespace detail + +namespace policies { + +//! Hash type ids using a minimal perfect hash function. +//! +//! `minimal_perfect_hash` implements the @ref type_hash policy using a hash +//! function in the form `H(x)=(M*x)>>N`. It uses the PtHash algorithm to +//! determine values for `M` and `N` that result in a minimal perfect hash +//! function for the set of registered type_ids. This means that the hash +//! function is collision-free and the codomain is exactly the size of the +//! domain, resulting in a dense range [0, n-1] for n inputs. +struct minimal_perfect_hash : type_hash { + + //! Cannot find hash factors + struct search_error : openmethod_error { + //! Number of attempts to find hash factors + std::size_t attempts; + //! Number of buckets used in the last attempt + std::size_t buckets; + + //! Write a short description to an output stream + //! @param os The output stream + //! @tparam Registry The registry + //! @tparam Stream A @ref LightweightOutputStream + template + auto write(Stream& os) const -> void; + }; + + using errors = std::variant; + + //! A TypeHashFn metafunction. + //! + //! @tparam Registry The registry containing this policy + template + class fn { + static std::size_t mult; + static std::size_t shift; + static std::size_t min_value; + static std::size_t max_value; + + static void check(std::size_t index, type_id type); + + template + static void initialize( + const InitializeContext& ctx, std::vector& buckets, + const std::tuple& options); + + public: + //! Find the hash factors + //! + //! Attempts to find suitable values for the multiplication factor `M` + //! and the shift amount `N` that result in a minimal perfect hash + //! function for the specified input values. + //! + //! If no suitable values are found, calls the error handler with + //! a @ref hash_error object then calls `abort`. + //! + //! @tparam Context An @ref InitializeContext. + //! @param ctx A Context object. + //! @return A pair containing the minimum and maximum hash values. + template + static auto + initialize(const Context& ctx, const std::tuple& options) { + if constexpr (Registry::has_runtime_checks) { + initialize( + ctx, detail::minimal_perfect_hash_control, options); + } else { + std::vector buckets; + initialize(ctx, buckets, options); + } + + return std::pair{min_value, max_value}; + } + + //! Hash a type id + //! + //! Hash a type id. + //! + //! If `Registry` contains the @ref runtime_checks policy, checks that + //! the type id is valid, i.e. if it was present in the set passed to + //! @ref initialize. Its absence indicates that a class involved in a + //! method definition, method overrider, or method call was not + //! registered. In this case, signal a @ref missing_class using + //! the registry's @ref error_handler if present; then calls `abort`. + //! + //! @param type The type_id to hash + //! @return The hash value + BOOST_FORCEINLINE + static auto hash(type_id type) -> std::size_t { + auto index = + (mult * reinterpret_cast(type)) >> shift; + + if constexpr (Registry::has_runtime_checks) { + check(index, type); + } + + return index; + } + + //! Releases the memory allocated by `initialize`. + //! + //! @tparam Options... Zero or more option types, deduced from the function + //! arguments. + //! @param options Zero or more option objects. + template + static auto finalize(const std::tuple&) -> void { + detail::minimal_perfect_hash_control.clear(); + } + }; +}; + +template +std::size_t minimal_perfect_hash::fn::mult; + +template +std::size_t minimal_perfect_hash::fn::shift; + +template +std::size_t minimal_perfect_hash::fn::min_value; + +template +std::size_t minimal_perfect_hash::fn::max_value; + +template +template +void minimal_perfect_hash::fn::initialize( + const InitializeContext& ctx, std::vector& buckets, + const std::tuple& options) { + (void)options; + + const auto N = std::distance(ctx.classes_begin(), ctx.classes_end()); + + if constexpr (mp11::mp_contains, trace>::value) { + Registry::output::os << "Finding minimal perfect hash factors for " << N << " types\n"; + } + + // For minimal perfect hash, we need exactly N buckets + std::size_t hash_size = N; + + if (hash_size == 0) { + min_value = 0; + max_value = 0; + shift = 0; + mult = 1; + return; + } + + std::default_random_engine rnd(13081963); + std::size_t total_attempts = 0; + + // Calculate M (number of bits needed to represent hash_size) + std::size_t M = 0; + for (auto size = hash_size; size > 0; size >>= 1) { + ++M; + } + if (M > 0) { + M--; + } + + std::uniform_int_distribution uniform_dist; + + // Try increasing values of M for better distribution + for (std::size_t pass = 0; pass < 4; ++pass, ++M) { + shift = 8 * sizeof(type_id) - M; + min_value = (std::numeric_limits::max)(); + max_value = (std::numeric_limits::min)(); + + if constexpr (InitializeContext::template has_option) { + ctx.tr << " trying with M = " << M << ", " << hash_size + << " buckets (minimal)\n"; + } + + std::size_t attempts = 0; + buckets.resize(hash_size); + + while (attempts < 100000) { + std::fill( + buckets.begin(), buckets.end(), type_id(detail::uintptr_max)); + ++attempts; + ++total_attempts; + mult = uniform_dist(rnd) | 1; + + bool collision_found = false; + for (auto iter = ctx.classes_begin(); iter != ctx.classes_end(); + ++iter) { + for (auto type_iter = iter->type_id_begin(); + type_iter != iter->type_id_end(); ++type_iter) { + auto type = *type_iter; + auto index = (detail::uintptr(type) * mult) >> shift; + + // For minimal perfect hash, index must be in [0, N) + if (index >= hash_size) { + collision_found = true; + goto collision; + } + + min_value = (std::min)(min_value, index); + max_value = (std::max)(max_value, index); + + if (detail::uintptr(buckets[index]) != + detail::uintptr_max) { + collision_found = true; + goto collision; + } + + buckets[index] = type; + } + } + + // Verify that we have a minimal perfect hash (all buckets used) + for (std::size_t i = 0; i < hash_size; ++i) { + if (detail::uintptr(buckets[i]) == detail::uintptr_max) { + collision_found = true; + goto collision; + } + } + + if constexpr (InitializeContext::template has_option) { + ctx.tr << " found " << mult << " after " << total_attempts + << " attempts; span = [" << min_value << ", " + << max_value << "], size = " << (max_value - min_value + 1) << "\n"; + } + + return; + + collision: {} + } + } + + search_error error; + error.attempts = total_attempts; + error.buckets = hash_size; + + if constexpr (Registry::has_error_handler) { + Registry::error_handler::error(error); + } + + abort(); +} + +template +void minimal_perfect_hash::fn::check(std::size_t index, type_id type) { + if (index < min_value || index > max_value || + detail::minimal_perfect_hash_control[index] != type) { + + if constexpr (Registry::has_error_handler) { + missing_class error; + error.type = type; + Registry::error_handler::error(error); + } + + abort(); + } +} + +template +auto minimal_perfect_hash::search_error::write(Stream& os) const -> void { + os << "could not find minimal perfect hash factors after " << attempts + << " attempts using " << buckets << " buckets\n"; +} + +} // namespace policies +} // namespace boost::openmethod + +#endif diff --git a/test/test_minimal_perfect_hash.cpp b/test/test_minimal_perfect_hash.cpp new file mode 100644 index 00000000..13c0b407 --- /dev/null +++ b/test/test_minimal_perfect_hash.cpp @@ -0,0 +1,252 @@ +// Copyright (c) 2018-2025 Jean-Louis Leroy +// Distributed under the Boost Software License, Version 1.0. +// See accompanying file LICENSE_1_0.txt +// or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include + +#define BOOST_TEST_MODULE minimal_perfect_hash +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "test_util.hpp" + +using namespace boost::openmethod; +using namespace boost::openmethod::policies; + +// Test registry with minimal_perfect_hash +struct minimal_hash_registry + : registry< + std_rtti, vptr_vector, minimal_perfect_hash, + default_error_handler, stderr_output> { +}; + +// Test registry with runtime checks +struct minimal_hash_registry_with_checks + : registry< + std_rtti, vptr_vector, minimal_perfect_hash, + default_error_handler, stderr_output, runtime_checks> { +}; + +namespace test_basic { + +struct Animal { + virtual ~Animal() {} +}; + +struct Dog : Animal {}; +struct Cat : Animal {}; +struct Bird : Animal {}; + +BOOST_OPENMETHOD_CLASSES(Animal, Dog, Cat, Bird, minimal_hash_registry); + +BOOST_OPENMETHOD(get_sound, (virtual_), std::string, minimal_hash_registry); + +BOOST_OPENMETHOD_OVERRIDE(get_sound, (const Dog&), std::string) { + return "woof"; +} + +BOOST_OPENMETHOD_OVERRIDE(get_sound, (const Cat&), std::string) { + return "meow"; +} + +BOOST_OPENMETHOD_OVERRIDE(get_sound, (const Bird&), std::string) { + return "chirp"; +} + +BOOST_AUTO_TEST_CASE(basic_functionality) { + initialize(); + + Dog dog; + Cat cat; + Bird bird; + + BOOST_TEST(get_sound(dog) == "woof"); + BOOST_TEST(get_sound(cat) == "meow"); + BOOST_TEST(get_sound(bird) == "chirp"); +} + +} // namespace test_basic + +namespace test_hash_properties { + +struct Base { + virtual ~Base() {} +}; + +struct D1 : Base {}; +struct D2 : Base {}; +struct D3 : Base {}; +struct D4 : Base {}; +struct D5 : Base {}; + +BOOST_OPENMETHOD_CLASSES(Base, D1, D2, D3, D4, D5, minimal_hash_registry); + +BOOST_OPENMETHOD(get_id, (virtual_), int, minimal_hash_registry); + +BOOST_OPENMETHOD_OVERRIDE(get_id, (const D1&), int) { + return 1; +} + +BOOST_OPENMETHOD_OVERRIDE(get_id, (const D2&), int) { + return 2; +} + +BOOST_OPENMETHOD_OVERRIDE(get_id, (const D3&), int) { + return 3; +} + +BOOST_OPENMETHOD_OVERRIDE(get_id, (const D4&), int) { + return 4; +} + +BOOST_OPENMETHOD_OVERRIDE(get_id, (const D5&), int) { + return 5; +} + +BOOST_AUTO_TEST_CASE(minimal_hash_properties) { + initialize(); + + // Test that all classes are correctly hashed + D1 d1; + D2 d2; + D3 d3; + D4 d4; + D5 d5; + + BOOST_TEST(get_id(d1) == 1); + BOOST_TEST(get_id(d2) == 2); + BOOST_TEST(get_id(d3) == 3); + BOOST_TEST(get_id(d4) == 4); + BOOST_TEST(get_id(d5) == 5); + + // Verify that the hash function produces a minimal perfect hash + // (This is implicit - if it didn't, initialization would fail or we'd get wrong results) +} + +} // namespace test_hash_properties + +namespace test_with_runtime_checks { + +struct Vehicle { + virtual ~Vehicle() {} +}; + +struct Car : Vehicle {}; +struct Bike : Vehicle {}; + +BOOST_OPENMETHOD_CLASSES(Vehicle, Car, Bike, minimal_hash_registry_with_checks); + +BOOST_OPENMETHOD(get_wheels, (virtual_), int, minimal_hash_registry_with_checks); + +BOOST_OPENMETHOD_OVERRIDE(get_wheels, (const Car&), int) { + return 4; +} + +BOOST_OPENMETHOD_OVERRIDE(get_wheels, (const Bike&), int) { + return 2; +} + +BOOST_AUTO_TEST_CASE(runtime_checks) { + initialize(); + + Car car; + Bike bike; + + BOOST_TEST(get_wheels(car) == 4); + BOOST_TEST(get_wheels(bike) == 2); +} + +} // namespace test_with_runtime_checks + +namespace test_empty { + +struct Empty { + virtual ~Empty() {} +}; + +BOOST_OPENMETHOD_CLASSES(Empty, minimal_hash_registry); + +BOOST_OPENMETHOD(process, (virtual_), int, minimal_hash_registry); + +BOOST_OPENMETHOD_OVERRIDE(process, (const Empty&), int) { + return 42; +} + +BOOST_AUTO_TEST_CASE(single_class) { + initialize(); + + Empty e; + BOOST_TEST(process(e) == 42); +} + +} // namespace test_empty + +namespace test_large_hierarchy { + +struct Root { + virtual ~Root() {} +}; + +struct L1_1 : Root {}; +struct L1_2 : Root {}; +struct L1_3 : Root {}; +struct L1_4 : Root {}; +struct L1_5 : Root {}; +struct L1_6 : Root {}; +struct L1_7 : Root {}; +struct L1_8 : Root {}; +struct L1_9 : Root {}; +struct L1_10 : Root {}; + +BOOST_OPENMETHOD_CLASSES(Root, L1_1, L1_2, L1_3, L1_4, L1_5, L1_6, L1_7, L1_8, L1_9, L1_10, minimal_hash_registry); + +BOOST_OPENMETHOD(classify, (virtual_), int, minimal_hash_registry); + +BOOST_OPENMETHOD_OVERRIDE(classify, (const L1_1&), int) { return 1; } +BOOST_OPENMETHOD_OVERRIDE(classify, (const L1_2&), int) { return 2; } +BOOST_OPENMETHOD_OVERRIDE(classify, (const L1_3&), int) { return 3; } +BOOST_OPENMETHOD_OVERRIDE(classify, (const L1_4&), int) { return 4; } +BOOST_OPENMETHOD_OVERRIDE(classify, (const L1_5&), int) { return 5; } +BOOST_OPENMETHOD_OVERRIDE(classify, (const L1_6&), int) { return 6; } +BOOST_OPENMETHOD_OVERRIDE(classify, (const L1_7&), int) { return 7; } +BOOST_OPENMETHOD_OVERRIDE(classify, (const L1_8&), int) { return 8; } +BOOST_OPENMETHOD_OVERRIDE(classify, (const L1_9&), int) { return 9; } +BOOST_OPENMETHOD_OVERRIDE(classify, (const L1_10&), int) { return 10; } + +BOOST_AUTO_TEST_CASE(larger_hierarchy) { + initialize(); + + L1_1 o1; + L1_2 o2; + L1_3 o3; + L1_4 o4; + L1_5 o5; + L1_6 o6; + L1_7 o7; + L1_8 o8; + L1_9 o9; + L1_10 o10; + + BOOST_TEST(classify(o1) == 1); + BOOST_TEST(classify(o2) == 2); + BOOST_TEST(classify(o3) == 3); + BOOST_TEST(classify(o4) == 4); + BOOST_TEST(classify(o5) == 5); + BOOST_TEST(classify(o6) == 6); + BOOST_TEST(classify(o7) == 7); + BOOST_TEST(classify(o8) == 8); + BOOST_TEST(classify(o9) == 9); + BOOST_TEST(classify(o10) == 10); +} + +} // namespace test_large_hierarchy From 63d23e79a43a23c50d87a5a58e995a12d20232c7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 04:47:10 +0000 Subject: [PATCH 03/15] Fix M calculation for minimal perfect hash to ensure enough bits Co-authored-by: jll63 <5083077+jll63@users.noreply.github.com> --- .../policies/minimal_perfect_hash.hpp | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/include/boost/openmethod/policies/minimal_perfect_hash.hpp b/include/boost/openmethod/policies/minimal_perfect_hash.hpp index 67f93d84..456a9e3d 100644 --- a/include/boost/openmethod/policies/minimal_perfect_hash.hpp +++ b/include/boost/openmethod/policies/minimal_perfect_hash.hpp @@ -169,18 +169,34 @@ void minimal_perfect_hash::fn::initialize( mult = 1; return; } + + if (hash_size == 1) { + // Special case: only one type, any hash function works + min_value = 0; + max_value = 0; + shift = 8 * sizeof(type_id); // Shift everything away, result is always 0 + mult = 1; + buckets.resize(1); + for (auto iter = ctx.classes_begin(); iter != ctx.classes_end(); ++iter) { + for (auto type_iter = iter->type_id_begin(); + type_iter != iter->type_id_end(); ++type_iter) { + buckets[0] = *type_iter; + } + } + return; + } std::default_random_engine rnd(13081963); std::size_t total_attempts = 0; - // Calculate M (number of bits needed to represent hash_size) + // Calculate M (number of bits needed for the hash range) + // We need 2^M >= hash_size, so M = ceil(log2(hash_size)) std::size_t M = 0; - for (auto size = hash_size; size > 0; size >>= 1) { + std::size_t power = 1; + while (power < hash_size) { + power <<= 1; ++M; } - if (M > 0) { - M--; - } std::uniform_int_distribution uniform_dist; From dc906d8a1f9bfdcf89ec1c75ed591b310f1c4f9b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 04:48:02 +0000 Subject: [PATCH 04/15] Add comments clarifying minimal perfect hash property Co-authored-by: jll63 <5083077+jll63@users.noreply.github.com> --- .../boost/openmethod/policies/minimal_perfect_hash.hpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/boost/openmethod/policies/minimal_perfect_hash.hpp b/include/boost/openmethod/policies/minimal_perfect_hash.hpp index 456a9e3d..8ee2e419 100644 --- a/include/boost/openmethod/policies/minimal_perfect_hash.hpp +++ b/include/boost/openmethod/policies/minimal_perfect_hash.hpp @@ -34,6 +34,12 @@ namespace policies { //! function for the set of registered type_ids. This means that the hash //! function is collision-free and the codomain is exactly the size of the //! domain, resulting in a dense range [0, n-1] for n inputs. +//! +//! Unlike @ref fast_perfect_hash, which uses a hash table of size 2^k +//! (typically larger than needed) and may have unused slots, this policy +//! ensures the hash table has exactly n slots for n type_ids, with all +//! slots filled. This minimizes memory usage but may require more search +//! attempts during initialization. struct minimal_perfect_hash : type_hash { //! Cannot find hash factors @@ -249,6 +255,9 @@ void minimal_perfect_hash::fn::initialize( } // Verify that we have a minimal perfect hash (all buckets used) + // This is the key difference from fast_perfect_hash: we require that + // all N buckets are filled, ensuring the codomain size equals the + // domain size, making it a truly minimal perfect hash function. for (std::size_t i = 0; i < hash_size; ++i) { if (detail::uintptr(buckets[i]) == detail::uintptr_max) { collision_found = true; From 88984a1d14644f53f76c9551b062047721af6d96 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 04:52:59 +0000 Subject: [PATCH 05/15] Implement proper PtHash algorithm with displacement tables Co-authored-by: jll63 <5083077+jll63@users.noreply.github.com> --- .../policies/minimal_perfect_hash.hpp | 252 ++++++++++++------ 1 file changed, 164 insertions(+), 88 deletions(-) diff --git a/include/boost/openmethod/policies/minimal_perfect_hash.hpp b/include/boost/openmethod/policies/minimal_perfect_hash.hpp index 8ee2e419..f049ab25 100644 --- a/include/boost/openmethod/policies/minimal_perfect_hash.hpp +++ b/include/boost/openmethod/policies/minimal_perfect_hash.hpp @@ -10,6 +10,8 @@ #include #include +#include +#include #ifdef _MSC_VER #pragma warning(push) #pragma warning(disable : 4702) // unreachable code @@ -19,9 +21,23 @@ namespace boost::openmethod { namespace detail { +#if defined(UINTPTR_MAX) +using uintptr = std::uintptr_t; +constexpr uintptr uintptr_max = UINTPTR_MAX; +#else +static_assert( + sizeof(std::size_t) == sizeof(void*), + "This implementation requires that size_t and void* have the same size."); +using uintptr = std::size_t; +constexpr uintptr uintptr_max = (std::numeric_limits::max)(); +#endif + template std::vector minimal_perfect_hash_control; +template +std::vector minimal_perfect_hash_displacements; + } // namespace detail namespace policies { @@ -66,8 +82,10 @@ struct minimal_perfect_hash : type_hash { class fn { static std::size_t mult; static std::size_t shift; - static std::size_t min_value; - static std::size_t max_value; + static std::size_t table_size; // N for minimal perfect hash + static std::size_t num_groups; + static std::size_t group_mult; + static std::size_t group_shift; static void check(std::size_t index, type_id type); @@ -77,18 +95,19 @@ struct minimal_perfect_hash : type_hash { const std::tuple& options); public: - //! Find the hash factors + //! Find the hash factors using PtHash algorithm //! - //! Attempts to find suitable values for the multiplication factor `M` - //! and the shift amount `N` that result in a minimal perfect hash - //! function for the specified input values. + //! Uses the PtHash algorithm to find: + //! - Pilot hash parameters (M, N) for H(x) = (M * x) >> N + //! - Bucket assignment parameters + //! - Displacement values for each bucket to achieve minimal perfect hashing //! //! If no suitable values are found, calls the error handler with - //! a @ref hash_error object then calls `abort`. + //! a @ref search_error object then calls `abort`. //! //! @tparam Context An @ref InitializeContext. //! @param ctx A Context object. - //! @return A pair containing the minimum and maximum hash values. + //! @return A pair containing the minimum (0) and maximum (n-1) hash values. template static auto initialize(const Context& ctx, const std::tuple& options) { @@ -100,12 +119,13 @@ struct minimal_perfect_hash : type_hash { initialize(ctx, buckets, options); } - return std::pair{min_value, max_value}; + return std::pair{std::size_t(0), table_size - 1}; } - //! Hash a type id + //! Hash a type id using the PtHash algorithm //! - //! Hash a type id. + //! Hash a type id using H(x) = (pilot(x) + disp[group(x)]) % N + //! where pilot(x) = (M * x) >> S and group(x) = (GM * x) >> GS. //! //! If `Registry` contains the @ref runtime_checks policy, checks that //! the type id is valid, i.e. if it was present in the set passed to @@ -118,8 +138,9 @@ struct minimal_perfect_hash : type_hash { //! @return The hash value BOOST_FORCEINLINE static auto hash(type_id type) -> std::size_t { - auto index = - (mult * reinterpret_cast(type)) >> shift; + auto pilot = (mult * reinterpret_cast(type)) >> shift; + auto group = (group_mult * reinterpret_cast(type)) >> group_shift; + auto index = (pilot + detail::minimal_perfect_hash_displacements[group]) % table_size; if constexpr (Registry::has_runtime_checks) { check(index, type); @@ -136,6 +157,7 @@ struct minimal_perfect_hash : type_hash { template static auto finalize(const std::tuple&) -> void { detail::minimal_perfect_hash_control.clear(); + detail::minimal_perfect_hash_displacements.clear(); } }; }; @@ -147,10 +169,16 @@ template std::size_t minimal_perfect_hash::fn::shift; template -std::size_t minimal_perfect_hash::fn::min_value; +std::size_t minimal_perfect_hash::fn::table_size; + +template +std::size_t minimal_perfect_hash::fn::num_groups; + +template +std::size_t minimal_perfect_hash::fn::group_mult; template -std::size_t minimal_perfect_hash::fn::max_value; +std::size_t minimal_perfect_hash::fn::group_shift; template template @@ -162,26 +190,30 @@ void minimal_perfect_hash::fn::initialize( const auto N = std::distance(ctx.classes_begin(), ctx.classes_end()); if constexpr (mp11::mp_contains, trace>::value) { - Registry::output::os << "Finding minimal perfect hash factors for " << N << " types\n"; + Registry::output::os << "Finding minimal perfect hash using PtHash for " << N << " types\n"; } - // For minimal perfect hash, we need exactly N buckets - std::size_t hash_size = N; + // Table size is exactly N for minimal perfect hash + table_size = N; - if (hash_size == 0) { - min_value = 0; - max_value = 0; + if (table_size == 0) { shift = 0; mult = 1; + num_groups = 0; + group_mult = 1; + group_shift = 0; + detail::minimal_perfect_hash_displacements.clear(); return; } - if (hash_size == 1) { - // Special case: only one type, any hash function works - min_value = 0; - max_value = 0; - shift = 8 * sizeof(type_id); // Shift everything away, result is always 0 + if (table_size == 1) { + // Special case: only one type + shift = 8 * sizeof(type_id); mult = 1; + num_groups = 1; + group_mult = 1; + group_shift = 8 * sizeof(type_id); + detail::minimal_perfect_hash_displacements.assign(1, 0); buckets.resize(1); for (auto iter = ctx.classes_begin(); iter != ctx.classes_end(); ++iter) { for (auto type_iter = iter->type_id_begin(); @@ -192,94 +224,138 @@ void minimal_perfect_hash::fn::initialize( return; } + // Collect all type_ids + std::vector keys; + for (auto iter = ctx.classes_begin(); iter != ctx.classes_end(); ++iter) { + for (auto type_iter = iter->type_id_begin(); + type_iter != iter->type_id_end(); ++type_iter) { + keys.push_back(*type_iter); + } + } + std::default_random_engine rnd(13081963); + std::uniform_int_distribution uniform_dist; std::size_t total_attempts = 0; + + // PtHash algorithm: partition keys into groups, then find displacements + // Number of groups: typically sqrt(N) to N/4 for good performance + num_groups = (std::max)(std::size_t(1), table_size / 4); + if (num_groups > table_size) num_groups = table_size; - // Calculate M (number of bits needed for the hash range) - // We need 2^M >= hash_size, so M = ceil(log2(hash_size)) - std::size_t M = 0; + // Calculate bits needed for num_groups + std::size_t GM = 0; std::size_t power = 1; - while (power < hash_size) { + while (power < num_groups) { power <<= 1; - ++M; + ++GM; } + group_shift = 8 * sizeof(type_id) - GM; - std::uniform_int_distribution uniform_dist; + if constexpr (InitializeContext::template has_option) { + ctx.tr << " Using " << num_groups << " groups for " << table_size << " keys\n"; + } - // Try increasing values of M for better distribution - for (std::size_t pass = 0; pass < 4; ++pass, ++M) { + // Try different pilot hash parameters + for (std::size_t pass = 0; pass < 10 && total_attempts < 100000; ++pass) { + mult = uniform_dist(rnd) | 1; + group_mult = uniform_dist(rnd) | 1; + + // Calculate M for pilot hash (number of bits for table_size range) + std::size_t M = 0; + power = 1; + while (power < table_size * 2) { // Use 2*N for better distribution + power <<= 1; + ++M; + } shift = 8 * sizeof(type_id) - M; - min_value = (std::numeric_limits::max)(); - max_value = (std::numeric_limits::min)(); - if constexpr (InitializeContext::template has_option) { - ctx.tr << " trying with M = " << M << ", " << hash_size - << " buckets (minimal)\n"; + // Partition keys into groups + std::vector> groups(num_groups); + for (auto key : keys) { + auto group_idx = ((group_mult * reinterpret_cast(key)) >> group_shift) % num_groups; + groups[group_idx].push_back(key); } - std::size_t attempts = 0; - buckets.resize(hash_size); - - while (attempts < 100000) { - std::fill( - buckets.begin(), buckets.end(), type_id(detail::uintptr_max)); - ++attempts; - ++total_attempts; - mult = uniform_dist(rnd) | 1; - - bool collision_found = false; - for (auto iter = ctx.classes_begin(); iter != ctx.classes_end(); - ++iter) { - for (auto type_iter = iter->type_id_begin(); - type_iter != iter->type_id_end(); ++type_iter) { - auto type = *type_iter; - auto index = (detail::uintptr(type) * mult) >> shift; - - // For minimal perfect hash, index must be in [0, N) - if (index >= hash_size) { - collision_found = true; - goto collision; - } - - min_value = (std::min)(min_value, index); - max_value = (std::max)(max_value, index); - - if (detail::uintptr(buckets[index]) != - detail::uintptr_max) { - collision_found = true; - goto collision; + // Try to find displacements for each group + detail::minimal_perfect_hash_displacements.assign(num_groups, 0); + buckets.assign(table_size, type_id(detail::uintptr_max)); + std::vector used(table_size, false); + bool success = true; + + // Process groups in descending order of size (larger groups first) + std::vector group_order(num_groups); + for (std::size_t i = 0; i < num_groups; ++i) group_order[i] = i; + std::sort(group_order.begin(), group_order.end(), + [&groups](std::size_t a, std::size_t b) { + return groups[a].size() > groups[b].size(); + }); + + for (auto g : group_order) { + if (groups[g].empty()) continue; + + // Try different displacement values + bool found = false; + for (std::size_t disp = 0; disp < table_size * 2 && !found; ++disp) { + ++total_attempts; + if (total_attempts > 100000) { + success = false; + break; + } + + // Check if this displacement works for all keys in group + std::vector positions; + bool valid = true; + for (auto key : groups[g]) { + auto pilot = (mult * reinterpret_cast(key)) >> shift; + auto pos = (pilot + disp) % table_size; + if (used[pos]) { + valid = false; + break; } + positions.push_back(pos); + } - buckets[index] = type; + if (valid) { + // Mark positions as used and store keys + detail::minimal_perfect_hash_displacements[g] = disp; + for (std::size_t i = 0; i < groups[g].size(); ++i) { + used[positions[i]] = true; + buckets[positions[i]] = groups[g][i]; + } + found = true; } } - // Verify that we have a minimal perfect hash (all buckets used) - // This is the key difference from fast_perfect_hash: we require that - // all N buckets are filled, ensuring the codomain size equals the - // domain size, making it a truly minimal perfect hash function. - for (std::size_t i = 0; i < hash_size; ++i) { + if (!found) { + success = false; + break; + } + } + + if (success) { + // Verify all positions are used (minimal property) + bool all_used = true; + for (std::size_t i = 0; i < table_size; ++i) { if (detail::uintptr(buckets[i]) == detail::uintptr_max) { - collision_found = true; - goto collision; + all_used = false; + break; } } - if constexpr (InitializeContext::template has_option) { - ctx.tr << " found " << mult << " after " << total_attempts - << " attempts; span = [" << min_value << ", " - << max_value << "], size = " << (max_value - min_value + 1) << "\n"; + if (all_used) { + if constexpr (InitializeContext::template has_option) { + ctx.tr << " Found minimal perfect hash after " << total_attempts + << " attempts\n"; + } + return; } - - return; - - collision: {} } } + // Failed to find minimal perfect hash search_error error; error.attempts = total_attempts; - error.buckets = hash_size; + error.buckets = table_size; if constexpr (Registry::has_error_handler) { Registry::error_handler::error(error); @@ -290,7 +366,7 @@ void minimal_perfect_hash::fn::initialize( template void minimal_perfect_hash::fn::check(std::size_t index, type_id type) { - if (index < min_value || index > max_value || + if (index >= table_size || detail::minimal_perfect_hash_control[index] != type) { if constexpr (Registry::has_error_handler) { From fe45caf13cef39e5e539dedd43f357c36e038b4c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 04:55:03 +0000 Subject: [PATCH 06/15] Address code review feedback: add named constants and fix trace checking Co-authored-by: jll63 <5083077+jll63@users.noreply.github.com> --- .../policies/minimal_perfect_hash.hpp | 33 ++++++++++++------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/include/boost/openmethod/policies/minimal_perfect_hash.hpp b/include/boost/openmethod/policies/minimal_perfect_hash.hpp index f049ab25..b9e93a09 100644 --- a/include/boost/openmethod/policies/minimal_perfect_hash.hpp +++ b/include/boost/openmethod/policies/minimal_perfect_hash.hpp @@ -189,8 +189,8 @@ void minimal_perfect_hash::fn::initialize( const auto N = std::distance(ctx.classes_begin(), ctx.classes_end()); - if constexpr (mp11::mp_contains, trace>::value) { - Registry::output::os << "Finding minimal perfect hash using PtHash for " << N << " types\n"; + if constexpr (InitializeContext::template has_option) { + ctx.tr << "Finding minimal perfect hash using PtHash for " << N << " types\n"; } // Table size is exactly N for minimal perfect hash @@ -208,11 +208,12 @@ void minimal_perfect_hash::fn::initialize( if (table_size == 1) { // Special case: only one type - shift = 8 * sizeof(type_id); + constexpr std::size_t bits_per_type_id = 8 * sizeof(type_id); + shift = bits_per_type_id; mult = 1; num_groups = 1; group_mult = 1; - group_shift = 8 * sizeof(type_id); + group_shift = bits_per_type_id; detail::minimal_perfect_hash_displacements.assign(1, 0); buckets.resize(1); for (auto iter = ctx.classes_begin(); iter != ctx.classes_end(); ++iter) { @@ -233,13 +234,21 @@ void minimal_perfect_hash::fn::initialize( } } - std::default_random_engine rnd(13081963); + // Constants for PtHash algorithm + constexpr std::size_t DEFAULT_RANDOM_SEED = 13081963; // Same seed as fast_perfect_hash + constexpr std::size_t MAX_PASSES = 10; + constexpr std::size_t MAX_ATTEMPTS = 100000; + constexpr std::size_t DEFAULT_GROUP_DIVISOR = 4; // N/4 groups for balance between memory and speed + constexpr std::size_t DISTRIBUTION_FACTOR = 2; // 2*N range for better distribution + constexpr std::size_t bits_per_type_id = 8 * sizeof(type_id); + + std::default_random_engine rnd(DEFAULT_RANDOM_SEED); std::uniform_int_distribution uniform_dist; std::size_t total_attempts = 0; // PtHash algorithm: partition keys into groups, then find displacements // Number of groups: typically sqrt(N) to N/4 for good performance - num_groups = (std::max)(std::size_t(1), table_size / 4); + num_groups = (std::max)(std::size_t(1), table_size / DEFAULT_GROUP_DIVISOR); if (num_groups > table_size) num_groups = table_size; // Calculate bits needed for num_groups @@ -249,25 +258,25 @@ void minimal_perfect_hash::fn::initialize( power <<= 1; ++GM; } - group_shift = 8 * sizeof(type_id) - GM; + group_shift = bits_per_type_id - GM; if constexpr (InitializeContext::template has_option) { ctx.tr << " Using " << num_groups << " groups for " << table_size << " keys\n"; } // Try different pilot hash parameters - for (std::size_t pass = 0; pass < 10 && total_attempts < 100000; ++pass) { + for (std::size_t pass = 0; pass < MAX_PASSES && total_attempts < MAX_ATTEMPTS; ++pass) { mult = uniform_dist(rnd) | 1; group_mult = uniform_dist(rnd) | 1; // Calculate M for pilot hash (number of bits for table_size range) std::size_t M = 0; power = 1; - while (power < table_size * 2) { // Use 2*N for better distribution + while (power < table_size * DISTRIBUTION_FACTOR) { power <<= 1; ++M; } - shift = 8 * sizeof(type_id) - M; + shift = bits_per_type_id - M; // Partition keys into groups std::vector> groups(num_groups); @@ -295,9 +304,9 @@ void minimal_perfect_hash::fn::initialize( // Try different displacement values bool found = false; - for (std::size_t disp = 0; disp < table_size * 2 && !found; ++disp) { + for (std::size_t disp = 0; disp < table_size * DISTRIBUTION_FACTOR && !found; ++disp) { ++total_attempts; - if (total_attempts > 100000) { + if (total_attempts > MAX_ATTEMPTS) { success = false; break; } From b4866427482cee9837030179af07fd76e31d5f48 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 04:56:37 +0000 Subject: [PATCH 07/15] Optimize positions vector allocation with reserve() Co-authored-by: jll63 <5083077+jll63@users.noreply.github.com> --- _codeql_detected_source_root | 1 + include/boost/openmethod/policies/minimal_perfect_hash.hpp | 1 + 2 files changed, 2 insertions(+) create mode 120000 _codeql_detected_source_root diff --git a/_codeql_detected_source_root b/_codeql_detected_source_root new file mode 120000 index 00000000..945c9b46 --- /dev/null +++ b/_codeql_detected_source_root @@ -0,0 +1 @@ +. \ No newline at end of file diff --git a/include/boost/openmethod/policies/minimal_perfect_hash.hpp b/include/boost/openmethod/policies/minimal_perfect_hash.hpp index b9e93a09..7a693ca6 100644 --- a/include/boost/openmethod/policies/minimal_perfect_hash.hpp +++ b/include/boost/openmethod/policies/minimal_perfect_hash.hpp @@ -313,6 +313,7 @@ void minimal_perfect_hash::fn::initialize( // Check if this displacement works for all keys in group std::vector positions; + positions.reserve(groups[g].size()); bool valid = true; for (auto key : groups[g]) { auto pilot = (mult * reinterpret_cast(key)) >> shift; From 7e44f683ea4ea7d5566da0c157dd39ee2c82d59d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 05:33:17 +0000 Subject: [PATCH 08/15] Allow up to 10% bucket waste in minimal_perfect_hash Co-authored-by: jll63 <5083077+jll63@users.noreply.github.com> --- .../policies/minimal_perfect_hash.hpp | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/include/boost/openmethod/policies/minimal_perfect_hash.hpp b/include/boost/openmethod/policies/minimal_perfect_hash.hpp index 7a693ca6..c32af666 100644 --- a/include/boost/openmethod/policies/minimal_perfect_hash.hpp +++ b/include/boost/openmethod/policies/minimal_perfect_hash.hpp @@ -48,14 +48,14 @@ namespace policies { //! function in the form `H(x)=(M*x)>>N`. It uses the PtHash algorithm to //! determine values for `M` and `N` that result in a minimal perfect hash //! function for the set of registered type_ids. This means that the hash -//! function is collision-free and the codomain is exactly the size of the -//! domain, resulting in a dense range [0, n-1] for n inputs. +//! function is collision-free and the codomain is approximately the size of +//! the domain, resulting in a dense range [0, n-1] for n inputs. //! //! Unlike @ref fast_perfect_hash, which uses a hash table of size 2^k //! (typically larger than needed) and may have unused slots, this policy -//! ensures the hash table has exactly n slots for n type_ids, with all -//! slots filled. This minimizes memory usage but may require more search -//! attempts during initialization. +//! uses approximately 1.1*n slots for n type_ids (allowing up to 10% waste). +//! This minimizes memory usage while maintaining good search performance +//! during initialization. struct minimal_perfect_hash : type_hash { //! Cannot find hash factors @@ -193,8 +193,11 @@ void minimal_perfect_hash::fn::initialize( ctx.tr << "Finding minimal perfect hash using PtHash for " << N << " types\n"; } - // Table size is exactly N for minimal perfect hash - table_size = N; + // Table size is N * 1.1 to allow up to 10% waste (makes finding hash easier) + table_size = N + N / 10; + if (table_size == N && N > 0) { + table_size = N + 1; // Ensure at least 1 extra slot for N > 0 + } if (table_size == 0) { shift = 0; @@ -241,6 +244,7 @@ void minimal_perfect_hash::fn::initialize( constexpr std::size_t DEFAULT_GROUP_DIVISOR = 4; // N/4 groups for balance between memory and speed constexpr std::size_t DISTRIBUTION_FACTOR = 2; // 2*N range for better distribution constexpr std::size_t bits_per_type_id = 8 * sizeof(type_id); + // Allow 10% waste to make finding a hash function easier while still being memory-efficient std::default_random_engine rnd(DEFAULT_RANDOM_SEED); std::uniform_int_distribution uniform_dist; @@ -343,19 +347,20 @@ void minimal_perfect_hash::fn::initialize( } if (success) { - // Verify all positions are used (minimal property) - bool all_used = true; + // Count how many positions are used + std::size_t used_count = 0; for (std::size_t i = 0; i < table_size; ++i) { - if (detail::uintptr(buckets[i]) == detail::uintptr_max) { - all_used = false; - break; + if (detail::uintptr(buckets[i]) != detail::uintptr_max) { + used_count++; } } - if (all_used) { + // Accept if we've placed all keys (allow up to 10% waste) + if (used_count == keys.size()) { if constexpr (InitializeContext::template has_option) { ctx.tr << " Found minimal perfect hash after " << total_attempts - << " attempts\n"; + << " attempts; " << used_count << "/" << table_size + << " slots used\n"; } return; } From d2885bad9428e1a901b7ddf91fa60dfea94632f9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 05:35:01 +0000 Subject: [PATCH 09/15] Fix table size calculation and documentation for 10% waste Co-authored-by: jll63 <5083077+jll63@users.noreply.github.com> --- .../boost/openmethod/policies/minimal_perfect_hash.hpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/include/boost/openmethod/policies/minimal_perfect_hash.hpp b/include/boost/openmethod/policies/minimal_perfect_hash.hpp index c32af666..78353522 100644 --- a/include/boost/openmethod/policies/minimal_perfect_hash.hpp +++ b/include/boost/openmethod/policies/minimal_perfect_hash.hpp @@ -49,7 +49,7 @@ namespace policies { //! determine values for `M` and `N` that result in a minimal perfect hash //! function for the set of registered type_ids. This means that the hash //! function is collision-free and the codomain is approximately the size of -//! the domain, resulting in a dense range [0, n-1] for n inputs. +//! the domain, resulting in a dense range [0, 1.1*n-1] for n inputs. //! //! Unlike @ref fast_perfect_hash, which uses a hash table of size 2^k //! (typically larger than needed) and may have unused slots, this policy @@ -194,10 +194,8 @@ void minimal_perfect_hash::fn::initialize( } // Table size is N * 1.1 to allow up to 10% waste (makes finding hash easier) - table_size = N + N / 10; - if (table_size == N && N > 0) { - table_size = N + 1; // Ensure at least 1 extra slot for N > 0 - } + // Use (N * 11 + 9) / 10 to ensure proper rounding up for small N + table_size = (N * 11 + 9) / 10; if (table_size == 0) { shift = 0; From 509fcaaf4b1c90ff27e47c56984c793a22d09956 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 05:36:04 +0000 Subject: [PATCH 10/15] Improve code readability with named constants for waste factor Co-authored-by: jll63 <5083077+jll63@users.noreply.github.com> --- .../boost/openmethod/policies/minimal_perfect_hash.hpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/boost/openmethod/policies/minimal_perfect_hash.hpp b/include/boost/openmethod/policies/minimal_perfect_hash.hpp index 78353522..5f916a51 100644 --- a/include/boost/openmethod/policies/minimal_perfect_hash.hpp +++ b/include/boost/openmethod/policies/minimal_perfect_hash.hpp @@ -194,8 +194,11 @@ void minimal_perfect_hash::fn::initialize( } // Table size is N * 1.1 to allow up to 10% waste (makes finding hash easier) - // Use (N * 11 + 9) / 10 to ensure proper rounding up for small N - table_size = (N * 11 + 9) / 10; + // Formula: ceil(N * 1.1) = (N * 11 + 9) / 10 ensures proper rounding for all N + constexpr std::size_t WASTE_FACTOR_NUMERATOR = 11; // 1.1 = 11/10 + constexpr std::size_t WASTE_FACTOR_DENOMINATOR = 10; + constexpr std::size_t ROUNDING_ADJUSTMENT = 9; // For ceiling division + table_size = (N * WASTE_FACTOR_NUMERATOR + ROUNDING_ADJUSTMENT) / WASTE_FACTOR_DENOMINATOR; if (table_size == 0) { shift = 0; @@ -242,7 +245,6 @@ void minimal_perfect_hash::fn::initialize( constexpr std::size_t DEFAULT_GROUP_DIVISOR = 4; // N/4 groups for balance between memory and speed constexpr std::size_t DISTRIBUTION_FACTOR = 2; // 2*N range for better distribution constexpr std::size_t bits_per_type_id = 8 * sizeof(type_id); - // Allow 10% waste to make finding a hash function easier while still being memory-efficient std::default_random_engine rnd(DEFAULT_RANDOM_SEED); std::uniform_int_distribution uniform_dist; From 39b8412b672ace07780c64d0e7f0bfe3c311283b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 05:41:45 +0000 Subject: [PATCH 11/15] Remove trailing spaces Co-authored-by: jll63 <5083077+jll63@users.noreply.github.com> --- .../openmethod/policies/minimal_perfect_hash.hpp | 16 ++++++++-------- test/test_minimal_perfect_hash.cpp | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/boost/openmethod/policies/minimal_perfect_hash.hpp b/include/boost/openmethod/policies/minimal_perfect_hash.hpp index 5f916a51..866f9e0e 100644 --- a/include/boost/openmethod/policies/minimal_perfect_hash.hpp +++ b/include/boost/openmethod/policies/minimal_perfect_hash.hpp @@ -99,7 +99,7 @@ struct minimal_perfect_hash : type_hash { //! //! Uses the PtHash algorithm to find: //! - Pilot hash parameters (M, N) for H(x) = (M * x) >> N - //! - Bucket assignment parameters + //! - Bucket assignment parameters //! - Displacement values for each bucket to achieve minimal perfect hashing //! //! If no suitable values are found, calls the error handler with @@ -199,7 +199,7 @@ void minimal_perfect_hash::fn::initialize( constexpr std::size_t WASTE_FACTOR_DENOMINATOR = 10; constexpr std::size_t ROUNDING_ADJUSTMENT = 9; // For ceiling division table_size = (N * WASTE_FACTOR_NUMERATOR + ROUNDING_ADJUSTMENT) / WASTE_FACTOR_DENOMINATOR; - + if (table_size == 0) { shift = 0; mult = 1; @@ -209,7 +209,7 @@ void minimal_perfect_hash::fn::initialize( detail::minimal_perfect_hash_displacements.clear(); return; } - + if (table_size == 1) { // Special case: only one type constexpr std::size_t bits_per_type_id = 8 * sizeof(type_id); @@ -254,7 +254,7 @@ void minimal_perfect_hash::fn::initialize( // Number of groups: typically sqrt(N) to N/4 for good performance num_groups = (std::max)(std::size_t(1), table_size / DEFAULT_GROUP_DIVISOR); if (num_groups > table_size) num_groups = table_size; - + // Calculate bits needed for num_groups std::size_t GM = 0; std::size_t power = 1; @@ -272,7 +272,7 @@ void minimal_perfect_hash::fn::initialize( for (std::size_t pass = 0; pass < MAX_PASSES && total_attempts < MAX_ATTEMPTS; ++pass) { mult = uniform_dist(rnd) | 1; group_mult = uniform_dist(rnd) | 1; - + // Calculate M for pilot hash (number of bits for table_size range) std::size_t M = 0; power = 1; @@ -298,7 +298,7 @@ void minimal_perfect_hash::fn::initialize( // Process groups in descending order of size (larger groups first) std::vector group_order(num_groups); for (std::size_t i = 0; i < num_groups; ++i) group_order[i] = i; - std::sort(group_order.begin(), group_order.end(), + std::sort(group_order.begin(), group_order.end(), [&groups](std::size_t a, std::size_t b) { return groups[a].size() > groups[b].size(); }); @@ -359,7 +359,7 @@ void minimal_perfect_hash::fn::initialize( if (used_count == keys.size()) { if constexpr (InitializeContext::template has_option) { ctx.tr << " Found minimal perfect hash after " << total_attempts - << " attempts; " << used_count << "/" << table_size + << " attempts; " << used_count << "/" << table_size << " slots used\n"; } return; @@ -396,7 +396,7 @@ void minimal_perfect_hash::fn::check(std::size_t index, type_id type) template auto minimal_perfect_hash::search_error::write(Stream& os) const -> void { - os << "could not find minimal perfect hash factors after " << attempts + os << "could not find minimal perfect hash factors after " << attempts << " attempts using " << buckets << " buckets\n"; } diff --git a/test/test_minimal_perfect_hash.cpp b/test/test_minimal_perfect_hash.cpp index 13c0b407..e8adb0d0 100644 --- a/test/test_minimal_perfect_hash.cpp +++ b/test/test_minimal_perfect_hash.cpp @@ -128,7 +128,7 @@ BOOST_AUTO_TEST_CASE(minimal_hash_properties) { BOOST_TEST(get_id(d3) == 3); BOOST_TEST(get_id(d4) == 4); BOOST_TEST(get_id(d5) == 5); - + // Verify that the hash function produces a minimal perfect hash // (This is implicit - if it didn't, initialization would fail or we'd get wrong results) } From 8e1603766e614c6de07e1cd2dfa0d87a550c9f4e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 05:50:20 +0000 Subject: [PATCH 12/15] Remove duplicate uintptr definitions (lines 23-33) Co-authored-by: jll63 <5083077+jll63@users.noreply.github.com> --- .../openmethod/policies/minimal_perfect_hash.hpp | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/include/boost/openmethod/policies/minimal_perfect_hash.hpp b/include/boost/openmethod/policies/minimal_perfect_hash.hpp index 866f9e0e..ef0082cb 100644 --- a/include/boost/openmethod/policies/minimal_perfect_hash.hpp +++ b/include/boost/openmethod/policies/minimal_perfect_hash.hpp @@ -21,17 +21,6 @@ namespace boost::openmethod { namespace detail { -#if defined(UINTPTR_MAX) -using uintptr = std::uintptr_t; -constexpr uintptr uintptr_max = UINTPTR_MAX; -#else -static_assert( - sizeof(std::size_t) == sizeof(void*), - "This implementation requires that size_t and void* have the same size."); -using uintptr = std::size_t; -constexpr uintptr uintptr_max = (std::numeric_limits::max)(); -#endif - template std::vector minimal_perfect_hash_control; From 2a3145718be4650aba656adabd6862dc0a5efc6b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 05:54:25 +0000 Subject: [PATCH 13/15] Move uintptr definitions to separate namespace declaration Co-authored-by: jll63 <5083077+jll63@users.noreply.github.com> --- .../openmethod/policies/minimal_perfect_hash.hpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/boost/openmethod/policies/minimal_perfect_hash.hpp b/include/boost/openmethod/policies/minimal_perfect_hash.hpp index ef0082cb..a32b60c5 100644 --- a/include/boost/openmethod/policies/minimal_perfect_hash.hpp +++ b/include/boost/openmethod/policies/minimal_perfect_hash.hpp @@ -17,6 +17,21 @@ #pragma warning(disable : 4702) // unreachable code #endif +namespace boost::openmethod::detail { + +#if defined(UINTPTR_MAX) +using uintptr = std::uintptr_t; +constexpr uintptr uintptr_max = UINTPTR_MAX; +#else +static_assert( + sizeof(std::size_t) == sizeof(void*), + "This implementation requires that size_t and void* have the same size."); +using uintptr = std::size_t; +constexpr uintptr uintptr_max = (std::numeric_limits::max)(); +#endif + +} // namespace boost::openmethod::detail + namespace boost::openmethod { namespace detail { From 508687c5d765a2643f1ff84fb3616d0beb4305c1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 05:57:56 +0000 Subject: [PATCH 14/15] Use anonymous namespace for uintptr definitions Co-authored-by: jll63 <5083077+jll63@users.noreply.github.com> --- .../openmethod/policies/minimal_perfect_hash.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/boost/openmethod/policies/minimal_perfect_hash.hpp b/include/boost/openmethod/policies/minimal_perfect_hash.hpp index a32b60c5..6f492e07 100644 --- a/include/boost/openmethod/policies/minimal_perfect_hash.hpp +++ b/include/boost/openmethod/policies/minimal_perfect_hash.hpp @@ -17,7 +17,7 @@ #pragma warning(disable : 4702) // unreachable code #endif -namespace boost::openmethod::detail { +namespace { #if defined(UINTPTR_MAX) using uintptr = std::uintptr_t; @@ -30,7 +30,7 @@ using uintptr = std::size_t; constexpr uintptr uintptr_max = (std::numeric_limits::max)(); #endif -} // namespace boost::openmethod::detail +} // anonymous namespace namespace boost::openmethod { @@ -142,8 +142,8 @@ struct minimal_perfect_hash : type_hash { //! @return The hash value BOOST_FORCEINLINE static auto hash(type_id type) -> std::size_t { - auto pilot = (mult * reinterpret_cast(type)) >> shift; - auto group = (group_mult * reinterpret_cast(type)) >> group_shift; + auto pilot = (mult * reinterpret_cast(type)) >> shift; + auto group = (group_mult * reinterpret_cast(type)) >> group_shift; auto index = (pilot + detail::minimal_perfect_hash_displacements[group]) % table_size; if constexpr (Registry::has_runtime_checks) { @@ -289,13 +289,13 @@ void minimal_perfect_hash::fn::initialize( // Partition keys into groups std::vector> groups(num_groups); for (auto key : keys) { - auto group_idx = ((group_mult * reinterpret_cast(key)) >> group_shift) % num_groups; + auto group_idx = ((group_mult * reinterpret_cast(key)) >> group_shift) % num_groups; groups[group_idx].push_back(key); } // Try to find displacements for each group detail::minimal_perfect_hash_displacements.assign(num_groups, 0); - buckets.assign(table_size, type_id(detail::uintptr_max)); + buckets.assign(table_size, type_id(uintptr_max)); std::vector used(table_size, false); bool success = true; @@ -324,7 +324,7 @@ void minimal_perfect_hash::fn::initialize( positions.reserve(groups[g].size()); bool valid = true; for (auto key : groups[g]) { - auto pilot = (mult * reinterpret_cast(key)) >> shift; + auto pilot = (mult * reinterpret_cast(key)) >> shift; auto pos = (pilot + disp) % table_size; if (used[pos]) { valid = false; @@ -354,7 +354,7 @@ void minimal_perfect_hash::fn::initialize( // Count how many positions are used std::size_t used_count = 0; for (std::size_t i = 0; i < table_size; ++i) { - if (detail::uintptr(buckets[i]) != detail::uintptr_max) { + if (uintptr(buckets[i]) != uintptr_max) { used_count++; } } From 22673b0f45e99ddced51ad91c58b6ab23ccb264a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 06:05:28 +0000 Subject: [PATCH 15/15] Fix potential overflow in group hash calculation by using uint32_t multiplier Co-authored-by: jll63 <5083077+jll63@users.noreply.github.com> --- .../boost/openmethod/policies/minimal_perfect_hash.hpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/boost/openmethod/policies/minimal_perfect_hash.hpp b/include/boost/openmethod/policies/minimal_perfect_hash.hpp index 6f492e07..c07e53aa 100644 --- a/include/boost/openmethod/policies/minimal_perfect_hash.hpp +++ b/include/boost/openmethod/policies/minimal_perfect_hash.hpp @@ -88,7 +88,7 @@ struct minimal_perfect_hash : type_hash { static std::size_t shift; static std::size_t table_size; // N for minimal perfect hash static std::size_t num_groups; - static std::size_t group_mult; + static std::uint32_t group_mult; // Smaller type to avoid overflow static std::size_t group_shift; static void check(std::size_t index, type_id type); @@ -179,7 +179,7 @@ template std::size_t minimal_perfect_hash::fn::num_groups; template -std::size_t minimal_perfect_hash::fn::group_mult; +std::uint32_t minimal_perfect_hash::fn::group_mult; template std::size_t minimal_perfect_hash::fn::group_shift; @@ -275,7 +275,10 @@ void minimal_perfect_hash::fn::initialize( // Try different pilot hash parameters for (std::size_t pass = 0; pass < MAX_PASSES && total_attempts < MAX_ATTEMPTS; ++pass) { mult = uniform_dist(rnd) | 1; - group_mult = uniform_dist(rnd) | 1; + // Use a smaller multiplier for group hash to avoid overflow + // We only need enough bits to distinguish between num_groups + std::uniform_int_distribution group_dist; + group_mult = group_dist(rnd) | 1; // Calculate M for pilot hash (number of bits for table_size range) std::size_t M = 0;