diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 6b03cb98c..00c5a46f3 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -62,7 +62,8 @@ ConfigureBench(STATIC_MAP_BENCH
   hash_table/static_map/insert_bench.cu
   hash_table/static_map/find_bench.cu
   hash_table/static_map/contains_bench.cu
-  hash_table/static_map/erase_bench.cu)
+  hash_table/static_map/erase_bench.cu
+  hash_table/static_map/insert_or_apply_bench.cu)
 
 ###################################################################################################
 # - static_multimap benchmarks --------------------------------------------------------------------
diff --git a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
new file mode 100644
index 000000000..725f7c15e
--- /dev/null
+++ b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include
+#include
+
+using namespace cuco::benchmark;
+using namespace cuco::utility;
+
+/**
+ * @brief A benchmark evaluating `cuco::static_map::insert_or_apply` performance
+ */
+template <typename Key, typename Value, typename Dist>
+std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert_or_apply(
+  nvbench::state& state, nvbench::type_list<Key, Value, Dist>)
+{
+  using pair_type = cuco::pair<Key, Value>;
+
+  auto const num_keys     = state.get_int64_or_default("NumInputs", defaults::N);
+  auto const occupancy    = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY);
+  auto const multiplicity = state.get_int64_or_default("Multiplicity", defaults::MULTIPLICITY);
+
+  std::size_t const size = cuco::detail::int_div_ceil(num_keys, multiplicity) / occupancy;
+
+  thrust::device_vector<Key> keys(num_keys);
+
+  key_generator gen;
+  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+
+  thrust::device_vector<pair_type> pairs(num_keys);
+  thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) {
+    return pair_type(key, static_cast<Value>(key));
+  });
+
+  state.add_element_count(num_keys);
+
+  cuco::experimental::static_map<Key, Value> map{
+    size, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+
+  state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
+    map.clear_async({launch.get_stream()});
+
+    timer.start();
+    map.insert_or_apply_async(
+      pairs.begin(), pairs.end(), cuco::experimental::op::reduce::sum, {launch.get_stream()});
+    timer.stop();
+  });
+}
+
+template <typename Key, typename Value, typename Dist>
+std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_insert_or_apply(
+  nvbench::state& state, nvbench::type_list<Key, Value, Dist>)
+{
+  state.skip("Key should be the same type as Value.");
+}
+
+NVBENCH_BENCH_TYPES(static_map_insert_or_apply,
+                    NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
+                                      defaults::VALUE_TYPE_RANGE,
+                                      nvbench::type_list<distribution::uniform>))
+  .set_name("static_map_insert_or_apply_uniform_multiplicity")
+  .set_type_axes_names({"Key", "Value", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE);
+
+NVBENCH_BENCH_TYPES(static_map_insert_or_apply,
+                    NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
+                                      defaults::VALUE_TYPE_RANGE,
+                                      nvbench::type_list<distribution::uniform>))
+  .set_name("static_map_insert_or_apply_uniform_occupancy")
+  .set_type_axes_names({"Key", "Value", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE);
\ No newline at end of file
diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh
index f9171ef77..b65e59451 100644
--- a/include/cuco/detail/static_map/kernels.cuh
+++ b/include/cuco/detail/static_map/kernels.cuh
@@ -30,6 +30,7 @@ namespace experimental {
 namespace static_map_ns {
 namespace detail {
+// TODO use insert_or_assign internally
 /**
  * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent to
  * `k` already exists in the container, assigns `v` to the mapped_type corresponding to the key `k`.
  * If the key does not exist, inserts the pair as if by insert.
@@ -67,6 +68,44 @@ __global__ void insert_or_assign(InputIt first, cuco::detail::index_type n, Ref ref)
   }
 }
 
+/**
+ * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent to
+ * `k` already exists in the container, applies `op` to the mapped value corresponding to `k` and
+ * the input payload `v`. If the key does not exist, inserts the pair as if by insert.
+ *
+ * @note If multiple elements in `[first, first + n)` compare equal, `op` is applied for each of
+ * them; the order in which the payloads are combined is unspecified.
+ *
+ * @tparam CGSize Number of threads in each CG
+ * @tparam BlockSize Number of threads in each block
+ * @tparam InputIt Device accessible input iterator whose `value_type` is
+ * convertible to the `value_type` of the data structure
+ * @tparam Op Type of the callable applied to the existing mapped value and the input payload
+ * @tparam Ref Type of non-owning device ref allowing access to storage
+ *
+ * @param first Beginning of the sequence of input elements
+ * @param n Number of input elements
+ * @param op Callable applied to the existing mapped value and the input payload
+ * @param ref Non-owning container device ref used to access the slot storage
+ */
+template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename Op, typename Ref>
+__global__ void insert_or_apply(InputIt first, cuco::detail::index_type n, Op op, Ref ref)
+{
+  auto const loop_stride = cuco::detail::grid_stride() / CGSize;
+  auto idx               = cuco::detail::global_thread_id() / CGSize;
+
+  while (idx < n) {
+    typename std::iterator_traits<InputIt>::value_type const& insert_pair = *(first + idx);
+    if constexpr (CGSize == 1) {
+      ref.insert_or_apply(insert_pair, op);
+    } else {
+      auto const tile =
+        cooperative_groups::tiled_partition<CGSize>(cooperative_groups::this_thread_block());
+      ref.insert_or_apply(tile, insert_pair, op);
+    }
+    idx += loop_stride;
+  }
+}
+
 /**
  * @brief Finds the equivalent map elements of all keys in the range `[first, first + n)`.
 *
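The kernel above simply forwards `op` to the device ref, which (further down in this diff) either dispatches the `op::reduce::sum` tag to an atomic add or invokes a generic callable on the slot's payload. Assuming user-defined reductions are meant to go through that generic `Op` path, a custom operation would plausibly follow the same call signature as the sum lambda, `void(T&, T const&)`. The sketch below is illustrative only and not part of this change; the functor name, the device thread scope, and the integral-type assumption are all editorial choices.

#include <cuda/atomic>

// Hedged sketch: a user-defined "take the maximum payload" reduction. The update must be
// atomic because several threads may apply payloads to the same slot concurrently.
// `fetch_max` requires an integral T.
template <typename T>
struct custom_max_op {
  __device__ void operator()(T& payload, T const& incoming) const noexcept
  {
    cuda::atomic_ref<T, cuda::thread_scope_device> ref{payload};
    ref.fetch_max(incoming, cuda::memory_order_relaxed);
  }
};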
diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl
index 9249d4fa1..80d7135e9 100644
--- a/include/cuco/detail/static_map/static_map.inl
+++ b/include/cuco/detail/static_map/static_map.inl
@@ -241,6 +241,44 @@ void static_map
+template <class Key,
+          class T,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+template <typename InputIt, typename Op>
+void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
+  insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream) noexcept
+{
+  this->insert_or_apply_async(first, last, op, stream);
+  stream.synchronize();
+}
+
+template <class Key,
+          class T,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+template <typename InputIt, typename Op>
+void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
+  insert_or_apply_async(InputIt first, InputIt last, Op op, cuda_stream_ref stream) noexcept
+{
+  auto const num = cuco::detail::distance(first, last);
+  if (num == 0) { return; }
+
+  auto const grid_size = cuco::detail::grid_size(num, cg_size);
+
+  static_map_ns::detail::insert_or_apply<cg_size, cuco::detail::default_block_size()>
+    <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+      first, num, op, ref(op::insert_or_apply));
+}
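The two host entry points added above are thin wrappers: `insert_or_apply` launches the bulk kernel and then blocks on the stream, while `insert_or_apply_async` only enqueues the work. A minimal host-side usage sketch, mirroring the benchmark added earlier in this diff, could look as follows; the function name, types, and sentinel choices are illustrative and not part of this change.

#include <cuco/static_map.cuh>
#include <thrust/device_vector.h>
#include <cstddef>

// Hedged usage sketch: count how many times each key occurs, assuming each input pair
// carries a payload of 1 and the empty-value sentinel is 0.
void count_occurrences(thrust::device_vector<cuco::pair<int, int>> const& pairs,
                       std::size_t capacity)
{
  cuco::experimental::static_map<int, int> map{
    capacity, cuco::empty_key<int>{-1}, cuco::empty_value<int>{0}};

  // Blocking variant: equivalent to insert_or_apply_async(...) followed by stream.synchronize().
  map.insert_or_apply(pairs.begin(), pairs.end(), cuco::experimental::op::reduce::sum);

  // After this call, each distinct key's payload holds the sum of all payloads inserted for it.
}

The device-side implementation that these wrappers dispatch to (the `operator_impl` specialization for `op::insert_or_apply_tag` on the map's device ref) follows next.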
+template <typename Key,
+          typename T,
+          cuda::thread_scope Scope,
+          typename KeyEqual,
+          typename ProbingScheme,
+          typename StorageRef,
+          typename... Operators>
+class operator_impl<
+  op::insert_or_apply_tag,
+  static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>> {
+  using base_type = static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef>;
+  using ref_type  = static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>;
+  using key_type   = typename base_type::key_type;
+  using value_type = typename base_type::value_type;
+
+  static constexpr auto cg_size     = base_type::cg_size;
+  static constexpr auto window_size = base_type::window_size;
+
+  static_assert(sizeof(T) == 4 or sizeof(T) == 8,
+                "sizeof(mapped_type) must be either 4 bytes or 8 bytes.");
+
+ public:
+  /**
+   * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, applies
+   * `op` to the mapped value corresponding to the key `k` and the input payload `v`.
+   *
+   * @tparam Value Input type which is implicitly convertible to 'value_type'
+   * @tparam Op Type of the callable applied to the existing mapped value and the input payload
+   *
+   * @param value The element to insert
+   * @param op Callable applied to the existing mapped value and the input payload
+   */
+  template <typename Value, typename Op>
+  __device__ void insert_or_apply(Value const& value, Op op) noexcept
+  {
+    static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
+
+    ref_type& ref_       = static_cast<ref_type&>(*this);
+    auto const key       = thrust::get<0>(value);
+    auto& probing_scheme = ref_.impl_.probing_scheme();
+    auto storage_ref     = ref_.impl_.storage_ref();
+    auto probing_iter    = probing_scheme(key, storage_ref.window_extent());
+
+    while (true) {
+      auto const window_slots = storage_ref[*probing_iter];
+
+      for (auto& slot_content : window_slots) {
+        auto const eq_res = ref_.impl_.predicate()(slot_content.first, key);
+
+        // If the key is already in the container, update the payload and return
+        if (eq_res == detail::equal_result::EQUAL) {
+          auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content);
+          op(((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second,
+             static_cast<T>(thrust::get<1>(value)));
+          return;
+        }
+        if (eq_res == detail::equal_result::EMPTY or
+            cuco::detail::bitwise_compare(slot_content.first, ref_.impl_.erased_key_sentinel())) {
+          auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content);
+          if (attempt_insert_or_apply(
+                (storage_ref.data() + *probing_iter)->data() + intra_window_index, value, op)) {
+            return;
+          }
+        }
+      }
+      ++probing_iter;
+    }
+  }
+
+  template <typename Value>
+  __device__ void insert_or_apply(Value const& value,
+                                  cuco::experimental::op::reduce::sum_tag) noexcept
+  {
+    auto& ref_ = static_cast<ref_type&>(*this);
+    ref_.insert_or_apply(value, [](T& slot, T const& payload) {
+      cuda::atomic_ref<T, Scope> slot_ref{slot};
+      slot_ref.fetch_add(payload, cuda::memory_order_relaxed);
+    });
+  }
+
+  /**
+   * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, applies
+   * `op` to the mapped value corresponding to the key `k` and the input payload `v`.
+   *
+   * @tparam Value Input type which is implicitly convertible to 'value_type'
+   * @tparam Op Type of the callable applied to the existing mapped value and the input payload
+   *
+   * @param group The Cooperative Group used to perform group insert
+   * @param value The element to insert
+   * @param op Callable applied to the existing mapped value and the input payload
+   */
+  template <typename Value, typename Op>
+  __device__ void insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
+                                  Value const& value,
+                                  Op op) noexcept
+  {
+    ref_type& ref_ = static_cast<ref_type&>(*this);
+
+    auto const key       = thrust::get<0>(thrust::raw_reference_cast(value));
+    auto& probing_scheme = ref_.impl_.probing_scheme();
+    auto storage_ref     = ref_.impl_.storage_ref();
+    auto probing_iter    = probing_scheme(group, key, storage_ref.window_extent());
+
+    while (true) {
+      auto const window_slots = storage_ref[*probing_iter];
+
+      auto const [state, intra_window_index] = [&]() {
+        for (auto i = 0; i < window_size; ++i) {
+          switch (ref_.impl_.predicate()(window_slots[i].first, key)) {
+            case detail::equal_result::EMPTY:
+              return detail::window_probing_results{detail::equal_result::EMPTY, i};
+            case detail::equal_result::EQUAL:
+              return detail::window_probing_results{detail::equal_result::EQUAL, i};
+            default: {
+              if (cuco::detail::bitwise_compare(window_slots[i].first,
+                                                ref_.impl_.erased_key_sentinel())) {
+                return detail::window_probing_results{detail::equal_result::ERASED, i};
+              } else {
+                continue;
+              }
+            }
+          }
+        }
+        // returns dummy index `-1` for UNEQUAL
+        return detail::window_probing_results{detail::equal_result::UNEQUAL, -1};
+      }();
+
+      auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL);
+      if (group_contains_equal) {
+        auto const src_lane = __ffs(group_contains_equal) - 1;
+        if (group.thread_rank() == src_lane) {
+          op(((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second,
+             static_cast<T>(thrust::get<1>(value)));
+        }
+        group.sync();
+        return;
+      }
+
+      auto const group_contains_available =
+        group.ballot(state == detail::equal_result::EMPTY or state == detail::equal_result::ERASED);
+      if (group_contains_available) {
+        auto const src_lane = __ffs(group_contains_available) - 1;
+        auto const status =
+          (group.thread_rank() == src_lane)
+            ? attempt_insert_or_apply(
+                (storage_ref.data() + *probing_iter)->data() + intra_window_index, value, op)
+            : false;
+
+        // Exit if the pair was inserted or the payload was updated
+        if (group.shfl(status, src_lane)) { return; }
+      } else {
+        ++probing_iter;
+      }
+    }
+  }
+
+  template <typename Value>
+  __device__ void insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
+                                  Value const& value,
+                                  cuco::experimental::op::reduce::sum_tag) noexcept
+  {
+    auto& ref_ = static_cast<ref_type&>(*this);
+    ref_.insert_or_apply(group, value, [](T& slot, T const& payload) {
+      cuda::atomic_ref<T, Scope> slot_ref{slot};
+      slot_ref.fetch_add(payload, cuda::memory_order_relaxed);
+    });
+  }
+
+ private:
+  /**
+   * @brief Attempts to insert an element into the given slot, or, if the slot already holds a
+   * matching key, applies `op` to the slot's payload and the input payload.
+   *
+   * @tparam Value Input type which is implicitly convertible to 'value_type'
+   * @tparam Op Type of the callable applied to the existing mapped value and the input payload
+   *
+   * @param slot Pointer to the target slot
+   * @param value The element to insert
+   * @param op Callable applied to the existing mapped value and the input payload
+   *
+   * @return Returns `true` if `value` was inserted into the slot or the slot already contained a
+   * matching key and `op` was applied; returns `false` otherwise.
+   */
+  template <typename Value, typename Op>
+  __device__ constexpr bool attempt_insert_or_apply(value_type* slot,
+                                                    Value const& value,
+                                                    Op op) noexcept
+  {
+    ref_type& ref_          = static_cast<ref_type&>(*this);
+    auto const expected_key = ref_.impl_.empty_slot_sentinel().first;
+
+    auto old_key = ref_.impl_.compare_and_swap(
+      &slot->first, expected_key, static_cast<key_type>(thrust::get<0>(value)));
+    auto* old_key_ptr = reinterpret_cast<key_type*>(&old_key);
+
+    // if the key was inserted by this call or was already present in the map
+    if (cuco::detail::bitwise_compare(*old_key_ptr, expected_key) or
+        (ref_.impl_.predicate().equal_to(*old_key_ptr,
+                                         thrust::get<0>(thrust::raw_reference_cast(value))) ==
+         detail::equal_result::EQUAL)) {
+      // Update payload
+      op(slot->second, static_cast<T>(thrust::get<1>(value)));
+      return true;
+    }
+    return false;
+  }
+};
 
   template <typename InputIt>
   void insert_or_assign_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept;
 
+  /**
+   * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to
+   * `k` already exists in the container, applies `op` to the existing mapped value and `v`.
+   * If the key does not exist, inserts the pair as if by insert.
+   *
+   * @tparam InputIt Device accessible random access input iterator
+   * @tparam Op Type of the callable applied to the existing mapped value and the input payload
+   *
+   * @param first Beginning of the sequence of key-value pairs
+   * @param last End of the sequence of key-value pairs
+   * @param op Callable applied to the existing mapped value and the input payload
+   * @param stream CUDA stream used for insert_or_apply
+   */
+  template <typename InputIt, typename Op>
+  void insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream = {}) noexcept;
+
+  /**
+   * @brief Asynchronously performs `insert_or_apply` on the range `[first, last)`.
+   *
+   * @tparam InputIt Device accessible random access input iterator
+   * @tparam Op Type of the callable applied to the existing mapped value and the input payload
+   *
+   * @param first Beginning of the sequence of key-value pairs
+   * @param last End of the sequence of key-value pairs
+   * @param op Callable applied to the existing mapped value and the input payload
+   * @param stream CUDA stream used for insert_or_apply
+   */
+  template <typename InputIt, typename Op>
+  void insert_or_apply_async(InputIt first,
+                             InputIt last,
+                             Op op,
+                             cuda_stream_ref stream = {}) noexcept;
+
   /**
    * @brief Erases keys in the range `[first, last)`.
    *
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 916e0ea42..c44de4037 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -77,6 +77,7 @@ ConfigureTest(STATIC_MAP_TEST
   static_map/heterogeneous_lookup_test.cu
   static_map/insert_and_find_test.cu
   static_map/insert_or_assign_test.cu
+  static_map/insert_or_apply_test.cu
   static_map/key_sentinel_test.cu
   static_map/shared_memory_test.cu
   static_map/stream_test.cu
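The new test registered above exercises the sum reduction end to end: it inserts num_keys = 400 pairs whose keys cycle through num_unique_keys = 100 distinct values, each carrying a payload of 1. With `op::reduce::sum` and an empty-value sentinel of 0, every distinct key should therefore accumulate 400 / 100 = 4, the map size should equal 100, and `retrieve_all` should return a payload of 4 for every key, which is exactly what the REQUIRE checks below assert.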
diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu
new file mode 100644
index 000000000..a7cbff438
--- /dev/null
+++ b/tests/static_map/insert_or_apply_test.cu
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+#include
+
+using size_type = std::size_t;
+
+template <typename Map>
+__inline__ void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_keys)
+{
+  REQUIRE((num_keys % num_unique_keys) == 0);
+
+  using key_type    = typename Map::key_type;
+  using mapped_type = typename Map::mapped_type;
+
+  auto keys_begin = thrust::make_transform_iterator(
+    thrust::counting_iterator<key_type>(0),
+    [num_unique_keys] __host__ __device__(key_type const& x) -> key_type {
+      return x % num_unique_keys;
+    });
+
+  auto values_begin = thrust::make_constant_iterator(1);
+
+  auto pairs_begin = thrust::make_zip_iterator(thrust::make_tuple(keys_begin, values_begin));
+
+  map.insert_or_apply(pairs_begin, pairs_begin + num_keys, cuco::experimental::op::reduce::sum);
+
+  REQUIRE(map.size() == num_unique_keys);
+
+  thrust::device_vector<key_type> d_keys(num_unique_keys);
+  thrust::device_vector<mapped_type> d_values(num_unique_keys);
+  map.retrieve_all(d_keys.begin(), d_values.begin());
+
+  REQUIRE(cuco::test::equal(d_values.begin(),
+                            d_values.end(),
+                            thrust::make_constant_iterator(num_keys / num_unique_keys),
+                            thrust::equal_to<mapped_type>{}));
+}
+
+TEMPLATE_TEST_CASE_SIG(
+  "Insert or apply",
+  "",
+  ((typename Key, typename Value, cuco::test::probe_sequence Probe, int CGSize),
+   Key,
+   Value,
+   Probe,
+   CGSize),
+  (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2),
+  (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 2),
+  (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2),
+  (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2))
+{
+  constexpr size_type num_keys{400};
+  constexpr size_type num_unique_keys{100};
+
+  using probe = std::conditional_t<
+    Probe == cuco::test::probe_sequence::linear_probing,
+    cuco::experimental::linear_probing<CGSize, cuco::murmurhash3_32<Key>>,
+    cuco::experimental::double_hashing<CGSize, cuco::murmurhash3_32<Key>>>;
+
+  auto map = cuco::experimental::static_map<Key,
+                                            Value,
+                                            cuco::experimental::extent<size_type>,
+                                            cuda::thread_scope_device,
+                                            thrust::equal_to<Key>,
+                                            probe,
+                                            cuco::cuda_allocator<std::byte>,
+                                            cuco::experimental::storage<2>>{
+    num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+
+  test_insert_or_apply(map, num_keys, num_unique_keys);
+}
\ No newline at end of file
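The helper above parameterizes the duplicate count, so the no-duplicate path (every payload stays at 1) could be covered with one extra call inside the same test case body. The sketch below is illustrative and not part of this diff; `unique_map` and the doubled capacity are editorial choices.

  // Hedged sketch: with num_unique_keys == num_keys there are no duplicates, so every
  // retrieved payload should equal 400 / 400 = 1. Doubling the capacity keeps occupancy moderate.
  using map_type  = decltype(map);
  auto unique_map = map_type{2 * num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
  test_insert_or_apply(unique_map, num_keys, num_keys);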