diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 0e7411922..ee0a5244f 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -28,6 +28,8 @@ if("${GPU_ARCHS}" STREQUAL "") evaluate_gpu_archs(GPU_ARCHS) endif() +message("GPU_ARCHS = ${GPU_ARCHS}") + ################################################################################################### # - compiler function ----------------------------------------------------------------------------- @@ -35,7 +37,7 @@ function(ConfigureBench BENCH_NAME BENCH_SRC) add_executable(${BENCH_NAME} "${BENCH_SRC}") set_target_properties(${BENCH_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON - CUDA_ARCHITECTURES ${GPU_ARCHS} + CUDA_ARCHITECTURES "${GPU_ARCHS}" RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/gbenchmarks") target_include_directories(${BENCH_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") @@ -58,6 +60,9 @@ ConfigureBench(DYNAMIC_MAP_BENCH "${DYNAMIC_MAP_BENCH_SRC}") set(STATIC_MAP_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_map_bench.cu") ConfigureBench(STATIC_MAP_BENCH "${STATIC_MAP_BENCH_SRC}") +################################################################################################### +ConfigureBench(STATIC_REDUCTION_MAP_BENCH "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_reduction_map_bench.cu") + ################################################################################################### set(RBK_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reduce_by_key/reduce_by_key.cu") ConfigureBench(RBK_BENCH "${RBK_BENCH_SRC}") diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu index 165465518..563769df6 100644 --- a/benchmarks/hash_table/static_map_bench.cu +++ b/benchmarks/hash_table/static_map_bench.cu @@ -15,40 +15,38 @@ */ #include -#include "cuco/static_map.cuh" -#include #include -#include +#include +#include #include +#include #include +#include "cuco/static_map.cuh" -enum class dist_type { - UNIQUE, - 
UNIFORM, - GAUSSIAN -}; +enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; -template -static void generate_keys(OutputIt output_begin, OutputIt output_end) { +template +static void generate_keys(OutputIt output_begin, OutputIt output_end) +{ auto num_keys = std::distance(output_begin, output_end); - + std::random_device rd; std::mt19937 gen{rd()}; - switch(Dist) { + switch (Dist) { case dist_type::UNIQUE: - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = i; } break; case dist_type::UNIFORM: - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = std::abs(static_cast(gen())); } break; case dist_type::GAUSSIAN: std::normal_distribution<> dg{1e9, 1e7}; - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = std::abs(static_cast(dg(gen))); } break; @@ -59,88 +57,84 @@ static void generate_keys(OutputIt output_begin, OutputIt output_end) { * @brief Generates input sizes and hash table occupancies * */ -static void generate_size_and_occupancy(benchmark::internal::Benchmark* b) { - for (auto size = 100'000'000; size <= 100'000'000; size *= 10) { - for (auto occupancy = 10; occupancy <= 90; occupancy += 10) { +static void generate_size_and_occupancy(benchmark::internal::Benchmark* b) +{ + for (auto size = 4096; size <= 1 << 28; size *= 2) { + for (auto occupancy = 60; occupancy <= 60; occupancy += 10) { b->Args({size, occupancy}); } } } - - template -static void BM_static_map_insert(::benchmark::State& state) { +static void BM_static_map_insert(::benchmark::State& state) +{ using map_type = cuco::static_map; - + std::size_t num_keys = state.range(0); - float occupancy = state.range(1) / float{100}; - std::size_t size = num_keys / occupancy; + float occupancy = state.range(1) / float{100}; + std::size_t size = num_keys / occupancy; + + std::vector h_keys(num_keys); + std::vector> h_pairs(num_keys); - std::vector h_keys( num_keys ); - 
std::vector> h_pairs( num_keys ); - generate_keys(h_keys.begin(), h_keys.end()); - - for(auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; + + for (auto i = 0; i < num_keys; ++i) { + Key key = h_keys[i]; + Value val = h_keys[i]; + h_pairs[i].first = key; h_pairs[i].second = val; } - thrust::device_vector> d_pairs( h_pairs ); + thrust::device_vector> d_pairs(h_pairs); - for(auto _ : state) { - state.ResumeTiming(); - state.PauseTiming(); + for (auto _ : state) { map_type map{size, -1, -1}; - state.ResumeTiming(); - - map.insert(d_pairs.begin(), d_pairs.end()); - state.PauseTiming(); + { + cuda_event_timer raii{state}; + map.insert(d_pairs.begin(), d_pairs.end()); + } } - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * - int64_t(state.iterations()) * + state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * int64_t(state.range(0))); } - - template -static void BM_static_map_search_all(::benchmark::State& state) { +static void BM_static_map_search_all(::benchmark::State& state) +{ using map_type = cuco::static_map; - + std::size_t num_keys = state.range(0); - float occupancy = state.range(1) / float{100}; - std::size_t size = num_keys / occupancy; + float occupancy = state.range(1) / float{100}; + std::size_t size = num_keys / occupancy; map_type map{size, -1, -1}; auto view = map.get_device_mutable_view(); - std::vector h_keys( num_keys ); - std::vector h_values( num_keys ); - std::vector> h_pairs ( num_keys ); - std::vector h_results (num_keys); + std::vector h_keys(num_keys); + std::vector h_values(num_keys); + std::vector> h_pairs(num_keys); + std::vector h_results(num_keys); generate_keys(h_keys.begin(), h_keys.end()); - - for(auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; + + for (auto i = 0; i < num_keys; ++i) { + Key key = h_keys[i]; + Value val = h_keys[i]; + h_pairs[i].first = key; h_pairs[i].second = val; 
} - thrust::device_vector d_keys( h_keys ); - thrust::device_vector d_results( num_keys); - thrust::device_vector> d_pairs( h_pairs ); + thrust::device_vector d_keys(h_keys); + thrust::device_vector d_results(num_keys); + thrust::device_vector> d_pairs(h_pairs); map.insert(d_pairs.begin(), d_pairs.end()); - - for(auto _ : state) { + + for (auto _ : state) { map.find(d_keys.begin(), d_keys.end(), d_results.begin()); } @@ -148,52 +142,62 @@ static void BM_static_map_search_all(::benchmark::State& state) { int64_t(state.range(0))); } - - BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIFORM) 
->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); \ No newline at end of file diff --git a/benchmarks/hash_table/static_reduction_map_bench.cu b/benchmarks/hash_table/static_reduction_map_bench.cu new file mode 100644 index 000000000..92a2ab788 --- /dev/null +++ b/benchmarks/hash_table/static_reduction_map_bench.cu @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include "cuco/static_reduction_map.cuh" + +enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; + +template +static void generate_keys(OutputIt output_begin, OutputIt output_end) +{ + auto num_keys = std::distance(output_begin, output_end); + + std::random_device rd; + std::mt19937 gen{rd()}; + + switch (Dist) { + case dist_type::UNIQUE: + for (auto i = 0; i < num_keys; ++i) { + output_begin[i] = i; + } + break; + case dist_type::UNIFORM: + for (auto i = 0; i < num_keys; ++i) { + output_begin[i] = std::abs(static_cast(gen())); + } + break; + case dist_type::GAUSSIAN: + std::normal_distribution<> dg{1e9, 1e7}; + for (auto i = 0; i < num_keys; ++i) { + output_begin[i] = std::abs(static_cast(dg(gen))); + } + break; + } +} + +/** + * @brief Generates input sizes and hash table occupancies + * + */ +static void generate_size_and_occupancy(benchmark::internal::Benchmark* b) +{ + for (auto size = 4096; size <= 1 << 28; size *= 2) { + for (auto occupancy = 60; occupancy <= 60; occupancy += 10) { + b->Args({size, occupancy}); + } + } +} + +template typename ReductionOp> +static void BM_static_map_insert(::benchmark::State& state) +{ + using map_type = cuco::static_reduction_map, Key, Value>; + + std::size_t num_keys = state.range(0); + float occupancy = state.range(1) / float{100}; + std::size_t size = num_keys / occupancy; + + std::vector h_keys(num_keys); + std::vector> h_pairs(num_keys); + + generate_keys(h_keys.begin(), h_keys.end()); + + thrust::device_vector d_keys(h_keys); + thrust::device_vector d_values(h_keys); + + auto pairs_begin = + thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); + auto pairs_end = pairs_begin + num_keys; + + for (auto _ : state) { + map_type map{size, -1}; + { + cuda_event_timer raii{state}; + map.insert(pairs_begin, pairs_end); + } + } + + state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * + 
int64_t(state.range(0))); +} + +BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIQUE, cuco::reduce_add) + ->Unit(benchmark::kMillisecond) + ->UseManualTime() + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIFORM, cuco::reduce_add) + ->Unit(benchmark::kMillisecond) + ->UseManualTime() + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::GAUSSIAN, cuco::reduce_add) + ->Unit(benchmark::kMillisecond) + ->UseManualTime() + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIQUE, cuco::reduce_add) + ->Unit(benchmark::kMillisecond) + ->UseManualTime() + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIFORM, cuco::reduce_add) + ->Unit(benchmark::kMillisecond) + ->UseManualTime() + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::GAUSSIAN, cuco::reduce_add) + ->Unit(benchmark::kMillisecond) + ->UseManualTime() + ->Apply(generate_size_and_occupancy); \ No newline at end of file diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a70b53da8..be1a760e6 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -15,7 +15,7 @@ endif() function(ConfigureExample EXAMPLE_NAME EXAMPLE_SRC) add_executable(${EXAMPLE_NAME} "${EXAMPLE_SRC}") set_target_properties(${EXAMPLE_NAME} PROPERTIES - CUDA_ARCHITECTURES ${GPU_ARCHS} + CUDA_ARCHITECTURES "${GPU_ARCHS}" RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/examples") target_include_directories(${EXAMPLE_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") @@ -28,3 +28,5 @@ endfunction(ConfigureExample) ################################################################################################### ConfigureExample(STATIC_MAP_EXAMPLE 
"${CMAKE_CURRENT_SOURCE_DIR}/static_map/static_map_example.cu") + +ConfigureExample(STATIC_REDUCTION_MAP_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_reduction_map.cu") diff --git a/examples/static_reduction_map.cu b/examples/static_reduction_map.cu new file mode 100644 index 000000000..f152ceb78 --- /dev/null +++ b/examples/static_reduction_map.cu @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +/** + * @file host_bulk_example.cu + * @brief Demonstrates usage of the static_map "bulk" host APIs. + * + * The bulk APIs are only invocable from the host and are used for doing operations like insert or + * find on a set of keys. + * + */ + +int main(void) +{ + using Key = int; + using Value = int; + + // Empty slots are represented by reserved "sentinel" values. These values should be selected such + // that they never occur in your input data. + Key const empty_key_sentinel = -1; + + // Number of key/value pairs to be inserted + std::size_t num_keys = 257; + + // Compute capacity based on a 50% load factor + auto const load_factor = 0.5; + std::size_t const capacity = std::ceil(num_keys / load_factor); + + // Constructs a map each key with "capacity" slots using -1 as the + // empty key sentinel. 
The initial payload value for empty slots is determined by the identity of + // the reduction operation. By using the `reduce_add` operation, all values associated with a + // given key will be summed. + cuco::static_reduction_map, Key, Value> map{capacity, empty_key_sentinel}; + + // Create a sequence of random keys in `[0, num_keys/2]` + thrust::device_vector insert_keys(num_keys); + thrust::transform(thrust::device, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(insert_keys.size()), + insert_keys.begin(), + [=] __device__(auto i) { + thrust::default_random_engine rng; + thrust::uniform_int_distribution dist{0, 10}; + rng.discard(i); + return dist(rng); + }); + + // Insert each key with a payload of `1` to count the number of times each key was inserted by + // using the `reduce_add` op + auto zipped = thrust::make_zip_iterator( + thrust::make_tuple(insert_keys.begin(), thrust::make_constant_iterator(1))); + + // Inserts all pairs into the map, accumulating the payloads with the `reduce_add` operation + map.insert(zipped, zipped + insert_keys.size()); + + std::cout << "Num unique keys: " << map.get_size() << std::endl; + + thrust::device_vector unique_keys(map.get_size()); + thrust::device_vector count_per_key(map.get_size()); + + map.retrieve_all(unique_keys.begin(), count_per_key.begin()); + + for (int i = 0; i < unique_keys.size(); ++i) { + std::cout << "Key: " << unique_keys[i] << " Count: " << count_per_key[i] << std::endl; + } +} \ No newline at end of file diff --git a/include/cuco/detail/pair.cuh b/include/cuco/detail/pair.cuh index da50f7258..dfdf7632e 100644 --- a/include/cuco/detail/pair.cuh +++ b/include/cuco/detail/pair.cuh @@ -68,8 +68,8 @@ struct is_thrust_pair_like_impl : std::false_type { template struct is_thrust_pair_like_impl(std::declval())), - decltype(thrust::get<1>(std::declval()))>> + std::void_t(std::declval())), + decltype(thrust::get<1>(std::declval()))>> : std::conditional_t::value == 2, std::true_type, 
std::false_type> { }; diff --git a/include/cuco/detail/static_reduction_map.inl b/include/cuco/detail/static_reduction_map.inl new file mode 100644 index 000000000..bd9907ebc --- /dev/null +++ b/include/cuco/detail/static_reduction_map.inl @@ -0,0 +1,460 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +namespace cuco { + +/**---------------------------------------------------------------------------* + * @brief Enumeration of the possible results of attempting to insert into + *a hash bucket + *---------------------------------------------------------------------------**/ +enum class insert_result { + CONTINUE, ///< Insert did not succeed, continue trying to insert + SUCCESS, ///< New pair inserted successfully + DUPLICATE ///< Insert did not succeed, key is already present +}; + +template +static_reduction_map::static_reduction_map( + std::size_t capacity, Key empty_key_sentinel, ReductionOp reduction_op, Allocator const& alloc) + : capacity_{capacity}, + empty_key_sentinel_{empty_key_sentinel}, + empty_value_sentinel_{ReductionOp::identity}, + op_{reduction_op}, + slot_allocator_{alloc} +{ + slots_ = std::allocator_traits::allocate(slot_allocator_, capacity); + + auto constexpr block_size = 256; + auto constexpr stride = 4; + auto const grid_size = (capacity + stride * block_size - 1) / (stride * block_size); + detail::initialize<<>>( + slots_, get_empty_key_sentinel(), get_empty_value_sentinel(), 
get_capacity()); + + CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type))); +} + +template +static_reduction_map::~static_reduction_map() +{ + std::allocator_traits::deallocate(slot_allocator_, slots_, capacity_); + CUCO_CUDA_TRY(cudaFree(num_successes_)); +} + +template +template +void static_reduction_map::insert(InputIt first, + InputIt last, + Hash hash, + KeyEqual key_equal) +{ + auto num_keys = std::distance(first, last); + auto const block_size = 128; + auto const stride = 1; + auto const tile_size = 4; + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + auto view = get_device_mutable_view(); + + *num_successes_ = 0; + int device_id; + CUCO_CUDA_TRY(cudaGetDevice(&device_id)); + CUCO_CUDA_TRY(cudaMemPrefetchAsync(num_successes_, sizeof(atomic_ctr_type), device_id)); + + detail::insert + <<>>(first, first + num_keys, num_successes_, view, hash, key_equal); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + + size_ += num_successes_->load(cuda::std::memory_order_relaxed); +} + +template +template +void static_reduction_map::find( + InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) noexcept +{ + auto num_keys = std::distance(first, last); + auto const block_size = 128; + auto const stride = 1; + auto const tile_size = 4; + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + auto view = get_device_view(); + + detail::find + <<>>(first, last, output_begin, view, hash, key_equal); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); +} + +namespace detail { +template +struct slot_to_tuple { + template + __device__ thrust::tuple operator()(S const& s) + { + return thrust::tuple(s.first, s.second); + } +}; + +template +struct slot_is_filled { + Key empty_key_sentinel; + template + __device__ bool operator()(S const& s) + { + return thrust::get<0>(s) != empty_key_sentinel; + } +}; +} // namespace detail + +template +template +void 
static_reduction_map::retrieve_all( + KeyOut keys_out, ValueOut values_out) +{ + // Convert pair_type to thrust::tuple to allow assigning to a zip iterator + auto begin = thrust::make_transform_iterator(raw_slots_begin(), detail::slot_to_tuple{}); + auto end = begin + get_capacity(); + auto filled = detail::slot_is_filled{get_empty_key_sentinel()}; + auto zipped_out = thrust::make_zip_iterator(thrust::make_tuple(keys_out, values_out)); + + thrust::copy_if(thrust::device, begin, end, zipped_out, filled); +} + +template +template +void static_reduction_map::contains( + InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) noexcept +{ + auto num_keys = std::distance(first, last); + auto const block_size = 128; + auto const stride = 1; + auto const tile_size = 4; + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + auto view = get_device_view(); + + detail::contains + <<>>(first, last, output_begin, view, hash, key_equal); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); +} + +template +template +__device__ Value +static_reduction_map::device_mutable_view::insert( + value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept +{ + auto current_slot{initial_slot(insert_pair.first, hash)}; + + while (true) { + using cuda::std::memory_order_relaxed; + auto expected_key = this->get_empty_key_sentinel(); + auto expected_value = this->get_empty_value_sentinel(); + auto& slot_key = current_slot->first; + auto& slot_value = current_slot->second; + + auto const key_success = + slot_key.compare_exchange_strong(expected_key, insert_pair.first, memory_order_relaxed); + + if (key_success or key_equal(insert_pair.first, expected_key)) { + return this->get_op().apply(slot_value, insert_pair.second); + } + + // if we couldn't insert the key, but it wasn't a duplicate, then there must + // have been some other key there, so we keep looking for a slot + current_slot = next_slot(current_slot); + } +} + 
+template +template +__device__ bool +static_reduction_map::device_mutable_view::insert( + CG g, value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept +{ + auto current_slot = initial_slot(g, insert_pair.first, hash); + + while (true) { + auto& slot_key = current_slot->first; + auto& slot_value = current_slot->second; + auto const current_key = slot_key.load(cuda::std::memory_order_relaxed); + + // The user provided `key_equal` should never be used to compare against `empty_key_sentinel` as + // the sentinel is not a valid key value. Therefore, first check for the sentinel + // TODO: Use memcmp + auto const slot_is_empty = (current_key == this->get_empty_key_sentinel()); + + auto const key_exists = not slot_is_empty and key_equal(current_key, insert_pair.first); + + // Key already exists, aggregate with it's value + if (key_exists) { this->get_op().apply(slot_value, insert_pair.second); } + + // If key already exists in the CG window, all threads exit + if (g.ballot(key_exists)) { return false; } + + auto const window_empty_mask = g.ballot(slot_is_empty); + + if (window_empty_mask) { + // the first lane in the group with an empty slot will attempt the insert + auto const src_lane = __ffs(window_empty_mask) - 1; + + auto const attempt_update = [&]() { + auto expected_key = this->get_empty_key_sentinel(); + + auto const key_success = slot_key.compare_exchange_strong( + expected_key, insert_pair.first, cuda::memory_order_relaxed); + + if (key_success or key_equal(insert_pair.first, expected_key)) { + this->get_op().apply(slot_value, insert_pair.second); + return key_success ? insert_result::SUCCESS : insert_result::DUPLICATE; + } + return insert_result::CONTINUE; + }; + + auto const update_result = + (g.thread_rank() == src_lane) ? 
attempt_update() : insert_result::CONTINUE; + + auto const window_result = g.shfl(update_result, src_lane); + + // If the update succeeded, the thread group exits + if (window_result != insert_result::CONTINUE) { + return (window_result == insert_result::SUCCESS); + } + + // A different key took the current slot. Look for an empty slot in the current window + } else { + // No empty slots in the current window, move onto the next window + current_slot = next_slot(g, current_slot); + } + } +} + +template +template +__device__ + typename static_reduction_map::device_view::iterator + static_reduction_map::device_view::find( + Key const& k, Hash hash, KeyEqual key_equal) noexcept +{ + auto current_slot = initial_slot(k, hash); + + while (true) { + auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); + // Key doesn't exist, return end() + if (existing_key == this->get_empty_key_sentinel()) { return this->end(); } + + // Key exists, return iterator to location + if (key_equal(existing_key, k)) { return current_slot; } + + current_slot = next_slot(current_slot); + } +} + +template +template +__device__ typename static_reduction_map::device_view:: + const_iterator + static_reduction_map::device_view::find( + Key const& k, Hash hash, KeyEqual key_equal) const noexcept +{ + auto current_slot = initial_slot(k, hash); + + while (true) { + auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); + // Key doesn't exist, return end() + if (existing_key == this->get_empty_key_sentinel()) { return this->end(); } + + // Key exists, return iterator to location + if (key_equal(existing_key, k)) { return current_slot; } + + current_slot = next_slot(current_slot); + } +} + +template +template +__device__ + typename static_reduction_map::device_view::iterator + static_reduction_map::device_view::find( + CG g, Key const& k, Hash hash, KeyEqual key_equal) noexcept +{ + auto current_slot = initial_slot(g, k, hash); + + while (true) 
{ + auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); + + // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as + // the sentinel is not a valid key value. Therefore, first check for the sentinel + auto const slot_is_empty = (existing_key == this->get_empty_key_sentinel()); + + // the key we were searching for was found by one of the threads, + // so we return an iterator to the entry + auto const exists = g.ballot(not slot_is_empty and key_equal(existing_key, k)); + if (exists) { + uint32_t src_lane = __ffs(exists) - 1; + // TODO: This shouldn't cast an iterator to an int to shuffle. Instead, get the index of the + // current_slot and shuffle that instead. + intptr_t res_slot = g.shfl(reinterpret_cast(current_slot), src_lane); + return reinterpret_cast(res_slot); + } + + // we found an empty slot, meaning that the key we're searching for isn't present + if (g.ballot(slot_is_empty)) { return this->end(); } + + // otherwise, all slots in the current window are full with other keys, so we move onto the + // next window + current_slot = next_slot(g, current_slot); + } +} + +template +template +__device__ typename static_reduction_map::device_view:: + const_iterator + static_reduction_map::device_view::find( + CG g, Key const& k, Hash hash, KeyEqual key_equal) const noexcept +{ + auto current_slot = initial_slot(g, k, hash); + + while (true) { + auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); + + // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as + // the sentinel is not a valid key value. 
Therefore, first check for the sentinel + auto const slot_is_empty = (existing_key == this->get_empty_key_sentinel()); + + // the key we were searching for was found by one of the threads, so we return an iterator to + // the entry + auto const exists = g.ballot(not slot_is_empty and key_equal(existing_key, k)); + if (exists) { + uint32_t src_lane = __ffs(exists) - 1; + // TODO: This shouldn't cast an iterator to an int to shuffle. Instead, get the index of the + // current_slot and shuffle that instead. + intptr_t res_slot = g.shfl(reinterpret_cast(current_slot), src_lane); + return reinterpret_cast(res_slot); + } + + // we found an empty slot, meaning that the key we're searching + // for isn't in this submap, so we should move onto the next one + if (g.ballot(slot_is_empty)) { return this->end(); } + + // otherwise, all slots in the current window are full with other keys, + // so we move onto the next window in the current submap + + current_slot = next_slot(g, current_slot); + } +} + +template +template +__device__ bool +static_reduction_map::device_view::contains( + Key const& k, Hash hash, KeyEqual key_equal) noexcept +{ + auto current_slot = initial_slot(k, hash); + + while (true) { + auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); + + if (existing_key == empty_key_sentinel_) { return false; } + + if (key_equal(existing_key, k)) { return true; } + + current_slot = next_slot(current_slot); + } +} + +template +template +__device__ bool +static_reduction_map::device_view::contains( + CG g, Key const& k, Hash hash, KeyEqual key_equal) noexcept +{ + auto current_slot = initial_slot(g, k, hash); + + while (true) { + key_type const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); + + // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as + // the sentinel is not a valid key value. 
Therefore, first check for the sentinel + auto const slot_is_empty = (existing_key == this->get_empty_key_sentinel()); + + // the key we were searching for was found by one of the threads, so we return an iterator to + // the entry + if (g.ballot(not slot_is_empty and key_equal(existing_key, k))) { return true; } + + // we found an empty slot, meaning that the key we're searching for isn't present + if (g.ballot(slot_is_empty)) { return false; } + + // otherwise, all slots in the current window are full with other keys, so we move onto the + // next window + current_slot = next_slot(g, current_slot); + } +} +} // namespace cuco diff --git a/include/cuco/detail/static_reduction_map_kernels.cuh b/include/cuco/detail/static_reduction_map_kernels.cuh new file mode 100644 index 000000000..9849efb44 --- /dev/null +++ b/include/cuco/detail/static_reduction_map_kernels.cuh @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +namespace cuco { +namespace detail { +namespace cg = cooperative_groups; + +/** + * @brief Initializes each slot in the flat `slots` storage to contain `k` and `v`. + * + * Each space in `slots` that can hold a key value pair is initialized to a + * `pair_atomic_type` containing the key `k` and the value `v`. 
+ * + * @tparam atomic_key_type Type of the `Key` atomic container + * @tparam atomic_mapped_type Type of the `Value` atomic container + * @tparam Key key type + * @tparam Value value type + * @tparam pair_atomic_type key/value pair type + * @param slots Pointer to flat storage for the map's key/value pairs + * @param k Key to which all keys in `slots` are initialized + * @param v Value to which all values in `slots` are initialized + * @param size Size of the storage pointed to by `slots` + */ +template +__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, std::size_t size) +{ + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + while (tid < size) { + new (&slots[tid].first) atomic_key_type{k}; + new (&slots[tid].second) atomic_mapped_type{v}; + tid += gridDim.x * blockDim.x; + } +} + +/** + * @brief Inserts all key/value pairs in the range `[first, last)`. + * + * If multiple keys in `[first, last)` compare equal, it is unspecified which + * element is inserted. + * + * @tparam block_size + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `value_type` + * @tparam atomicT Type of atomic storage + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param num_successes The number of successfully inserted key/value pairs + * @param view Mutable device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function used to compare two keys for equality + */ +template +__global__ void insert( + InputIt first, InputIt last, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + 
std::size_t thread_num_successes = 0; + + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto it = first + tid; + + while (it < last) { + typename viewT::value_type const insert_pair{*it}; + if (view.insert(insert_pair, hash, key_equal)) { thread_num_successes++; } + it += gridDim.x * blockDim.x; + } + + // compute number of successfully inserted elements for each block + // and atomically add to the grand total + std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { *num_successes += block_num_successes; } +} + +/** + * @brief Inserts all key/value pairs in the range `[first, last)`. + * + * If multiple keys in `[first, last)` compare equal, it is unspecified which + * element is inserted. Uses the CUDA Cooperative Groups API to leverage groups + * of multiple threads to perform each key/value insertion. This provides a + * significant boost in throughput compared to the non Cooperative Group + * `insert` at moderate to high load factors. 
+ * + * @tparam block_size + * @tparam tile_size The number of threads in the Cooperative Groups used to perform + * inserts + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `value_type` + * @tparam atomicT Type of atomic storage + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param num_successes The number of successfully inserted key/value pairs + * @param view Mutable device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function used to compare two keys for equality + */ +template +__global__ void insert( + InputIt first, InputIt last, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + std::size_t thread_num_successes = 0; + + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto it = first + tid / tile_size; + + while (it < last) { + // force conversion to value_type + typename viewT::value_type const insert_pair{ + static_cast(thrust::get<0>(*it)), + static_cast(thrust::get<1>(*it))}; + + if (view.insert(tile, insert_pair, hash, key_equal) && tile.thread_rank() == 0) { + thread_num_successes++; + } + it += (gridDim.x * blockDim.x) / tile_size; + } + + // compute number of successfully inserted elements for each block + // and atomically add to the grand total + std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { *num_successes += block_num_successes; } +} + +/** + * @brief Finds the values corresponding to all keys in the range `[first, last)`. 
+ * + * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`. + * Else, copies the empty value sentinel. + * @tparam block_size The size of the thread block + * @tparam Value The type of the mapped value for the map + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam OutputIt Device accessible output iterator whose `value_type` is + * convertible to the map's `mapped_type` + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of values retrieved for each key + * @param view Device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ +template +__global__ void find( + InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) +{ + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid; + __shared__ Value writeBuffer[block_size]; + + while (first + key_idx < last) { + auto key = *(first + key_idx); + auto found = view.find(key, hash, key_equal); + + /* + * The ld.relaxed.gpu instruction used in view.find causes L1 to + * flush more frequently, causing increased sector stores from L2 to global memory. + * By writing results to shared memory and then synchronizing before writing back + * to global, we no longer rely on L1, preventing the increase in sector stores from + * L2 to global and improving performance. 
+ */ + writeBuffer[threadIdx.x] = found->second.load(cuda::std::memory_order_relaxed); + __syncthreads(); + *(output_begin + key_idx) = writeBuffer[threadIdx.x]; + key_idx += gridDim.x * blockDim.x; + } +} + +/** + * @brief Finds the values corresponding to all keys in the range `[first, last)`. + * + * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`. + * Else, copies the empty value sentinel. Uses the CUDA Cooperative Groups API to leverage groups + * of multiple threads to find each key. This provides a significant boost in throughput compared + * to the non Cooperative Group `find` at moderate to high load factors. + * + * @tparam block_size The size of the thread block + * @tparam tile_size The number of threads in the Cooperative Groups used to perform + * inserts + * @tparam Value The type of the mapped value for the map + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam OutputIt Device accessible output iterator whose `value_type` is + * convertible to the map's `mapped_type` + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of values retrieved for each key + * @param view Device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ +template +__global__ void find( + InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) +{ + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; + __shared__ Value writeBuffer[block_size]; + + while 
(first + key_idx < last) { + auto key = *(first + key_idx); + auto found = view.find(tile, key, hash, key_equal); + + /* + * The ld.relaxed.gpu instruction used in view.find causes L1 to + * flush more frequently, causing increased sector stores from L2 to global memory. + * By writing results to shared memory and then synchronizing before writing back + * to global, we no longer rely on L1, preventing the increase in sector stores from + * L2 to global and improving performance. + */ + if (tile.thread_rank() == 0) { + writeBuffer[threadIdx.x / tile_size] = found->second.load(cuda::std::memory_order_relaxed); + } + __syncthreads(); + if (tile.thread_rank() == 0) { + *(output_begin + key_idx) = writeBuffer[threadIdx.x / tile_size]; + } + key_idx += (gridDim.x * blockDim.x) / tile_size; + } +} + +/** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. + * + * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. + * + * @tparam block_size The size of the thread block + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam OutputIt Device accessible output iterator whose `value_type` is + * convertible to the map's `mapped_type` + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param view Device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ +template +__global__ void contains( + InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) +{ + auto tid = 
blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid; + __shared__ bool writeBuffer[block_size]; + + while (first + key_idx < last) { + auto key = *(first + key_idx); + + /* + * The ld.relaxed.gpu instruction used in view.find causes L1 to + * flush more frequently, causing increased sector stores from L2 to global memory. + * By writing results to shared memory and then synchronizing before writing back + * to global, we no longer rely on L1, preventing the increase in sector stores from + * L2 to global and improving performance. + */ + writeBuffer[threadIdx.x] = view.contains(key, hash, key_equal); + __syncthreads(); + *(output_begin + key_idx) = writeBuffer[threadIdx.x]; + key_idx += gridDim.x * blockDim.x; + } +} + +/** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. + * + * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. + * Uses the CUDA Cooperative Groups API to leverage groups of multiple threads to perform the + * contains operation for each key. This provides a significant boost in throughput compared + * to the non Cooperative Group `contains` at moderate to high load factors. 
+ * + * @tparam block_size The size of the thread block + * @tparam tile_size The number of threads in the Cooperative Groups used to perform + * inserts + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam OutputIt Device accessible output iterator whose `value_type` is + * convertible to the map's `mapped_type` + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param view Device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ +template +__global__ void contains( + InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) +{ + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; + __shared__ bool writeBuffer[block_size]; + + while (first + key_idx < last) { + auto key = *(first + key_idx); + auto found = view.contains(tile, key, hash, key_equal); + + /* + * The ld.relaxed.gpu instruction used in view.find causes L1 to + * flush more frequently, causing increased sector stores from L2 to global memory. + * By writing results to shared memory and then synchronizing before writing back + * to global, we no longer rely on L1, preventing the increase in sector stores from + * L2 to global and improving performance. 
+ */ + if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found; } + __syncthreads(); + if (tile.thread_rank() == 0) { + *(output_begin + key_idx) = writeBuffer[threadIdx.x / tile_size]; + } + key_idx += (gridDim.x * blockDim.x) / tile_size; + } +} + +} // namespace detail +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh new file mode 100644 index 000000000..c66958fb8 --- /dev/null +++ b/include/cuco/static_reduction_map.cuh @@ -0,0 +1,1022 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#ifndef CUDART_VERSION +#error CUDART_VERSION Undefined! 
#elif (CUDART_VERSION >= 11000)  // including with CUDA 10.2 leads to compilation errors
// NOTE(review): include target stripped in extraction — confirm upstream.
#include <cuda/std/tuple>
#endif

// NOTE(review): the four include targets below were stripped in extraction;
// reconstructed guesses based on names used in this header — confirm upstream.
#include <cooperative_groups.h>
#include <cub/cub.cuh>
#include <thrust/functional.h>
#include <thrust/pair.h>

namespace cuco {

/**
 * @brief Reduction functor that accumulates values into a slot via atomic addition.
 *
 * `identity` is the value empty slots are initialized to; it is the identity
 * element of the reduction (0 for addition).
 *
 * @tparam T Value type stored in the map's slots
 */
// NOTE(review): all functor template parameter lists and the atomic template
// arguments below were stripped in extraction; reconstructed so `Scope` and
// the value argument type `T2` are deduced — confirm against upstream.
template <typename T>
struct reduce_add {
  using value_type            = T;
  static constexpr T identity = 0;

  /// Atomically performs `slot += value`; returns the pre-update slot value.
  template <cuda::thread_scope Scope, typename T2>
  __device__ T apply(cuda::atomic<T, Scope>& slot, T2 const& value) const
  {
    return slot.fetch_add(value, cuda::memory_order_relaxed);
  }
};

/**
 * @brief Reduction functor that accumulates values into a slot via atomic subtraction.
 *
 * @tparam T Value type stored in the map's slots
 */
template <typename T>
struct reduce_sub {
  using value_type            = T;
  static constexpr T identity = 0;

  /// Atomically performs `slot -= value`; returns the pre-update slot value.
  template <cuda::thread_scope Scope, typename T2>
  __device__ T apply(cuda::atomic<T, Scope>& slot, T2 const& value) const
  {
    return slot.fetch_sub(value, cuda::memory_order_relaxed);
  }
};

/**
 * @brief Reduction functor that keeps the minimum of all values reduced into a slot.
 *
 * `identity` is `numeric_limits<T>::max()` so any real value replaces an empty slot.
 *
 * @tparam T Value type stored in the map's slots
 */
// NOTE(review): fetch_min/fetch_max availability depends on libcu++'s
// cuda::atomic support for T — confirm T is an integral type upstream.
template <typename T>
struct reduce_min {
  using value_type            = T;
  static constexpr T identity = std::numeric_limits<T>::max();

  /// Atomically performs `slot = min(slot, value)`; returns the pre-update slot value.
  template <cuda::thread_scope Scope, typename T2>
  __device__ T apply(cuda::atomic<T, Scope>& slot, T2 const& value) const
  {
    return slot.fetch_min(value, cuda::memory_order_relaxed);
  }
};

/**
 * @brief Reduction functor that keeps the maximum of all values reduced into a slot.
 *
 * `identity` is `numeric_limits<T>::lowest()` so any real value replaces an empty slot.
 *
 * @tparam T Value type stored in the map's slots
 */
template <typename T>
struct reduce_max {
  using value_type            = T;
  static constexpr T identity = std::numeric_limits<T>::lowest();

  /// Atomically performs `slot = max(slot, value)`; returns the pre-update slot value.
  template <cuda::thread_scope Scope, typename T2>
  __device__ T apply(cuda::atomic<T, Scope>& slot, T2 const& value) const
  {
    return slot.fetch_max(value, cuda::memory_order_relaxed);
  }
};

/**
 * @brief Reduction functor that applies an arbitrary binary operation `Op`
 * via a compare-and-swap retry loop.
 *
 * @tparam T Value type stored in the map's slots
 * @tparam Identity Identity element of `Op`, used to initialize empty slots
 * @tparam Op Binary callable computing `op(current, value)`
 */
// NOTE(review): parameter order reconstructed (list stripped in extraction) —
// confirm against upstream.
template <typename T, T Identity, typename Op>
struct custom_op {
  using value_type            = T;
  static constexpr T identity = Identity;

  Op op;

  /**
   * @brief Atomically replaces `slot` with `op(slot, value)`.
   *
   * Uses `compare_exchange_weak` in the retry loop: a spurious failure simply
   * retries with the refreshed `old`, and the weak form can be cheaper than
   * the strong form inside a loop.
   *
   * @return The slot value observed immediately before the successful update
   */
  template <cuda::thread_scope Scope, typename T2>
  __device__ T apply(cuda::atomic<T, Scope>& slot, T2 const& value) const
  {
    auto old = slot.load(cuda::memory_order_relaxed);
    // compare_exchange refreshes `old` on failure, so `op` is always applied
    // to the latest observed slot value.
    while (not slot.compare_exchange_weak(old, op(old, value), cuda::memory_order_relaxed)) {}
    return old;
  }
};

/**
 * @brief A GPU-accelerated, unordered, associative container of key-value
 * pairs with unique keys.
 *
 * Allows constant time concurrent inserts or concurrent find operations (not
 * concurrent insert and find) from threads in device code.
+ * + * Current limitations: + * - Requires keys that are Arithmetic + * - Does not support erasing keys + * - Capacity is fixed and will not grow automatically + * - Requires the user to specify sentinel values for both key and mapped value + * to indicate empty slots + * - Does not support concurrent insert and find operations + * + * The `static_reduction_map` supports two types of operations: + * - Host-side "bulk" operations + * - Device-side "singular" operations + * + * The host-side bulk operations include `insert`, `find`, and `contains`. These + * APIs should be used when there are a large number of keys to insert or lookup + * in the map. For example, given a range of keys specified by device-accessible + * iterators, the bulk `insert` function will insert all keys into the map. + * + * The singular device-side operations allow individual threads to to perform + * independent insert or find/contains operations from device code. These + * operations are accessed through non-owning, trivially copyable "view" types: + * `device_view` and `mutable_device_view`. The `device_view` class is an + * immutable view that allows only non-modifying operations such as `find` or + * `contains`. The `mutable_device_view` class only allows `insert` operations. + * The two types are separate to prevent erroneous concurrent insert/find + * operations. + * + * Example: + * \code{.cpp} + * int empty_key_sentinel = -1; + * int empty_value_sentine = -1; + * + * // Constructs a map with 100,000 slots using -1 and -1 as the empty key/value + * // sentinels. Note the capacity is chosen knowing we will insert 50,000 keys, + * // for an load factor of 50%. + * static_reduction_map m{100'000, empty_key_sentinel, empty_value_sentinel}; + * + * // Create a sequence of pairs {{0,0}, {1,1}, ... 
{i,i}} + * thrust::device_vector> pairs(50,000); + * thrust::transform(thrust::make_counting_iterator(0), + * thrust::make_counting_iterator(pairs.size()), + * pairs.begin(), + * []__device__(auto i){ return thrust::make_pair(i,i); }; + * + * + * // Inserts all pairs into the map + * m.insert(pairs.begin(), pairs.end()); + * + * // Get a `device_view` and passes it to a kernel where threads may perform + * // `find/contains` lookups + * kernel<<<...>>>(m.get_device_view()); + * \endcode + * + * + * @tparam Key Arithmetic type used for key + * @tparam Value Type of the mapped values + * @tparam Scope The scope in which insert/find operations will be performed by + * individual threads. + * @tparam Allocator Type of allocator used for device storage + */ +template > +class static_reduction_map { + static_assert(std::is_arithmetic::value, "Unsupported, non-arithmetic key type."); + static_assert(std::is_same::value, + "Type mismatch between ReductionOp::value_type and Value"); + + public: + using value_type = cuco::pair_type; + using key_type = Key; + using mapped_type = Value; + using atomic_key_type = cuda::atomic; + using atomic_mapped_type = cuda::atomic; + using pair_atomic_type = cuco::pair_type; + using atomic_ctr_type = cuda::atomic; + using allocator_type = Allocator; + using slot_allocator_type = + typename std::allocator_traits::rebind_alloc; + + static_reduction_map(static_reduction_map const&) = delete; + static_reduction_map(static_reduction_map&&) = delete; + static_reduction_map& operator=(static_reduction_map const&) = delete; + static_reduction_map& operator=(static_reduction_map&&) = delete; + + /** + * @brief Construct a fixed-size map with the specified capacity and sentinel values. + * @brief Construct a statically sized map with the specified number of slots + * and sentinel values. + * + * The capacity of the map is fixed. Insert operations will not automatically + * grow the map. 
Attempting to insert more unique keys than the capacity of + * the map results in undefined behavior. + * + * Performance begins to degrade significantly beyond a load factor of ~70%. + * For best performance, choose a capacity that will keep the load factor + * below 70%. E.g., if inserting `N` unique keys, choose a capacity of + * `N * (1/0.7)`. + * + * The `empty_key_sentinel` and `empty_value_sentinel` values are reserved and + * undefined behavior results from attempting to insert any key/value pair + * that contains either. + * + * @param capacity The total number of slots in the map + * @param empty_key_sentinel The reserved key value for empty slots + * @param empty_value_sentinel The reserved mapped value for empty slots + * @param alloc Allocator used for allocating device storage + */ + static_reduction_map(std::size_t capacity, + Key empty_key_sentinel, + ReductionOp reduction_op = {}, + Allocator const& alloc = Allocator{}); + + /** + * @brief Destroys the map and frees its contents. + * + */ + ~static_reduction_map(); + + /** + * @brief Inserts all key/value pairs in the range `[first, last)`. + * + * If multiple keys in `[first, last)` compare equal, it is unspecified which + * element is inserted. + * + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `value_type` + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ + template , + typename KeyEqual = thrust::equal_to> + void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}); + + /** + * @brief Finds the values corresponding to all keys in the range `[first, last)`. 
+ * + * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + + * i)`. Else, copies the empty value sentinel. + * + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam OutputIt Device accessible output iterator whose `value_type` is + * convertible to the map's `mapped_type` + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of values retrieved for each key + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ + template , + typename KeyEqual = thrust::equal_to> + void find(InputIt first, + InputIt last, + OutputIt output_begin, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; + + /** + * @brief Retrieves all of the keys and their associated values. + * + * The order in which keys are returned is implementation defined and not guaranteed to be + * consistent between subsequent calls to `retrieve_all`. + * + * Behavior is undefined if the range beginning at `keys_out` or `values_out` is not large enough + * to contain the number of keys in the map. + * + * @tparam KeyOut Device accessible random access output iterator whose `value_type` is + * convertible from `key_type`. + * @tparam ValueOut Device accesible random access output iterator whose `value_type` is + * convertible from `mapped_type`. + * @param keys_out Beginning output iterator for keys + * @param values_out Beginning output iterator for values + */ + template + void retrieve_all(KeyOut keys_out, ValueOut values_out); + + /** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. + * + * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. 
+ * + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam OutputIt Device accessible output iterator whose `value_type` is + * convertible to the map's `mapped_type` + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ + template , + typename KeyEqual = thrust::equal_to> + void contains(InputIt first, + InputIt last, + OutputIt output_begin, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; + + private: + class device_view_base { + protected: + // Import member type definitions from `static_reduction_map` + using value_type = value_type; + using key_type = Key; + using mapped_type = Value; + using iterator = pair_atomic_type*; + using const_iterator = pair_atomic_type const*; + + private: + pair_atomic_type* slots_{}; ///< Pointer to flat slots storage + std::size_t capacity_{}; ///< Total number of slots + Key empty_key_sentinel_{}; ///< Key value that represents an empty slot + Value empty_value_sentinel_{}; ///< Initial Value of empty slot + ReductionOp op_{}; ///< Binary operation reduction function object + + protected: + __host__ __device__ device_view_base(pair_atomic_type* slots, + std::size_t capacity, + Key empty_key_sentinel, + ReductionOp reduction_op) noexcept + : slots_{slots}, + capacity_{capacity}, + empty_key_sentinel_{empty_key_sentinel}, + empty_value_sentinel_{ReductionOp::identity}, + op_{reduction_op} + { + } + + /** + * @brief Gets the binary op + * + */ + __device__ ReductionOp get_op() const { return op_; } + + /** + * @brief Gets slots array. 
+ * + * @return Slots array + */ + __device__ pair_atomic_type* get_slots() noexcept { return slots_; } + + /** + * @brief Gets slots array. + * + * @return Slots array + */ + __device__ pair_atomic_type const* get_slots() const noexcept { return slots_; } + + /** + * @brief Returns the initial slot for a given key `k` + * + * @tparam Hash Unary callable type + * @param k The key to get the slot for + * @param hash The unary callable used to hash the key + * @return Pointer to the initial slot for `k` + */ + template + __device__ iterator initial_slot(Key const& k, Hash hash) noexcept + { + return &slots_[hash(k) % capacity_]; + } + + /** + * @brief Returns the initial slot for a given key `k` + * + * @tparam Hash Unary callable type + * @param k The key to get the slot for + * @param hash The unary callable used to hash the key + * @return Pointer to the initial slot for `k` + */ + template + __device__ const_iterator initial_slot(Key const& k, Hash hash) const noexcept + { + return &slots_[hash(k) % capacity_]; + } + + /** + * @brief Returns the initial slot for a given key `k` + * + * To be used for Cooperative Group based probing. + * + * @tparam CG Cooperative Group type + * @tparam Hash Unary callable type + * @param g the Cooperative Group for which the initial slot is needed + * @param k The key to get the slot for + * @param hash The unary callable used to hash the key + * @return Pointer to the initial slot for `k` + */ + template + __device__ iterator initial_slot(CG g, Key const& k, Hash hash) noexcept + { + return &slots_[(hash(k) + g.thread_rank()) % capacity_]; + } + + /** + * @brief Returns the initial slot for a given key `k` + * + * To be used for Cooperative Group based probing. 
+ * + * @tparam CG Cooperative Group type + * @tparam Hash Unary callable type + * @param g the Cooperative Group for which the initial slot is needed + * @param k The key to get the slot for + * @param hash The unary callable used to hash the key + * @return Pointer to the initial slot for `k` + */ + template + __device__ const_iterator initial_slot(CG g, Key const& k, Hash hash) const noexcept + { + return &slots_[(hash(k) + g.thread_rank()) % capacity_]; + } + + /** + * @brief Given a slot `s`, returns the next slot. + * + * If `s` is the last slot, wraps back around to the first slot. + * + * @param s The slot to advance + * @return The next slot after `s` + */ + __device__ iterator next_slot(iterator s) noexcept { return (++s < end()) ? s : begin_slot(); } + + /** + * @brief Given a slot `s`, returns the next slot. + * + * If `s` is the last slot, wraps back around to the first slot. + * + * @param s The slot to advance + * @return The next slot after `s` + */ + __device__ const_iterator next_slot(const_iterator s) const noexcept + { + return (++s < end()) ? s : begin_slot(); + } + + /** + * @brief Given a slot `s`, returns the next slot. + * + * If `s` is the last slot, wraps back around to the first slot. To + * be used for Cooperative Group based probing. + * + * @tparam CG The Cooperative Group type + * @param g The Cooperative Group for which the next slot is needed + * @param s The slot to advance + * @return The next slot after `s` + */ + template + __device__ iterator next_slot(CG g, iterator s) noexcept + { + uint32_t index = s - slots_; + return &slots_[(index + g.size()) % capacity_]; + } + + /** + * @brief Given a slot `s`, returns the next slot. + * + * If `s` is the last slot, wraps back around to the first slot. To + * be used for Cooperative Group based probing. 
+ * + * @tparam CG The Cooperative Group type + * @param g The Cooperative Group for which the next slot is needed + * @param s The slot to advance + * @return The next slot after `s` + */ + template + __device__ const_iterator next_slot(CG g, const_iterator s) const noexcept + { + uint32_t index = s - slots_; + return &slots_[(index + g.size()) % capacity_]; + } + + public: + /** + * @brief Gets the maximum number of elements the hash map can hold. + * + * @return The maximum number of elements the hash map can hold + */ + __host__ __device__ std::size_t get_capacity() const noexcept { return capacity_; } + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + __host__ __device__ Key get_empty_key_sentinel() const noexcept { return empty_key_sentinel_; } + + /** + * @brief Gets the sentinel value used to represent an empty value slot. + * + * @return The sentinel value used to represent an empty value slot + */ + __host__ __device__ Value get_empty_value_sentinel() const noexcept + { + return empty_value_sentinel_; + } + + /** + * @brief Returns iterator to the first slot. + * + * @note Unlike `std::map::begin()`, the `begin_slot()` iterator does _not_ point to the first + * occupied slot. Instead, it refers to the first slot in the array of contiguous slot storage. + * Iterating from `begin_slot()` to `end_slot()` will iterate over all slots, including those + * both empty and filled. + * + * There is no `begin()` iterator to avoid confusion as it is not possible to provide an + * iterator over only the filled slots. + * + * @return Iterator to the first slot + */ + __device__ iterator begin_slot() noexcept { return slots_; } + + /** + * @brief Returns iterator to the first slot. + * + * @note Unlike `std::map::begin()`, the `begin_slot()` iterator does _not_ point to the first + * occupied slot. 
Instead, it refers to the first slot in the array of contiguous slot storage. + * Iterating from `begin_slot()` to `end_slot()` will iterate over all slots, including those + * both empty and filled. + * + * There is no `begin()` iterator to avoid confusion as it is not possible to provide an + * iterator over only the filled slots. + * + * @return Iterator to the first slot + */ + __device__ const_iterator begin_slot() const noexcept { return slots_; } + + /** + * @brief Returns a const_iterator to one past the last slot. + * + * @return A const_iterator to one past the last slot + */ + __host__ __device__ const_iterator end_slot() const noexcept { return slots_ + capacity_; } + + /** + * @brief Returns an iterator to one past the last slot. + * + * @return An iterator to one past the last slot + */ + __host__ __device__ iterator end_slot() noexcept { return slots_ + capacity_; } + + /** + * @brief Returns a const_iterator to one past the last slot. + * + * `end()` calls `end_slot()` and is provided for convenience for those familiar with checking + * an iterator returned from `find()` against the `end()` iterator. + * + * @return A const_iterator to one past the last slot + */ + __host__ __device__ const_iterator end() const noexcept { return end_slot(); } + + /** + * @brief Returns an iterator to one past the last slot. + * + * `end()` calls `end_slot()` and is provided for convenience for those familiar with checking + * an iterator returned from `find()` against the `end()` iterator. + * + * @return An iterator to one past the last slot + */ + __host__ __device__ iterator end() noexcept { return end_slot(); } + }; + + public: + /** + * @brief Mutable, non-owning view-type that may be used in device code to + * perform singular inserts into the map. + * + * `device_mutable_view` is trivially-copyable and is intended to be passed by + * value. 
+ * + * Example: + * \code{.cpp} + * cuco::static_reduction_map m{100'000, -1, -1}; + * + * // Inserts a sequence of pairs {{0,0}, {1,1}, ... {i,i}} + * thrust::for_each(thrust::make_counting_iterator(0), + * thrust::make_counting_iterator(50'000), + * [map = m.get_mutable_device_view()] + * __device__ (auto i) mutable { + * map.insert(thrust::make_pair(i,i)); + * }); + * \endcode + */ + class device_mutable_view : public device_view_base { + public: + using value_type = typename device_view_base::value_type; + using key_type = typename device_view_base::key_type; + using mapped_type = typename device_view_base::mapped_type; + using iterator = typename device_view_base::iterator; + using const_iterator = typename device_view_base::const_iterator; + /** + * @brief Construct a mutable view of the first `capacity` slots of the + * slots array pointed to by `slots`. + * + * @param slots Pointer to beginning of initialized slots array + * @param capacity The number of slots viewed by this object + * @param empty_key_sentinel The reserved value for keys to represent empty + * slots + * @param empty_value_sentinel The reserved value for mapped values to + * represent empty slots + */ + __host__ __device__ device_mutable_view(pair_atomic_type* slots, + std::size_t capacity, + Key empty_key_sentinel, + ReductionOp reduction_op = {}) noexcept + : device_view_base{slots, capacity, empty_key_sentinel, reduction_op} + { + } + + /** + * @brief Inserts the specified key/value pair into the map. + * + * Returns a pair consisting of an iterator to the inserted element (or to + * the element that prevented the insertion) and a `bool` denoting whether + * the insertion took place. 
+ * + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param insert_pair The pair to insert + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys for + * equality + * @return `true` if the insert was successful, `false` otherwise. + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ Value insert(value_type const& insert_pair, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; + /** + * @brief Inserts the specified key/value pair into the map. + * + * Returns a pair consisting of an iterator to the inserted element (or to + * the element that prevented the insertion) and a `bool` denoting whether + * the insertion took place. Uses the CUDA Cooperative Groups API to + * to leverage multiple threads to perform a single insert. This provides a + * significant boost in throughput compared to the non Cooperative Group + * `insert` at moderate to high load factors. + * + * @tparam Cooperative Group type + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * + * @param g The Cooperative Group that performs the insert + * @param insert_pair The pair to insert + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys for + * equality + * @return `true` if the insert was successful, `false` otherwise. + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ bool insert(CG g, + value_type const& insert_pair, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; + + }; // class device mutable view + + /** + * @brief Non-owning view-type that may be used in device code to + * perform singular find and contains operations for the map. + * + * `device_view` is trivially-copyable and is intended to be passed by + * value. 
+ * + */ + class device_view : public device_view_base { + public: + using value_type = typename device_view_base::value_type; + using key_type = typename device_view_base::key_type; + using mapped_type = typename device_view_base::mapped_type; + using iterator = typename device_view_base::iterator; + using const_iterator = typename device_view_base::const_iterator; + /** + * @brief Construct a view of the first `capacity` slots of the + * slots array pointed to by `slots`. + * + * @param slots Pointer to beginning of initialized slots array + * @param capacity The number of slots viewed by this object + * @param empty_key_sentinel The reserved value for keys to represent empty + * slots + * @param empty_value_sentinel The reserved value for mapped values to + * represent empty slots + */ + __host__ __device__ device_view(pair_atomic_type* slots, + std::size_t capacity, + Key empty_key_sentinel, + ReductionOp reduction_op = {}) noexcept + : device_view_base{slots, capacity, empty_key_sentinel, reduction_op} + { + } + + /** + * @brief Makes a copy of given `device_view` using non-owned memory. + * + * This function is intended to be used to create shared memory copies of small static maps, + * although global memory can be used as well. 
+ * + * Example: + * @code{.cpp} + * template + * __global__ void use_device_view(const typename MapType::device_view device_view, + * map_key_t const* const keys_to_search, + * map_value_t* const values_found, + * const size_t number_of_elements) + * { + * const size_t index = blockIdx.x * blockDim.x + threadIdx.x; + * + * __shared__ typename MapType::pair_atomic_type sm_buffer[CAPACITY]; + * + * auto g = cg::this_thread_block(); + * + * const map_t::device_view sm_static_reduction_map = device_view.make_copy(g, + * sm_buffer); + * + * for (size_t i = g.thread_rank(); i < number_of_elements; i += g.size()) + * { + * values_found[i] = sm_static_reduction_map.find(keys_to_search[i])->second; + * } + * } + * @endcode + * + * @tparam CG The type of the cooperative thread group + * @param g The ooperative thread group used to copy the slots + * @param source_device_view `device_view` to copy from + * @param memory_to_use Array large enough to support `capacity` elements. Object does not take + * the ownership of the memory + * @return Copy of passed `device_view` + */ + template + __device__ static device_view make_copy(CG g, + pair_atomic_type* const memory_to_use, + device_view source_device_view) noexcept + { +#ifndef CUDART_VERSION +#error CUDART_VERSION Undefined! 
+#elif (CUDART_VERSION >= 11000) + __shared__ cuda::barrier barrier; + if (g.thread_rank() == 0) { init(&barrier, g.size()); } + g.sync(); + + cuda::memcpy_async(g, + memory_to_use, + source_device_view.get_slots(), + sizeof(pair_atomic_type) * source_device_view.get_capacity(), + barrier); + + barrier.arrive_and_wait(); +#else + pair_atomic_type const* const slots_ptr = source_device_view.get_slots(); + for (std::size_t i = g.thread_rank(); i < source_device_view.get_capacity(); i += g.size()) { + new (&memory_to_use[i].first) + atomic_key_type{slots_ptr[i].first.load(cuda::memory_order_relaxed)}; + new (&memory_to_use[i].second) + atomic_mapped_type{slots_ptr[i].second.load(cuda::memory_order_relaxed)}; + } + g.sync(); +#endif + + return device_view(memory_to_use, + source_device_view.get_capacity(), + source_device_view.get_empty_key_sentinel(), + source_device_view.get_empty_value_sentinel()); + } + + /** + * @brief Finds the value corresponding to the key `k`. + * + * Returns an iterator to the pair whose key is equivalent to `k`. + * If no such pair exists, returns `end()`. + * + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param k The key to search for + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys + * for equality + * @return An iterator to the position at which the key/value pair + * containing `k` was inserted + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ iterator find(Key const& k, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; + + /** @brief Finds the value corresponding to the key `k`. + * + * Returns a const_iterator to the pair whose key is equivalent to `k`. + * If no such pair exists, returns `end()`. 
+ * + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param k The key to search for + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys + * for equality + * @return An iterator to the position at which the key/value pair + * containing `k` was inserted + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ const_iterator find(Key const& k, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) const noexcept; + + /** + * @brief Finds the value corresponding to the key `k`. + * + * Returns an iterator to the pair whose key is equivalent to `k`. + * If no such pair exists, returns `end()`. Uses the CUDA Cooperative Groups API to + * to leverage multiple threads to perform a single find. This provides a + * significant boost in throughput compared to the non Cooperative Group + * `find` at moderate to high load factors. + * + * @tparam CG Cooperative Group type + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param g The Cooperative Group used to perform the find + * @param k The key to search for + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys + * for equality + * @return An iterator to the position at which the key/value pair + * containing `k` was inserted + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ iterator + find(CG g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; + + /** + * @brief Finds the value corresponding to the key `k`. + * + * Returns a const_iterator to the pair whose key is equivalent to `k`. + * If no such pair exists, returns `end()`. Uses the CUDA Cooperative Groups API to + * to leverage multiple threads to perform a single find. This provides a + * significant boost in throughput compared to the non Cooperative Group + * `find` at moderate to high load factors. 
+ * + * @tparam CG Cooperative Group type + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param g The Cooperative Group used to perform the find + * @param k The key to search for + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys + * for equality + * @return An iterator to the position at which the key/value pair + * containing `k` was inserted + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ const_iterator + find(CG g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) const noexcept; + + /** + * @brief Indicates whether the key `k` was inserted into the map. + * + * If the key `k` was inserted into the map, find returns + * true. Otherwise, it returns false. + * + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param k The key to search for + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys + * for equality + * @return A boolean indicating whether the key/value pair + * containing `k` was inserted + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ bool contains(Key const& k, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; + + /** + * @brief Indicates whether the key `k` was inserted into the map. + * + * If the key `k` was inserted into the map, find returns + * true. Otherwise, it returns false. Uses the CUDA Cooperative Groups API to + * to leverage multiple threads to perform a single contains operation. This provides a + * significant boost in throughput compared to the non Cooperative Group + * `contains` at moderate to high load factors. 
+ * + * @tparam CG Cooperative Group type + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param g The Cooperative Group used to perform the contains operation + * @param k The key to search for + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys + * for equality + * @return A boolean indicating whether the key/value pair + * containing `k` was inserted + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ bool contains(CG g, + Key const& k, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; + }; // class device_view + + /** + * @brief Gets the maximum number of elements the hash map can hold. + * + * @return The maximum number of elements the hash map can hold + */ + std::size_t get_capacity() const noexcept { return capacity_; } + + /** + * @brief Gets the number of elements in the hash map. + * + * @return The number of elements in the map + */ + std::size_t get_size() const noexcept { return size_; } + + /** + * @brief Gets the load factor of the hash map. + * + * @return The load factor of the hash map + */ + float get_load_factor() const noexcept { return static_cast(size_) / capacity_; } + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + Key get_empty_key_sentinel() const noexcept { return empty_key_sentinel_; } + + /** + * @brief Gets the sentinel value used to represent an empty value slot. + * + * @return The sentinel value used to represent an empty value slot + */ + Value get_empty_value_sentinel() const noexcept { return empty_value_sentinel_; } + + /** + * @brief Constructs a device_view object based on the members of the `static_reduction_map` + * object. 
+ * + * @return A device_view object based on the members of the `static_reduction_map` object + */ + device_view get_device_view() const noexcept + { + return device_view(slots_, capacity_, empty_key_sentinel_, op_); + } + + /** + * @brief Constructs a device_mutable_view object based on the members of the + * `static_reduction_map` object + * + * @return A device_mutable_view object based on the members of the `static_reduction_map` object + */ + device_mutable_view get_device_mutable_view() const noexcept + { + return device_mutable_view(slots_, capacity_, empty_key_sentinel_, op_); + } + + private: + /// Unsafe access to the slots stripping away their atomic-ness to allow non-atomic access. This + /// is a temporary solution until we have atomic_ref + value_type* raw_slots_begin() noexcept { return reinterpret_cast(slots_); } + + value_type const* raw_slots_begin() const noexcept + { + return reinterpret_cast(slots_); + } + + value_type* raw_slots_end() noexcept { return raw_slots_begin() + get_capacity(); } + + value_type const* raw_slots_end() const noexcept { return raw_slots_begin() + get_capacity(); } + + pair_atomic_type* slots_{nullptr}; ///< Pointer to flat slots storage + std::size_t capacity_{}; ///< Total number of slots + std::size_t size_{}; ///< Number of keys in map + Key empty_key_sentinel_{}; ///< Key value that represents an empty slot + Value empty_value_sentinel_{}; ///< Initial value of empty slot + atomic_ctr_type* num_successes_{}; ///< Number of successfully inserted keys on insert + ReductionOp op_{}; ///< Binary operation reduction function object + slot_allocator_type slot_allocator_{}; ///< Allocator used to allocate slots +}; +} // namespace cuco + +#include \ No newline at end of file diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9c142ef58..a926d21f3 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -39,7 +39,7 @@ function(ConfigureTest TEST_NAME TEST_SRC) $) # Link in the CatchMain object file 
target_link_libraries(${TEST_NAME} Catch2::Catch2 cuco) set_target_properties(${TEST_NAME} PROPERTIES - CUDA_ARCHITECTURES ${GPU_ARCHS} + CUDA_ARCHITECTURES "${GPU_ARCHS}" RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests") target_compile_options(${TEST_NAME} PRIVATE --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage) catch_discover_tests(${TEST_NAME}) @@ -48,13 +48,9 @@ endfunction(ConfigureTest) ################################################################################################### ### test sources ################################################################################## ################################################################################################### -set(STATIC_MAP_TEST_SRC - "${CMAKE_CURRENT_SOURCE_DIR}/static_map/static_map_test.cu") -ConfigureTest(STATIC_MAP_TEST "${STATIC_MAP_TEST_SRC}") -#################################################################################################### -set(DYNAMIC_MAP_TEST_SRC - "${CMAKE_CURRENT_SOURCE_DIR}/dynamic_map/dynamic_map_test.cu") +ConfigureTest(STATIC_MAP_TEST "${CMAKE_CURRENT_SOURCE_DIR}/static_map/static_map_test.cu") -ConfigureTest(DYNAMIC_MAP_TEST "${DYNAMIC_MAP_TEST_SRC}") -#################################################################################################### \ No newline at end of file +ConfigureTest(STATIC_REDUCTION_MAP_TEST "${CMAKE_CURRENT_SOURCE_DIR}/static_reduction_map/static_reduction_map_test.cu") + +ConfigureTest(DYNAMIC_MAP_TEST "${CMAKE_CURRENT_SOURCE_DIR}/dynamic_map/dynamic_map_test.cu") \ No newline at end of file diff --git a/tests/static_reduction_map/static_reduction_map_test.cu b/tests/static_reduction_map/static_reduction_map_test.cu new file mode 100644 index 000000000..bb57f0847 --- /dev/null +++ b/tests/static_reduction_map/static_reduction_map_test.cu @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +namespace { +// Thrust logical algorithms (any_of/all_of/none_of) don't work with device +// lambdas: See https://github.com/thrust/thrust/issues/1062 +template +bool all_of(Iterator begin, Iterator end, Predicate p) +{ + auto size = thrust::distance(begin, end); + return size == thrust::count_if(begin, end, p); +} + +template +bool any_of(Iterator begin, Iterator end, Predicate p) +{ + return thrust::count_if(begin, end, p) > 0; +} + +template +bool none_of(Iterator begin, Iterator end, Predicate p) +{ + return not all_of(begin, end, p); +} +} // namespace + +TEMPLATE_TEST_CASE_SIG("Insert all identical keys", + "", + ((typename Key, typename Value), Key, Value), + (int32_t, int32_t)) +{ + thrust::device_vector keys(100, 42); + thrust::device_vector values(keys.size(), 1); + + auto const num_slots{keys.size() * 2}; + cuco::static_reduction_map, Key, Value> map{num_slots, -1}; + + auto zip = thrust::make_zip_iterator(thrust::make_tuple(keys.begin(), values.begin())); + auto zip_end = zip + keys.size(); + map.insert(zip, zip_end); + + SECTION("There should only be one key in the map") { REQUIRE(map.get_size() == 1); } + + SECTION("Map should contain the inserted key") + { + thrust::device_vector contained(keys.size()); + map.contains(keys.begin(), keys.end(), contained.begin()); + REQUIRE(all_of(contained.begin(), contained.end(), 
[] __device__(bool c) { return c; })); + } + + SECTION("Found value should equal aggregate of inserted values") + { + thrust::device_vector found(keys.size()); + map.find(keys.begin(), keys.end(), found.begin()); + auto const expected_aggregate = keys.size(); // All keys inserted "1", so the + // sum aggregate should be + // equal to the number of keys inserted + REQUIRE(all_of(found.begin(), found.end(), [expected_aggregate] __device__(Value v) { + return v == expected_aggregate; + })); + } +} + +TEMPLATE_TEST_CASE_SIG("Insert all unique keys", + "", + ((typename Key, typename Value), Key, Value), + (int32_t, int32_t)) +{ + constexpr std::size_t num_keys = 10000; + constexpr std::size_t num_slots{num_keys * 2}; + cuco::static_reduction_map, Key, Value> map{num_slots, -1}; + + auto keys_begin = thrust::make_counting_iterator(0); + auto values_begin = thrust::make_counting_iterator(0); + auto zip = thrust::make_zip_iterator(thrust::make_tuple(keys_begin, values_begin)); + auto zip_end = zip + num_keys; + map.insert(zip, zip_end); + + SECTION("Size of map should equal number of inserted keys") + { + REQUIRE(map.get_size() == num_keys); + } + + SECTION("Map should contain the inserted keys") + { + thrust::device_vector contained(num_keys); + map.contains(keys_begin, keys_begin + num_keys, contained.begin()); + REQUIRE(all_of(contained.begin(), contained.end(), [] __device__(bool c) { return c; })); + } + + SECTION("Found value should equal inserted value") + { + thrust::device_vector found(num_keys); + map.find(keys_begin, keys_begin + num_keys, found.begin()); + REQUIRE(thrust::equal(thrust::device, values_begin, values_begin + num_keys, found.begin())); + } +}