From 033d2bc52469ff658757f0eb762ed0e52fcc6d28 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Wed, 16 Dec 2020 09:54:09 -0600 Subject: [PATCH 01/69] Added initial static_reduction_map files. Copied existing static_map files and just renamed all references to static_map to static_reduction_map. --- include/cuco/detail/static_reduction_map.inl | 396 ++++++++ .../detail/static_reduction_map_kernels.cuh | 386 ++++++++ include/cuco/static_reduction_map.cuh | 929 ++++++++++++++++++ tests/CMakeLists.txt | 12 +- .../static_reduction_map_test.cu | 355 +++++++ 5 files changed, 2070 insertions(+), 8 deletions(-) create mode 100644 include/cuco/detail/static_reduction_map.inl create mode 100644 include/cuco/detail/static_reduction_map_kernels.cuh create mode 100644 include/cuco/static_reduction_map.cuh create mode 100644 tests/static_reduction_map/static_reduction_map_test.cu diff --git a/include/cuco/detail/static_reduction_map.inl b/include/cuco/detail/static_reduction_map.inl new file mode 100644 index 000000000..243032f6b --- /dev/null +++ b/include/cuco/detail/static_reduction_map.inl @@ -0,0 +1,396 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +namespace cuco { + +/**---------------------------------------------------------------------------* + * @brief Enumeration of the possible results of attempting to insert into + *a hash bucket + *---------------------------------------------------------------------------**/ +enum class insert_result { + CONTINUE, ///< Insert did not succeed, continue trying to insert + SUCCESS, ///< New pair inserted successfully + DUPLICATE ///< Insert did not succeed, key is already present +}; + +template +static_reduction_map::static_reduction_map(std::size_t capacity, + Key empty_key_sentinel, + Value empty_value_sentinel, + Allocator const& alloc) + : capacity_{capacity}, + empty_key_sentinel_{empty_key_sentinel}, + empty_value_sentinel_{empty_value_sentinel}, + slot_allocator_{alloc} +{ + slots_ = std::allocator_traits::allocate(slot_allocator_, capacity); + + auto constexpr block_size = 256; + auto constexpr stride = 4; + auto const grid_size = (capacity + stride * block_size - 1) / (stride * block_size); + detail::initialize + <<>>(slots_, empty_key_sentinel, empty_value_sentinel, capacity); + + CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type))); +} + +template +static_reduction_map::~static_reduction_map() +{ + std::allocator_traits::deallocate(slot_allocator_, slots_, capacity_); + CUCO_CUDA_TRY(cudaFree(num_successes_)); +} + +template +template +void static_reduction_map::insert(InputIt first, + InputIt last, + Hash hash, + KeyEqual key_equal) +{ + auto num_keys = std::distance(first, last); + auto const block_size = 128; + auto const stride = 1; + auto const tile_size = 4; + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + auto view = get_device_mutable_view(); + + *num_successes_ = 0; + int device_id; + CUCO_CUDA_TRY(cudaGetDevice(&device_id)); + CUCO_CUDA_TRY(cudaMemPrefetchAsync(num_successes_, sizeof(atomic_ctr_type), device_id)); + + detail::insert + <<>>(first, first + num_keys, 
num_successes_, view, hash, key_equal); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + + size_ += num_successes_->load(cuda::std::memory_order_relaxed); +} + +template +template +void static_reduction_map::find( + InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) noexcept +{ + auto num_keys = std::distance(first, last); + auto const block_size = 128; + auto const stride = 1; + auto const tile_size = 4; + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + auto view = get_device_view(); + + detail::find + <<>>(first, last, output_begin, view, hash, key_equal); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); +} + +template +template +void static_reduction_map::contains( + InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) noexcept +{ + auto num_keys = std::distance(first, last); + auto const block_size = 128; + auto const stride = 1; + auto const tile_size = 4; + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + auto view = get_device_view(); + + detail::contains + <<>>(first, last, output_begin, view, hash, key_equal); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); +} + +template +template +__device__ bool static_reduction_map::device_mutable_view::insert( + value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept +{ + auto current_slot{initial_slot(insert_pair.first, hash)}; + + while (true) { + using cuda::std::memory_order_relaxed; + auto expected_key = this->get_empty_key_sentinel(); + auto expected_value = this->get_empty_value_sentinel(); + auto& slot_key = current_slot->first; + auto& slot_value = current_slot->second; + + bool key_success = + slot_key.compare_exchange_strong(expected_key, insert_pair.first, memory_order_relaxed); + bool value_success = + slot_value.compare_exchange_strong(expected_value, insert_pair.second, memory_order_relaxed); + + if (key_success) { + while (not value_success) { + 
value_success = + slot_value.compare_exchange_strong(expected_value = this->get_empty_value_sentinel(), + insert_pair.second, + memory_order_relaxed); + } + return true; + } else if (value_success) { + slot_value.store(this->get_empty_value_sentinel(), memory_order_relaxed); + } + + // if the key was already inserted by another thread, then this instance is a + // duplicate, so the insert fails + if (key_equal(insert_pair.first, expected_key)) { return false; } + + // if we couldn't insert the key, but it wasn't a duplicate, then there must + // have been some other key there, so we keep looking for a slot + current_slot = next_slot(current_slot); + } +} + +template +template +__device__ bool static_reduction_map::device_mutable_view::insert( + CG g, value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept +{ + auto current_slot = initial_slot(g, insert_pair.first, hash); + + while (true) { + key_type const existing_key = current_slot->first; + + // The user-provided `key_equal` can never be used to compare against `empty_key_sentinel` as the + // sentinel is not a valid key value. 
Therefore, first check for the sentinel + auto const slot_is_empty = (existing_key == this->get_empty_key_sentinel()); + + // the key we are trying to insert is already in the map, so we return with failure to insert + if (g.ballot(not slot_is_empty and key_equal(existing_key, insert_pair.first))) { + return false; + } + + auto const window_contains_empty = g.ballot(slot_is_empty); + + // we found an empty slot, but not the key we are inserting, so this must + // be an empty slot into which we can insert the key + if (window_contains_empty) { + // the first lane in the group with an empty slot will attempt the insert + insert_result status{insert_result::CONTINUE}; + uint32_t src_lane = __ffs(window_contains_empty) - 1; + + if (g.thread_rank() == src_lane) { + using cuda::std::memory_order_relaxed; + auto expected_key = this->get_empty_key_sentinel(); + auto expected_value = this->get_empty_value_sentinel(); + auto& slot_key = current_slot->first; + auto& slot_value = current_slot->second; + + bool key_success = + slot_key.compare_exchange_strong(expected_key, insert_pair.first, memory_order_relaxed); + bool value_success = slot_value.compare_exchange_strong( + expected_value, insert_pair.second, memory_order_relaxed); + + if (key_success) { + while (not value_success) { + value_success = + slot_value.compare_exchange_strong(expected_value = this->get_empty_value_sentinel(), + insert_pair.second, + memory_order_relaxed); + } + status = insert_result::SUCCESS; + } else if (value_success) { + slot_value.store(this->get_empty_value_sentinel(), memory_order_relaxed); + } + + // our key was already present in the slot, so our key is a duplicate + if (key_equal(insert_pair.first, expected_key)) { status = insert_result::DUPLICATE; } + // another key was inserted in the slot we wanted to try + // so we need to try the next empty slot in the window + } + + uint32_t res_status = g.shfl(static_cast(status), src_lane); + status = static_cast(res_status); + + // successful 
insert + if (status == insert_result::SUCCESS) { return true; } + // duplicate present during insert + if (status == insert_result::DUPLICATE) { return false; } + // if we've gotten this far, a different key took our spot + // before we could insert. We need to retry the insert on the + // same window + } + // if there are no empty slots in the current window, + // we move onto the next window + else { + current_slot = next_slot(g, current_slot); + } + } +} + +template +template +__device__ typename static_reduction_map::device_view::iterator +static_reduction_map::device_view::find(Key const& k, + Hash hash, + KeyEqual key_equal) noexcept +{ + auto current_slot = initial_slot(k, hash); + + while (true) { + auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); + // Key doesn't exist, return end() + if (existing_key == this->get_empty_key_sentinel()) { return this->end(); } + + // Key exists, return iterator to location + if (key_equal(existing_key, k)) { return current_slot; } + + current_slot = next_slot(current_slot); + } +} + +template +template +__device__ typename static_reduction_map::device_view::const_iterator +static_reduction_map::device_view::find(Key const& k, + Hash hash, + KeyEqual key_equal) const + noexcept +{ + auto current_slot = initial_slot(k, hash); + + while (true) { + auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); + // Key doesn't exist, return end() + if (existing_key == this->get_empty_key_sentinel()) { return this->end(); } + + // Key exists, return iterator to location + if (key_equal(existing_key, k)) { return current_slot; } + + current_slot = next_slot(current_slot); + } +} + +template +template +__device__ typename static_reduction_map::device_view::iterator +static_reduction_map::device_view::find(CG g, + Key const& k, + Hash hash, + KeyEqual key_equal) noexcept +{ + auto current_slot = initial_slot(g, k, hash); + + while (true) { + auto const existing_key = 
current_slot->first.load(cuda::std::memory_order_relaxed); + + // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as the + // sentinel is not a valid key value. Therefore, first check for the sentinel + auto const slot_is_empty = (existing_key == this->get_empty_key_sentinel()); + + // the key we were searching for was found by one of the threads, + // so we return an iterator to the entry + auto const exists = g.ballot(not slot_is_empty and key_equal(existing_key, k)); + if (exists) { + uint32_t src_lane = __ffs(exists) - 1; + // TODO: This shouldn't cast an iterator to an int to shuffle. Instead, get the index of the + // current_slot and shuffle that instead. + intptr_t res_slot = g.shfl(reinterpret_cast(current_slot), src_lane); + return reinterpret_cast(res_slot); + } + + // we found an empty slot, meaning that the key we're searching for isn't present + if (g.ballot(slot_is_empty)) { return this->end(); } + + // otherwise, all slots in the current window are full with other keys, so we move onto the + // next window + current_slot = next_slot(g, current_slot); + } +} + +template +template +__device__ typename static_reduction_map::device_view::const_iterator +static_reduction_map::device_view::find( + CG g, Key const& k, Hash hash, KeyEqual key_equal) const noexcept +{ + auto current_slot = initial_slot(g, k, hash); + + while (true) { + auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); + + // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as the + // sentinel is not a valid key value. 
Therefore, first check for the sentinel + auto const slot_is_empty = (existing_key == this->get_empty_key_sentinel()); + + // the key we were searching for was found by one of the threads, so we return an iterator to + // the entry + auto const exists = g.ballot(not slot_is_empty and key_equal(existing_key, k)); + if (exists) { + uint32_t src_lane = __ffs(exists) - 1; + // TODO: This shouldn't cast an iterator to an int to shuffle. Instead, get the index of the + // current_slot and shuffle that instead. + intptr_t res_slot = g.shfl(reinterpret_cast(current_slot), src_lane); + return reinterpret_cast(res_slot); + } + + // we found an empty slot, meaning that the key we're searching + // for isn't present, so we return end() + if (g.ballot(slot_is_empty)) { return this->end(); } + + // otherwise, all slots in the current window are full with other keys, + // so we move onto the next window + + current_slot = next_slot(g, current_slot); + } +} + +template +template +__device__ bool static_reduction_map::device_view::contains( + Key const& k, Hash hash, KeyEqual key_equal) noexcept +{ + auto current_slot = initial_slot(k, hash); + + while (true) { + auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); + + if (existing_key == empty_key_sentinel_) { return false; } + + if (key_equal(existing_key, k)) { return true; } + + current_slot = next_slot(current_slot); + } +} + +template +template +__device__ bool static_reduction_map::device_view::contains( + CG g, Key const& k, Hash hash, KeyEqual key_equal) noexcept +{ + auto current_slot = initial_slot(g, k, hash); + + while (true) { + key_type const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); + + // The user-provided `key_equal` can never be used to compare against `empty_key_sentinel` as the + // sentinel is not a valid key value. 
Therefore, first check for the sentinel + auto const slot_is_empty = (existing_key == this->get_empty_key_sentinel()); + + // the key we were searching for was found by one of the threads, so we return an iterator to + // the entry + if (g.ballot(not slot_is_empty and key_equal(existing_key, k))) { return true; } + + // we found an empty slot, meaning that the key we're searching for isn't present + if (g.ballot(slot_is_empty)) { return false; } + + // otherwise, all slots in the current window are full with other keys, so we move onto the next + // window + current_slot = next_slot(g, current_slot); + } +} +} // namespace cuco diff --git a/include/cuco/detail/static_reduction_map_kernels.cuh b/include/cuco/detail/static_reduction_map_kernels.cuh new file mode 100644 index 000000000..6ded5e99d --- /dev/null +++ b/include/cuco/detail/static_reduction_map_kernels.cuh @@ -0,0 +1,386 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +namespace cuco { +namespace detail { +namespace cg = cooperative_groups; + +/** + * @brief Initializes each slot in the flat `slots` storage to contain `k` and `v`. + * + * Each space in `slots` that can hold a key value pair is initialized to a + * `pair_atomic_type` containing the key `k` and the value `v`. 
+ * + * @tparam atomic_key_type Type of the `Key` atomic container + * @tparam atomic_mapped_type Type of the `Value` atomic container + * @tparam Key key type + * @tparam Value value type + * @tparam pair_atomic_type key/value pair type + * @param slots Pointer to flat storage for the map's key/value pairs + * @param k Key to which all keys in `slots` are initialized + * @param v Value to which all values in `slots` are initialized + * @param size Size of the storage pointed to by `slots` + */ +template +__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, std::size_t size) +{ + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + while (tid < size) { + new (&slots[tid].first) atomic_key_type{k}; + new (&slots[tid].second) atomic_mapped_type{v}; + tid += gridDim.x * blockDim.x; + } +} + +/** + * @brief Inserts all key/value pairs in the range `[first, last)`. + * + * If multiple keys in `[first, last)` compare equal, it is unspecified which + * element is inserted. + * + * @tparam block_size + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `value_type` + * @tparam atomicT Type of atomic storage + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param num_successes The number of successfully inserted key/value pairs + * @param view Mutable device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function used to compare two keys for equality + */ +template +__global__ void insert( + InputIt first, InputIt last, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + 
std::size_t thread_num_successes = 0; + + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto it = first + tid; + + while (it < last) { + typename viewT::value_type const insert_pair{*it}; + if (view.insert(insert_pair, hash, key_equal)) { thread_num_successes++; } + it += gridDim.x * blockDim.x; + } + + // compute number of successfully inserted elements for each block + // and atomically add to the grand total + std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { *num_successes += block_num_successes; } +} + +/** + * @brief Inserts all key/value pairs in the range `[first, last)`. + * + * If multiple keys in `[first, last)` compare equal, it is unspecified which + * element is inserted. Uses the CUDA Cooperative Groups API to leverage groups + * of multiple threads to perform each key/value insertion. This provides a + * significant boost in throughput compared to the non Cooperative Group + * `insert` at moderate to high load factors. 
+ * + * @tparam block_size + * @tparam tile_size The number of threads in the Cooperative Groups used to perform + * inserts + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `value_type` + * @tparam atomicT Type of atomic storage + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param num_successes The number of successfully inserted key/value pairs + * @param view Mutable device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function used to compare two keys for equality + */ +template +__global__ void insert( + InputIt first, InputIt last, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + std::size_t thread_num_successes = 0; + + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto it = first + tid / tile_size; + + while (it < last) { + // force conversion to value_type + typename viewT::value_type const insert_pair{*it}; + if (view.insert(tile, insert_pair, hash, key_equal) && tile.thread_rank() == 0) { + thread_num_successes++; + } + it += (gridDim.x * blockDim.x) / tile_size; + } + + // compute number of successfully inserted elements for each block + // and atomically add to the grand total + std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { *num_successes += block_num_successes; } +} + +/** + * @brief Finds the values corresponding to all keys in the range `[first, last)`. 
+ * + * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`. + * Else, copies the empty value sentinel. + * @tparam block_size The size of the thread block + * @tparam Value The type of the mapped value for the map + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam OutputIt Device accessible output iterator whose `value_type` is + * convertible to the map's `mapped_type` + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of values retrieved for each key + * @param view Device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ +template +__global__ void find( + InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) +{ + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid; + __shared__ Value writeBuffer[block_size]; + + while (first + key_idx < last) { + auto key = *(first + key_idx); + auto found = view.find(key, hash, key_equal); + + /* + * The ld.relaxed.gpu instruction used in view.find causes L1 to + * flush more frequently, causing increased sector stores from L2 to global memory. + * By writing results to shared memory and then synchronizing before writing back + * to global, we no longer rely on L1, preventing the increase in sector stores from + * L2 to global and improving performance. 
+ */ + writeBuffer[threadIdx.x] = found->second.load(cuda::std::memory_order_relaxed); + __syncthreads(); + *(output_begin + key_idx) = writeBuffer[threadIdx.x]; + key_idx += gridDim.x * blockDim.x; + } +} + +/** + * @brief Finds the values corresponding to all keys in the range `[first, last)`. + * + * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`. + * Else, copies the empty value sentinel. Uses the CUDA Cooperative Groups API to leverage groups + * of multiple threads to find each key. This provides a significant boost in throughput compared + * to the non Cooperative Group `find` at moderate to high load factors. + * + * @tparam block_size The size of the thread block + * @tparam tile_size The number of threads in the Cooperative Groups used to perform + * inserts + * @tparam Value The type of the mapped value for the map + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam OutputIt Device accessible output iterator whose `value_type` is + * convertible to the map's `mapped_type` + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of values retrieved for each key + * @param view Device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ +template +__global__ void find( + InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) +{ + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; + __shared__ Value writeBuffer[block_size]; + + while 
(first + key_idx < last) { + auto key = *(first + key_idx); + auto found = view.find(tile, key, hash, key_equal); + + /* + * The ld.relaxed.gpu instruction used in view.find causes L1 to + * flush more frequently, causing increased sector stores from L2 to global memory. + * By writing results to shared memory and then synchronizing before writing back + * to global, we no longer rely on L1, preventing the increase in sector stores from + * L2 to global and improving performance. + */ + if (tile.thread_rank() == 0) { + writeBuffer[threadIdx.x / tile_size] = found->second.load(cuda::std::memory_order_relaxed); + } + __syncthreads(); + if (tile.thread_rank() == 0) { + *(output_begin + key_idx) = writeBuffer[threadIdx.x / tile_size]; + } + key_idx += (gridDim.x * blockDim.x) / tile_size; + } +} + +/** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. + * + * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. + * + * @tparam block_size The size of the thread block + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam OutputIt Device accessible output iterator whose `value_type` is + * convertible to the map's `mapped_type` + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param view Device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ +template +__global__ void contains( + InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) +{ + auto tid = 
blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid; + __shared__ bool writeBuffer[block_size]; + + while (first + key_idx < last) { + auto key = *(first + key_idx); + + /* + * The ld.relaxed.gpu instruction used in view.find causes L1 to + * flush more frequently, causing increased sector stores from L2 to global memory. + * By writing results to shared memory and then synchronizing before writing back + * to global, we no longer rely on L1, preventing the increase in sector stores from + * L2 to global and improving performance. + */ + writeBuffer[threadIdx.x] = view.contains(key, hash, key_equal); + __syncthreads(); + *(output_begin + key_idx) = writeBuffer[threadIdx.x]; + key_idx += gridDim.x * blockDim.x; + } +} + +/** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. + * + * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. + * Uses the CUDA Cooperative Groups API to leverage groups of multiple threads to perform the + * contains operation for each key. This provides a significant boost in throughput compared + * to the non Cooperative Group `contains` at moderate to high load factors. 
+ * + * @tparam block_size The size of the thread block + * @tparam tile_size The number of threads in the Cooperative Groups used to perform + * each `contains` lookup + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam OutputIt Device accessible output iterator whose `value_type` is + * assignable from `bool` + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param view Device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ +template +__global__ void contains( + InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) +{ + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; + __shared__ bool writeBuffer[block_size]; + + while (first + key_idx < last) { + auto key = *(first + key_idx); + auto found = view.contains(tile, key, hash, key_equal); + + /* + * The ld.relaxed.gpu instruction used in view.contains causes L1 to + * flush more frequently, causing increased sector stores from L2 to global memory. + * By writing results to shared memory and then synchronizing before writing back + * to global, we no longer rely on L1, preventing the increase in sector stores from + * L2 to global and improving performance. 
+ */ + if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found; } + __syncthreads(); + if (tile.thread_rank() == 0) { + *(output_begin + key_idx) = writeBuffer[threadIdx.x / tile_size]; + } + key_idx += (gridDim.x * blockDim.x) / tile_size; + } +} + +} // namespace detail +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh new file mode 100644 index 000000000..241ef480d --- /dev/null +++ b/include/cuco/static_reduction_map.cuh @@ -0,0 +1,929 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#ifndef CUDART_VERSION +#error CUDART_VERSION Undefined! +#elif (CUDART_VERSION >= 11000) // including with CUDA 10.2 leads to compilation errors +#include +#endif + +#include +#include +#include +#include + +namespace cuco { + +/** + * @brief Possible reduction operations that can be performed by a `static_reduction_map`. + * + * `GENERIC` allows for any associative binary reduction operation, but may have worse performance + * compared to one of the native operations. 
+ * + */ +enum class reduction_op { + SUM, ///< Addition + SUB, ///< Subtraction + MIN, ///< Minimum value + MAX, ///< Maximum value + AND, ///< Bitwise AND + OR, ///< Bitwise OR + XOR, ///< Bitwise XOR + GENERIC ///< User-defined, associative binary operation +}; + +/** + * @brief A GPU-accelerated, unordered, associative container of key-value + * pairs with unique keys. + * + * Allows constant time concurrent inserts or concurrent find operations (not + * concurrent insert and find) from threads in device code. + * + * Current limitations: + * - Requires keys that are Arithmetic + * - Does not support erasing keys + * - Capacity is fixed and will not grow automatically + * - Requires the user to specify sentinel values for both key and mapped value + * to indicate empty slots + * - Does not support concurrent insert and find operations + * + * The `static_reduction_map` supports two types of operations: + * - Host-side "bulk" operations + * - Device-side "singular" operations + * + * The host-side bulk operations include `insert`, `find`, and `contains`. These + * APIs should be used when there are a large number of keys to insert or look up + * in the map. For example, given a range of keys specified by device-accessible + * iterators, the bulk `insert` function will insert all keys into the map. + * + * The singular device-side operations allow individual threads to perform + * independent insert or find/contains operations from device code. These + * operations are accessed through non-owning, trivially copyable "view" types: + * `device_view` and `device_mutable_view`. The `device_view` class is an + * immutable view that allows only non-modifying operations such as `find` or + * `contains`. The `device_mutable_view` class only allows `insert` operations. + * The two types are separate to prevent erroneous concurrent insert/find + * operations. 
+ * + * Example: + * \code{.cpp} + * int empty_key_sentinel = -1; + * int empty_value_sentinel = -1; + * + * // Constructs a map with 100,000 slots using -1 and -1 as the empty key/value + * // sentinels. Note the capacity is chosen knowing we will insert 50,000 keys, + * // for a load factor of 50%. + * static_reduction_map m{100'000, empty_key_sentinel, empty_value_sentinel}; + * + * // Create a sequence of pairs {{0,0}, {1,1}, ... {i,i}} + * thrust::device_vector<thrust::pair<int, int>> pairs(50'000); + * thrust::transform(thrust::make_counting_iterator(0), + * thrust::make_counting_iterator(pairs.size()), + * pairs.begin(), + * []__device__(auto i){ return thrust::make_pair(i,i); }); + * + * + * // Inserts all pairs into the map + * m.insert(pairs.begin(), pairs.end()); + * + * // Gets a `device_view` and passes it to a kernel where threads may perform + * // `find/contains` lookups + * kernel<<<...>>>(m.get_device_view()); + * \endcode + * + * + * @tparam Key Arithmetic type used for key + * @tparam Value Type of the mapped values + * @tparam Scope The scope in which insert/find operations will be performed by + * individual threads. 
+ * @tparam Allocator Type of allocator used for device storage + */ +template > +class static_reduction_map { + static_assert(std::is_arithmetic::value, "Unsupported, non-arithmetic key type."); + + public: + using value_type = cuco::pair_type; + using key_type = Key; + using mapped_type = Value; + using atomic_key_type = cuda::atomic; + using atomic_mapped_type = cuda::atomic; + using pair_atomic_type = cuco::pair_type; + using atomic_ctr_type = cuda::atomic; + using allocator_type = Allocator; + using slot_allocator_type = + typename std::allocator_traits::rebind_alloc; + + static_reduction_map(static_reduction_map const&) = delete; + static_reduction_map(static_reduction_map&&) = delete; + static_reduction_map& operator=(static_reduction_map const&) = delete; + static_reduction_map& operator=(static_reduction_map&&) = delete; + + /** + * @brief Construct a fixed-size map with the specified capacity (number of + * slots) and sentinel values. + * + * The capacity of the map is fixed. Insert operations will not automatically + * grow the map. Attempting to insert more unique keys than the capacity of + * the map results in undefined behavior. + * + * Performance begins to degrade significantly beyond a load factor of ~70%. + * For best performance, choose a capacity that will keep the load factor + * below 70%. E.g., if inserting `N` unique keys, choose a capacity of + * `N * (1/0.7)`. + * + * The `empty_key_sentinel` and `empty_value_sentinel` values are reserved and + * undefined behavior results from attempting to insert any key/value pair + * that contains either. 
+ * + * @param capacity The total number of slots in the map + * @param empty_key_sentinel The reserved key value for empty slots + * @param empty_value_sentinel The reserved mapped value for empty slots + * @param alloc Allocator used for allocating device storage + */ + static_reduction_map(std::size_t capacity, + Key empty_key_sentinel, + Value empty_value_sentinel, + Allocator const& alloc = Allocator{}); + + /** + * @brief Destroys the map and frees its contents. + * + */ + ~static_reduction_map(); + + /** + * @brief Inserts all key/value pairs in the range `[first, last)`. + * + * If multiple keys in `[first, last)` compare equal, it is unspecified which + * element is inserted. + * + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `value_type` + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ + template , + typename KeyEqual = thrust::equal_to> + void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}); + + /** + * @brief Finds the values corresponding to all keys in the range `[first, last)`. + * + * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + + * i)`. Else, copies the empty value sentinel. 
+ * + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam OutputIt Device accessible output iterator whose `value_type` is + * convertible to the map's `mapped_type` + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of values retrieved for each key + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ + template , + typename KeyEqual = thrust::equal_to> + void find(InputIt first, + InputIt last, + OutputIt output_begin, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; + + /** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. + * + * Writes a `bool` to `(output_begin + i)` indicating if the key `*(first + i)` exists in the map. 
+ * + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam OutputIt Device accessible output iterator whose `value_type` is + * constructible from `bool` + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ + template , + typename KeyEqual = thrust::equal_to> + void contains(InputIt first, + InputIt last, + OutputIt output_begin, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; + + private: + class device_view_base { + protected: + // Import member type definitions from `static_reduction_map` + using value_type = value_type; + using key_type = Key; + using mapped_type = Value; + using iterator = pair_atomic_type*; + using const_iterator = pair_atomic_type const*; + + private: + pair_atomic_type* slots_{}; ///< Pointer to flat slots storage + std::size_t capacity_{}; ///< Total number of slots + Key empty_key_sentinel_{}; ///< Key value that represents an empty slot + Value empty_value_sentinel_{}; ///< Initial Value of empty slot + + protected: + __host__ __device__ device_view_base(pair_atomic_type* slots, + std::size_t capacity, + Key empty_key_sentinel, + Value empty_value_sentinel) noexcept + : slots_{slots}, + capacity_{capacity}, + empty_key_sentinel_{empty_key_sentinel}, + empty_value_sentinel_{empty_value_sentinel} + { + } + + /** + * @brief Gets slots array. + * + * @return Slots array + */ + __device__ pair_atomic_type* get_slots() noexcept { return slots_; } + + /** + * @brief Gets slots array. 
+ * + * @return Slots array + */ + __device__ pair_atomic_type const* get_slots() const noexcept { return slots_; } + + /** + * @brief Returns the initial slot for a given key `k` + * + * @tparam Hash Unary callable type + * @param k The key to get the slot for + * @param hash The unary callable used to hash the key + * @return Pointer to the initial slot for `k` + */ + template + __device__ iterator initial_slot(Key const& k, Hash hash) noexcept + { + return &slots_[hash(k) % capacity_]; + } + + /** + * @brief Returns the initial slot for a given key `k` + * + * @tparam Hash Unary callable type + * @param k The key to get the slot for + * @param hash The unary callable used to hash the key + * @return Pointer to the initial slot for `k` + */ + template + __device__ const_iterator initial_slot(Key const& k, Hash hash) const noexcept + { + return &slots_[hash(k) % capacity_]; + } + + /** + * @brief Returns the initial slot for a given key `k` + * + * To be used for Cooperative Group based probing. + * + * @tparam CG Cooperative Group type + * @tparam Hash Unary callable type + * @param g the Cooperative Group for which the initial slot is needed + * @param k The key to get the slot for + * @param hash The unary callable used to hash the key + * @return Pointer to the initial slot for `k` + */ + template + __device__ iterator initial_slot(CG g, Key const& k, Hash hash) noexcept + { + return &slots_[(hash(k) + g.thread_rank()) % capacity_]; + } + + /** + * @brief Returns the initial slot for a given key `k` + * + * To be used for Cooperative Group based probing. 
+ * + * @tparam CG Cooperative Group type + * @tparam Hash Unary callable type + * @param g the Cooperative Group for which the initial slot is needed + * @param k The key to get the slot for + * @param hash The unary callable used to hash the key + * @return Pointer to the initial slot for `k` + */ + template + __device__ const_iterator initial_slot(CG g, Key const& k, Hash hash) const noexcept + { + return &slots_[(hash(k) + g.thread_rank()) % capacity_]; + } + + /** + * @brief Given a slot `s`, returns the next slot. + * + * If `s` is the last slot, wraps back around to the first slot. + * + * @param s The slot to advance + * @return The next slot after `s` + */ + __device__ iterator next_slot(iterator s) noexcept { return (++s < end()) ? s : begin_slot(); } + + /** + * @brief Given a slot `s`, returns the next slot. + * + * If `s` is the last slot, wraps back around to the first slot. + * + * @param s The slot to advance + * @return The next slot after `s` + */ + __device__ const_iterator next_slot(const_iterator s) const noexcept + { + return (++s < end()) ? s : begin_slot(); + } + + /** + * @brief Given a slot `s`, returns the slot `g.size()` positions after it. + * + * The new position is taken modulo the capacity, so advancing past the last + * slot wraps back around to the front of the slot array. The offset of `s` is + * held in a `std::size_t` (the type of the capacity) so that it is not + * truncated for maps with more than 2^32 slots. To be used for Cooperative + * Group based probing. + * + * @tparam CG The Cooperative Group type + * @param g The Cooperative Group for which the next slot is needed + * @param s The slot to advance + * @return The slot `g.size()` positions after `s`, modulo the capacity + */ + template + __device__ iterator next_slot(CG g, iterator s) noexcept + { + std::size_t index = s - slots_; + return &slots_[(index + g.size()) % capacity_]; + } + + /** + * @brief Given a slot `s`, returns the slot `g.size()` positions after it. + * + * The new position is taken modulo the capacity, so advancing past the last + * slot wraps back around to the front of the slot array. The offset of `s` is + * held in a `std::size_t` (the type of the capacity) so that it is not + * truncated for maps with more than 2^32 slots. To be used for Cooperative + * Group based probing. 
+ * + * @tparam CG The Cooperative Group type + * @param g The Cooperative Group for which the next slot is needed + * @param s The slot to advance + * @return The slot `g.size()` positions after `s`, modulo the capacity + */ + template + __device__ const_iterator next_slot(CG g, const_iterator s) const noexcept + { + std::size_t index = s - slots_; + return &slots_[(index + g.size()) % capacity_]; + } + + public: + /** + * @brief Gets the maximum number of elements the hash map can hold. + * + * @return The maximum number of elements the hash map can hold + */ + __host__ __device__ std::size_t get_capacity() const noexcept { return capacity_; } + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + __host__ __device__ Key get_empty_key_sentinel() const noexcept { return empty_key_sentinel_; } + + /** + * @brief Gets the sentinel value used to represent an empty value slot. + * + * @return The sentinel value used to represent an empty value slot + */ + __host__ __device__ Value get_empty_value_sentinel() const noexcept + { + return empty_value_sentinel_; + } + + /** + * @brief Returns iterator to the first slot. + * + * @note Unlike `std::map::begin()`, the `begin_slot()` iterator does _not_ point to the first + * occupied slot. Instead, it refers to the first slot in the array of contiguous slot storage. + * Iterating from `begin_slot()` to `end_slot()` will iterate over all slots, including those + * both empty and filled. + * + * There is no `begin()` iterator to avoid confusion as it is not possible to provide an + * iterator over only the filled slots. + * + * @return Iterator to the first slot + */ + __device__ iterator begin_slot() noexcept { return slots_; } + + /** + * @brief Returns iterator to the first slot. + * + * @note Unlike `std::map::begin()`, the `begin_slot()` iterator does _not_ point to the first + * occupied slot. 
Instead, it refers to the first slot in the array of contiguous slot storage. + * Iterating from `begin_slot()` to `end_slot()` will iterate over all slots, including those + * both empty and filled. + * + * There is no `begin()` iterator to avoid confusion as it is not possible to provide an + * iterator over only the filled slots. + * + * @return Iterator to the first slot + */ + __device__ const_iterator begin_slot() const noexcept { return slots_; } + + /** + * @brief Returns a const_iterator to one past the last slot. + * + * @return A const_iterator to one past the last slot + */ + __host__ __device__ const_iterator end_slot() const noexcept { return slots_ + capacity_; } + + /** + * @brief Returns an iterator to one past the last slot. + * + * @return An iterator to one past the last slot + */ + __host__ __device__ iterator end_slot() noexcept { return slots_ + capacity_; } + + /** + * @brief Returns a const_iterator to one past the last slot. + * + * `end()` calls `end_slot()` and is provided for convenience for those familiar with checking + * an iterator returned from `find()` against the `end()` iterator. + * + * @return A const_iterator to one past the last slot + */ + __host__ __device__ const_iterator end() const noexcept { return end_slot(); } + + /** + * @brief Returns an iterator to one past the last slot. + * + * `end()` calls `end_slot()` and is provided for convenience for those familiar with checking + * an iterator returned from `find()` against the `end()` iterator. + * + * @return An iterator to one past the last slot + */ + __host__ __device__ iterator end() noexcept { return end_slot(); } + }; + + public: + /** + * @brief Mutable, non-owning view-type that may be used in device code to + * perform singular inserts into the map. + * + * `device_mutable_view` is trivially-copyable and is intended to be passed by + * value. 
+ * + * Example: + * \code{.cpp} + * cuco::static_reduction_map m{100'000, -1, -1}; + * + * // Inserts a sequence of pairs {{0,0}, {1,1}, ... {i,i}} + * thrust::for_each(thrust::make_counting_iterator(0), + * thrust::make_counting_iterator(50'000), + * [map = m.get_device_mutable_view()] + * __device__ (auto i) mutable { + * map.insert(thrust::make_pair(i,i)); + * }); + * \endcode + */ + class device_mutable_view : public device_view_base { + public: + using value_type = typename device_view_base::value_type; + using key_type = typename device_view_base::key_type; + using mapped_type = typename device_view_base::mapped_type; + using iterator = typename device_view_base::iterator; + using const_iterator = typename device_view_base::const_iterator; + /** + * @brief Construct a mutable view of the first `capacity` slots of the + * slots array pointed to by `slots`. + * + * @param slots Pointer to beginning of initialized slots array + * @param capacity The number of slots viewed by this object + * @param empty_key_sentinel The reserved value for keys to represent empty + * slots + * @param empty_value_sentinel The reserved value for mapped values to + * represent empty slots + */ + __host__ __device__ device_mutable_view(pair_atomic_type* slots, + std::size_t capacity, + Key empty_key_sentinel, + Value empty_value_sentinel) noexcept + : device_view_base{slots, capacity, empty_key_sentinel, empty_value_sentinel} + { + } + + /** + * @brief Inserts the specified key/value pair into the map. + * + * Returns a `bool` denoting whether the insertion took place. 
+ * + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param insert_pair The pair to insert + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys for + * equality + * @return `true` if the insert was successful, `false` otherwise. + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ bool insert(value_type const& insert_pair, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; + /** + * @brief Inserts the specified key/value pair into the map. + * + * Returns a `bool` denoting whether the insertion took place. Uses the CUDA + * Cooperative Groups API to leverage multiple threads to perform a single + * insert. This provides a significant boost in throughput compared to the + * non Cooperative Group `insert` at moderate to high load factors. + * + * @tparam CG Cooperative Group type + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * + * @param g The Cooperative Group that performs the insert + * @param insert_pair The pair to insert + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys for + * equality + * @return `true` if the insert was successful, `false` otherwise. + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ bool insert(CG g, + value_type const& insert_pair, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; + + }; // class device mutable view + + /** + * @brief Non-owning view-type that may be used in device code to + * perform singular find and contains operations for the map. + * + * `device_view` is trivially-copyable and is intended to be passed by + * value. 
+ * + */ + class device_view : public device_view_base { + public: + using value_type = typename device_view_base::value_type; + using key_type = typename device_view_base::key_type; + using mapped_type = typename device_view_base::mapped_type; + using iterator = typename device_view_base::iterator; + using const_iterator = typename device_view_base::const_iterator; + /** + * @brief Construct a view of the first `capacity` slots of the + * slots array pointed to by `slots`. + * + * @param slots Pointer to beginning of initialized slots array + * @param capacity The number of slots viewed by this object + * @param empty_key_sentinel The reserved value for keys to represent empty + * slots + * @param empty_value_sentinel The reserved value for mapped values to + * represent empty slots + */ + __host__ __device__ device_view(pair_atomic_type* slots, + std::size_t capacity, + Key empty_key_sentinel, + Value empty_value_sentinel) noexcept + : device_view_base{slots, capacity, empty_key_sentinel, empty_value_sentinel} + { + } + + /** + * @brief Makes a copy of given `device_view` using non-owned memory. + * + * This function is intended to be used to create shared memory copies of small static maps, + * although global memory can be used as well. 
+ * + * Example: + * @code{.cpp} + * template + * __global__ void use_device_view(const typename MapType::device_view device_view, + * map_key_t const* const keys_to_search, + * map_value_t* const values_found, + * const size_t number_of_elements) + * { + * const size_t index = blockIdx.x * blockDim.x + threadIdx.x; + * + * __shared__ typename MapType::pair_atomic_type sm_buffer[CAPACITY]; + * + * auto g = cg::this_thread_block(); + * + * const typename MapType::device_view sm_static_reduction_map = + * MapType::device_view::make_copy(g, sm_buffer, device_view); + * + * for (size_t i = g.thread_rank(); i < number_of_elements; i += g.size()) + * { + * values_found[i] = sm_static_reduction_map.find(keys_to_search[i])->second; + * } + * } + * @endcode + * + * @tparam CG The type of the cooperative thread group + * @param g The cooperative thread group used to copy the slots + * @param memory_to_use Array large enough to support `capacity` elements. Object does not take + * the ownership of the memory + * @param source_device_view `device_view` to copy from + * @return Copy of passed `device_view` + */ + template + __device__ static device_view make_copy(CG g, + pair_atomic_type* const memory_to_use, + device_view source_device_view) noexcept + { +#ifndef CUDART_VERSION +#error CUDART_VERSION Undefined! 
+#elif (CUDART_VERSION >= 11000) + __shared__ cuda::barrier barrier; + if (g.thread_rank() == 0) { init(&barrier, g.size()); } + g.sync(); + + cuda::memcpy_async(g, + memory_to_use, + source_device_view.get_slots(), + sizeof(pair_atomic_type) * source_device_view.get_capacity(), + barrier); + + barrier.arrive_and_wait(); +#else + pair_atomic_type const* const slots_ptr = source_device_view.get_slots(); + for (std::size_t i = g.thread_rank(); i < source_device_view.get_capacity(); i += g.size()) { + new (&memory_to_use[i].first) + atomic_key_type{slots_ptr[i].first.load(cuda::memory_order_relaxed)}; + new (&memory_to_use[i].second) + atomic_mapped_type{slots_ptr[i].second.load(cuda::memory_order_relaxed)}; + } + g.sync(); +#endif + + return device_view(memory_to_use, + source_device_view.get_capacity(), + source_device_view.get_empty_key_sentinel(), + source_device_view.get_empty_value_sentinel()); + } + + /** + * @brief Finds the value corresponding to the key `k`. + * + * Returns an iterator to the pair whose key is equivalent to `k`. + * If no such pair exists, returns `end()`. + * + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param k The key to search for + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys + * for equality + * @return An iterator to the position at which the key/value pair + * containing `k` was inserted + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ iterator find(Key const& k, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; + + /** @brief Finds the value corresponding to the key `k`. + * + * Returns a const_iterator to the pair whose key is equivalent to `k`. + * If no such pair exists, returns `end()`. 
+ * + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param k The key to search for + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys + * for equality + * @return An iterator to the position at which the key/value pair + * containing `k` was inserted + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ const_iterator find(Key const& k, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) const noexcept; + + /** + * @brief Finds the value corresponding to the key `k`. + * + * Returns an iterator to the pair whose key is equivalent to `k`. + * If no such pair exists, returns `end()`. Uses the CUDA Cooperative Groups API + * to leverage multiple threads to perform a single find. This provides a + * significant boost in throughput compared to the non Cooperative Group + * `find` at moderate to high load factors. + * + * @tparam CG Cooperative Group type + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param g The Cooperative Group used to perform the find + * @param k The key to search for + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys + * for equality + * @return An iterator to the position at which the key/value pair + * containing `k` was inserted + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ iterator + find(CG g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; + + /** + * @brief Finds the value corresponding to the key `k`. + * + * Returns a const_iterator to the pair whose key is equivalent to `k`. + * If no such pair exists, returns `end()`. Uses the CUDA Cooperative Groups API + * to leverage multiple threads to perform a single find. This provides a + * significant boost in throughput compared to the non Cooperative Group + * `find` at moderate to high load factors. 
+ * + * @tparam CG Cooperative Group type + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param g The Cooperative Group used to perform the find + * @param k The key to search for + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys + * for equality + * @return An iterator to the position at which the key/value pair + * containing `k` was inserted + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ const_iterator + find(CG g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) const noexcept; + + /** + * @brief Indicates whether the key `k` was inserted into the map. + * + * If the key `k` was inserted into the map, `contains` returns + * true. Otherwise, it returns false. + * + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param k The key to search for + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys + * for equality + * @return A boolean indicating whether the key/value pair + * containing `k` was inserted + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ bool contains(Key const& k, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; + + /** + * @brief Indicates whether the key `k` was inserted into the map. + * + * If the key `k` was inserted into the map, `contains` returns + * true. Otherwise, it returns false. Uses the CUDA Cooperative Groups API + * to leverage multiple threads to perform a single contains operation. This provides a + * significant boost in throughput compared to the non Cooperative Group + * `contains` at moderate to high load factors. 
+ * + * @tparam CG Cooperative Group type + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param g The Cooperative Group used to perform the contains operation + * @param k The key to search for + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys + * for equality + * @return A boolean indicating whether the key/value pair + * containing `k` was inserted + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ bool contains(CG g, + Key const& k, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; + }; // class device_view + + /** + * @brief Gets the maximum number of elements the hash map can hold. + * + * @return The maximum number of elements the hash map can hold + */ + std::size_t get_capacity() const noexcept { return capacity_; } + + /** + * @brief Gets the number of elements in the hash map. + * + * @return The number of elements in the map + */ + std::size_t get_size() const noexcept { return size_; } + + /** + * @brief Gets the load factor of the hash map. + * + * @return The load factor of the hash map + */ + float get_load_factor() const noexcept { return static_cast(size_) / capacity_; } + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + Key get_empty_key_sentinel() const noexcept { return empty_key_sentinel_; } + + /** + * @brief Gets the sentinel value used to represent an empty value slot. + * + * @return The sentinel value used to represent an empty value slot + */ + Value get_empty_value_sentinel() const noexcept { return empty_value_sentinel_; } + + /** + * @brief Constructs a device_view object based on the members of the `static_reduction_map` + * object. 
+ * + * @return A device_view object based on the members of the `static_reduction_map` object + */ + device_view get_device_view() const noexcept + { + return device_view(slots_, capacity_, empty_key_sentinel_, empty_value_sentinel_); + } + + /** + * @brief Constructs a device_mutable_view object based on the members of the + * `static_reduction_map` object + * + * @return A device_mutable_view object based on the members of the `static_reduction_map` object + */ + device_mutable_view get_device_mutable_view() const noexcept + { + return device_mutable_view(slots_, capacity_, empty_key_sentinel_, empty_value_sentinel_); + } + + private: + pair_atomic_type* slots_{nullptr}; ///< Pointer to flat slots storage + std::size_t capacity_{}; ///< Total number of slots + std::size_t size_{}; ///< Number of keys in map + Key empty_key_sentinel_{}; ///< Key value that represents an empty slot + Value empty_value_sentinel_{}; ///< Initial value of empty slot + atomic_ctr_type* num_successes_{}; ///< Number of successfully inserted keys on insert + slot_allocator_type slot_allocator_{}; ///< Allocator used to allocate slots +}; +} // namespace cuco + +#include \ No newline at end of file diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 66c1682ed..32d77b2a8 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -48,13 +48,9 @@ endfunction(ConfigureTest) ################################################################################################### ### test sources ################################################################################## ################################################################################################### -set(STATIC_MAP_TEST_SRC - "${CMAKE_CURRENT_SOURCE_DIR}/static_map/static_map_test.cu") -ConfigureTest(STATIC_MAP_TEST "${STATIC_MAP_TEST_SRC}") -#################################################################################################### -set(DYNAMIC_MAP_TEST_SRC - 
"${CMAKE_CURRENT_SOURCE_DIR}/dynamic_map/dynamic_map_test.cu") +ConfigureTest(STATIC_MAP_TEST "${CMAKE_CURRENT_SOURCE_DIR}/static_map/static_map_test.cu") -ConfigureTest(DYNAMIC_MAP_TEST "${DYNAMIC_MAP_TEST_SRC}") -#################################################################################################### \ No newline at end of file +ConfigureTest(STATIC_REDUCTION_MAP_TEST "${CMAKE_CURRENT_SOURCE_DIR}/static_reduction_map/static_reduction_map_test.cu") + +ConfigureTest(DYNAMIC_MAP_TEST "${CMAKE_CURRENT_SOURCE_DIR}/dynamic_map/dynamic_map_test.cu") \ No newline at end of file diff --git a/tests/static_reduction_map/static_reduction_map_test.cu b/tests/static_reduction_map/static_reduction_map_test.cu new file mode 100644 index 000000000..d69d581fc --- /dev/null +++ b/tests/static_reduction_map/static_reduction_map_test.cu @@ -0,0 +1,355 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +namespace { +namespace cg = cooperative_groups; + +// Thrust logical algorithms (any_of/all_of/none_of) don't work with device +// lambdas: See https://github.com/thrust/thrust/issues/1062 + +/** + * @brief Returns `true` if `p` holds for every element of `[begin, end)`, i.e. + * the number of elements matching `p` equals the size of the range. + */ +template +bool all_of(Iterator begin, Iterator end, Predicate p) +{ + auto size = thrust::distance(begin, end); + return size == thrust::count_if(begin, end, p); +} + +/** + * @brief Returns `true` if `p` holds for at least one element of `[begin, end)`. + */ +template +bool any_of(Iterator begin, Iterator end, Predicate p) +{ + return thrust::count_if(begin, end, p) > 0; +} + +/** + * @brief Returns `true` if `p` holds for no element of `[begin, end)`. + * + * This is the negation of `any_of`, not of `all_of`: negating `all_of` would + * already report `true` when only a single element fails `p`. + */ +template +bool none_of(Iterator begin, Iterator end, Predicate p) +{ + return not any_of(begin, end, p); +} +} // namespace + +enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; + +/** + * @brief Fills `[output_begin, output_end)` with keys chosen according to `Dist`. + * + * `UNIQUE` writes the element index `i` at position `i`; `UNIFORM` writes the + * absolute value of the (cast) output of a `std::mt19937` seeded from + * `std::random_device`; `GAUSSIAN` writes the absolute value of the (cast) + * samples of a normal distribution with mean 1e9 and standard deviation 1e7. + */ +template +static void generate_keys(OutputIt output_begin, OutputIt output_end) +{ + auto num_keys = std::distance(output_begin, output_end); + + std::random_device rd; + std::mt19937 gen{rd()}; + + switch (Dist) { + case dist_type::UNIQUE: + for (auto i = 0; i < num_keys; ++i) { + output_begin[i] = i; + } + break; + case dist_type::UNIFORM: + for (auto i = 0; i < num_keys; ++i) { + output_begin[i] = std::abs(static_cast(gen())); + } + break; + case dist_type::GAUSSIAN: + std::normal_distribution<> dg{1e9, 1e7}; + for (auto i = 0; i < num_keys; ++i) { + output_begin[i] = std::abs(static_cast(dg(gen))); + } + break; + } +} + +TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", + "", + ((typename T, dist_type Dist), T, Dist), + (int32_t, dist_type::UNIQUE), + (int64_t, dist_type::UNIQUE), + (int32_t, dist_type::UNIFORM), + (int64_t, dist_type::UNIFORM), + (int32_t, dist_type::GAUSSIAN), + (int64_t, dist_type::GAUSSIAN)) +{ + using Key = T; + using Value = T; + + constexpr std::size_t num_keys{50'000'000}; + cuco::static_reduction_map map{100'000'000, -1, -1}; + + auto m_view = map.get_device_mutable_view(); + auto view = map.get_device_view(); + + std::vector h_keys(num_keys); + std::vector h_values(num_keys); + std::vector> h_pairs(num_keys); 
+ + generate_keys(h_keys.begin(), h_keys.end()); + + for (auto i = 0; i < num_keys; ++i) { + Key key = h_keys[i]; + Value val = h_keys[i]; + h_pairs[i].first = key; + h_pairs[i].second = val; + h_values[i] = val; + } + + thrust::device_vector d_keys(h_keys); + thrust::device_vector d_values(h_values); + thrust::device_vector> d_pairs(h_pairs); + thrust::device_vector d_results(num_keys); + thrust::device_vector d_contained(num_keys); + + // bulk function test cases + SECTION("All inserted keys-value pairs should be correctly recovered during find") + { + map.insert(d_pairs.begin(), d_pairs.end()); + map.find(d_keys.begin(), d_keys.end(), d_results.begin()); + auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), d_values.begin())); + + REQUIRE(all_of(zip, zip + num_keys, [] __device__(auto const& p) { + return thrust::get<0>(p) == thrust::get<1>(p); + })); + } + + SECTION("All inserted keys-value pairs should be contained") + { + map.insert(d_pairs.begin(), d_pairs.end()); + map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); + + REQUIRE( + all_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); + } + + SECTION("Non-inserted keys-value pairs should not be contained") + { + map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); + + REQUIRE( + none_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); + } + + SECTION("Inserting unique keys should return insert success.") + { + if (Dist == dist_type::UNIQUE) { + REQUIRE(all_of(d_pairs.begin(), + d_pairs.end(), + [m_view] __device__(cuco::pair_type const& pair) mutable { + return m_view.insert(pair); + })); + } + } + + SECTION("Cannot find any key in an empty hash map with non-const view") + { + SECTION("non-const view") + { + REQUIRE(all_of(d_pairs.begin(), + d_pairs.end(), + [view] __device__(cuco::pair_type const& pair) mutable { + return view.find(pair.first) == view.end(); + })); + } + SECTION("const 
view") + { + REQUIRE(all_of( + d_pairs.begin(), d_pairs.end(), [view] __device__(cuco::pair_type const& pair) { + return view.find(pair.first) == view.end(); + })); + } + } + + SECTION("Keys are all found after inserting many keys.") + { + // Bulk insert keys + thrust::for_each(thrust::device, + d_pairs.begin(), + d_pairs.end(), + [m_view] __device__(cuco::pair_type const& pair) mutable { + m_view.insert(pair); + }); + + SECTION("non-const view") + { + // All keys should be found + REQUIRE(all_of(d_pairs.begin(), + d_pairs.end(), + [view] __device__(cuco::pair_type const& pair) mutable { + auto const found = view.find(pair.first); + return (found != view.end()) and (found->first.load() == pair.first and + found->second.load() == pair.second); + })); + } + SECTION("const view") + { + // All keys should be found + REQUIRE(all_of( + d_pairs.begin(), d_pairs.end(), [view] __device__(cuco::pair_type const& pair) { + auto const found = view.find(pair.first); + return (found != view.end()) and + (found->first.load() == pair.first and found->second.load() == pair.second); + })); + } + } +} + +template +__global__ void shared_memory_test_kernel( + typename MapType::device_view const* const device_views, + typename MapType::device_view::key_type const* const insterted_keys, + typename MapType::device_view::mapped_type const* const inserted_values, + const size_t number_of_elements, + bool* const keys_exist, + bool* const keys_and_values_correct) +{ + // Each block processes one map + const size_t map_id = blockIdx.x; + const size_t offset = map_id * number_of_elements; + + __shared__ typename MapType::pair_atomic_type sm_buffer[CAPACITY]; + + auto g = cg::this_thread_block(); + typename MapType::device_view sm_device_view = + MapType::device_view::make_copy(g, sm_buffer, device_views[map_id]); + + for (int i = g.thread_rank(); i < number_of_elements; i += g.size()) { + auto found_pair_it = sm_device_view.find(insterted_keys[offset + i]); + + if (found_pair_it != 
sm_device_view.end()) { + keys_exist[offset + i] = true; + if (found_pair_it->first == insterted_keys[offset + i] and + found_pair_it->second == inserted_values[offset + i]) { + keys_and_values_correct[offset + i] = true; + } else { + keys_and_values_correct[offset + i] = false; + } + } else { + keys_exist[offset + i] = false; + keys_and_values_correct[offset + i] = true; + } + } +} + +TEMPLATE_TEST_CASE_SIG("Shared memory static map", + "", + ((typename T, dist_type Dist), T, Dist), + (int32_t, dist_type::UNIQUE), + (int64_t, dist_type::UNIQUE), + (int32_t, dist_type::UNIFORM), + (int64_t, dist_type::UNIFORM), + (int32_t, dist_type::GAUSSIAN), + (int64_t, dist_type::GAUSSIAN)) +{ + using KeyType = T; + using ValueType = T; + using MapType = cuco::static_reduction_map; + using DeviceViewType = typename MapType::device_view; + using DeviceViewIteratorType = typename DeviceViewType::iterator; + + constexpr std::size_t number_of_maps = 1000; + constexpr std::size_t elements_in_map = 500; + constexpr std::size_t map_capacity = 2 * elements_in_map; + + // one array for all maps, first elements_in_map element belong to map 0, second to map 1 and so + // on + std::vector h_keys(number_of_maps * elements_in_map); + std::vector h_values(number_of_maps * elements_in_map); + std::vector> h_pairs(number_of_maps * elements_in_map); + + // using std::unique_ptr because static_reduction_map does not have copy/move + // constructor/assignment operator yet + std::vector> maps; + + for (std::size_t map_id = 0; map_id < number_of_maps; ++map_id) { + const std::size_t offset = map_id * elements_in_map; + + generate_keys(h_keys.begin() + offset, + h_keys.begin() + offset + elements_in_map); + + for (std::size_t i = 0; i < elements_in_map; ++i) { + KeyType key = h_keys[offset + i]; + ValueType val = key < std::numeric_limits::max() ? 
key + 1 : 0; + h_values[offset + i] = val; + h_pairs[offset + i].first = key; + h_pairs[offset + i].second = val; + } + + maps.push_back(std::make_unique(map_capacity, -1, -1)); + } + + thrust::device_vector d_keys(h_keys); + thrust::device_vector d_values(h_values); + thrust::device_vector> d_pairs(h_pairs); + + SECTION("Keys are all found after insertion.") + { + std::vector h_device_views; + for (std::size_t map_id = 0; map_id < number_of_maps; ++map_id) { + const std::size_t offset = map_id * elements_in_map; + + MapType* map = maps[map_id].get(); + map->insert(d_pairs.begin() + offset, d_pairs.begin() + offset + elements_in_map); + h_device_views.push_back(map->get_device_view()); + } + thrust::device_vector d_device_views(h_device_views); + + thrust::device_vector d_keys_exist(number_of_maps * elements_in_map); + thrust::device_vector d_keys_and_values_correct(number_of_maps * elements_in_map); + + shared_memory_test_kernel + <<>>(d_device_views.data().get(), + d_keys.data().get(), + d_values.data().get(), + elements_in_map, + d_keys_exist.data().get(), + d_keys_and_values_correct.data().get()); + + REQUIRE(d_keys_exist.size() == d_keys_and_values_correct.size()); + auto zip = thrust::make_zip_iterator( + thrust::make_tuple(d_keys_exist.begin(), d_keys_and_values_correct.begin())); + + REQUIRE(all_of(zip, zip + d_keys_exist.size(), [] __device__(auto const& z) { + return thrust::get<0>(z) and thrust::get<1>(z); + })); + } + + SECTION("No key is found before insertion.") + { + std::vector h_device_views; + for (std::size_t map_id = 0; map_id < number_of_maps; ++map_id) { + h_device_views.push_back(maps[map_id].get()->get_device_view()); + } + thrust::device_vector d_device_views(h_device_views); + + thrust::device_vector d_keys_exist(number_of_maps * elements_in_map); + thrust::device_vector d_keys_and_values_correct(number_of_maps * elements_in_map); + + shared_memory_test_kernel + <<>>(d_device_views.data().get(), + d_keys.data().get(), + 
d_values.data().get(), + elements_in_map, + d_keys_exist.data().get(), + d_keys_and_values_correct.data().get()); + + REQUIRE(none_of(d_keys_exist.begin(), d_keys_exist.end(), [] __device__(const bool key_found) { + return key_found; + })); + } +} \ No newline at end of file From fe606cd60d27b645d2c551fb607652658c204c41 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Mon, 4 Jan 2021 14:57:20 -0600 Subject: [PATCH 02/69] Add template parameter for reduction binary op. --- include/cuco/detail/static_reduction_map.inl | 172 +++++++----- include/cuco/static_reduction_map.cuh | 54 ++-- .../static_reduction_map_test.cu | 263 +----------------- 3 files changed, 134 insertions(+), 355 deletions(-) diff --git a/include/cuco/detail/static_reduction_map.inl b/include/cuco/detail/static_reduction_map.inl index 243032f6b..be28e0f28 100644 --- a/include/cuco/detail/static_reduction_map.inl +++ b/include/cuco/detail/static_reduction_map.inl @@ -26,14 +26,17 @@ enum class insert_result { DUPLICATE ///< Insert did not succeed, key is already present }; -template -static_reduction_map::static_reduction_map(std::size_t capacity, - Key empty_key_sentinel, - Value empty_value_sentinel, - Allocator const& alloc) +template +static_reduction_map::static_reduction_map( + std::size_t capacity, Key empty_key_sentinel, ReductionOp reduction_op, Allocator const& alloc) : capacity_{capacity}, empty_key_sentinel_{empty_key_sentinel}, - empty_value_sentinel_{empty_value_sentinel}, + empty_value_sentinel_{ReductionOp::identity}, + op_{reduction_op}, slot_allocator_{alloc} { slots_ = std::allocator_traits::allocate(slot_allocator_, capacity); @@ -41,25 +44,33 @@ static_reduction_map::static_reduction_map(std::si auto constexpr block_size = 256; auto constexpr stride = 4; auto const grid_size = (capacity + stride * block_size - 1) / (stride * block_size); - detail::initialize - <<>>(slots_, empty_key_sentinel, empty_value_sentinel, capacity); + detail::initialize<<>>( + slots_, 
get_empty_key_sentinel(), get_empty_value_sentinel(), get_capacity()); CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type))); } -template -static_reduction_map::~static_reduction_map() +template +static_reduction_map::~static_reduction_map() { std::allocator_traits::deallocate(slot_allocator_, slots_, capacity_); CUCO_CUDA_TRY(cudaFree(num_successes_)); } -template +template template -void static_reduction_map::insert(InputIt first, - InputIt last, - Hash hash, - KeyEqual key_equal) +void static_reduction_map::insert(InputIt first, + InputIt last, + Hash hash, + KeyEqual key_equal) { auto num_keys = std::distance(first, last); auto const block_size = 128; @@ -80,9 +91,13 @@ void static_reduction_map::insert(InputIt first, size_ += num_successes_->load(cuda::std::memory_order_relaxed); } -template +template template -void static_reduction_map::find( +void static_reduction_map::find( InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) noexcept { auto num_keys = std::distance(first, last); @@ -97,9 +112,13 @@ void static_reduction_map::find( CUCO_CUDA_TRY(cudaDeviceSynchronize()); } -template +template template -void static_reduction_map::contains( +void static_reduction_map::contains( InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) noexcept { auto num_keys = std::distance(first, last); @@ -114,9 +133,14 @@ void static_reduction_map::contains( CUCO_CUDA_TRY(cudaDeviceSynchronize()); } -template +template template -__device__ bool static_reduction_map::device_mutable_view::insert( +__device__ Value +static_reduction_map::device_mutable_view::insert( value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept { auto current_slot{initial_slot(insert_pair.first, hash)}; @@ -128,26 +152,12 @@ __device__ bool static_reduction_map::device_mutab auto& slot_key = current_slot->first; auto& slot_value = current_slot->second; - bool key_success = + auto const key_success = 
slot_key.compare_exchange_strong(expected_key, insert_pair.first, memory_order_relaxed); - bool value_success = - slot_value.compare_exchange_strong(expected_value, insert_pair.second, memory_order_relaxed); - - if (key_success) { - while (not value_success) { - value_success = - slot_value.compare_exchange_strong(expected_value = this->get_empty_value_sentinel(), - insert_pair.second, - memory_order_relaxed); - } - return true; - } else if (value_success) { - slot_value.store(this->get_empty_value_sentinel(), memory_order_relaxed); - } - // if the key was already inserted by another thread, than this instance is a - // duplicate, so the insert fails - if (key_equal(insert_pair.first, expected_key)) { return false; } + if (key_success or key_equal(insert_pair.first, expected_key)) { + // return do_op{}(slot_value, insert_pair.second); + } // if we couldn't insert the key, but it wasn't a duplicate, then there must // have been some other key there, so we keep looking for a slot @@ -155,9 +165,14 @@ __device__ bool static_reduction_map::device_mutab } } -template +template template -__device__ bool static_reduction_map::device_mutable_view::insert( +__device__ bool +static_reduction_map::device_mutable_view::insert( CG g, value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept { auto current_slot = initial_slot(g, insert_pair.first, hash); @@ -232,12 +247,16 @@ __device__ bool static_reduction_map::device_mutab } } -template +template template -__device__ typename static_reduction_map::device_view::iterator -static_reduction_map::device_view::find(Key const& k, - Hash hash, - KeyEqual key_equal) noexcept +__device__ + typename static_reduction_map::device_view::iterator + static_reduction_map::device_view::find( + Key const& k, Hash hash, KeyEqual key_equal) noexcept { auto current_slot = initial_slot(k, hash); @@ -253,13 +272,16 @@ static_reduction_map::device_view::find(Key const& } } -template +template template -__device__ typename 
static_reduction_map::device_view::const_iterator -static_reduction_map::device_view::find(Key const& k, - Hash hash, - KeyEqual key_equal) const - noexcept +__device__ typename static_reduction_map::device_view:: + const_iterator + static_reduction_map::device_view::find( + Key const& k, Hash hash, KeyEqual key_equal) const noexcept { auto current_slot = initial_slot(k, hash); @@ -275,13 +297,16 @@ static_reduction_map::device_view::find(Key const& } } -template +template template -__device__ typename static_reduction_map::device_view::iterator -static_reduction_map::device_view::find(CG g, - Key const& k, - Hash hash, - KeyEqual key_equal) noexcept +__device__ + typename static_reduction_map::device_view::iterator + static_reduction_map::device_view::find( + CG g, Key const& k, Hash hash, KeyEqual key_equal) noexcept { auto current_slot = initial_slot(g, k, hash); @@ -312,11 +337,16 @@ static_reduction_map::device_view::find(CG g, } } -template +template template -__device__ typename static_reduction_map::device_view::const_iterator -static_reduction_map::device_view::find( - CG g, Key const& k, Hash hash, KeyEqual key_equal) const noexcept +__device__ typename static_reduction_map::device_view:: + const_iterator + static_reduction_map::device_view::find( + CG g, Key const& k, Hash hash, KeyEqual key_equal) const noexcept { auto current_slot = initial_slot(g, k, hash); @@ -349,9 +379,14 @@ static_reduction_map::device_view::find( } } -template +template template -__device__ bool static_reduction_map::device_view::contains( +__device__ bool +static_reduction_map::device_view::contains( Key const& k, Hash hash, KeyEqual key_equal) noexcept { auto current_slot = initial_slot(k, hash); @@ -367,9 +402,14 @@ __device__ bool static_reduction_map::device_view: } } -template +template template -__device__ bool static_reduction_map::device_view::contains( +__device__ bool +static_reduction_map::device_view::contains( CG g, Key const& k, Hash hash, KeyEqual key_equal) 
noexcept { auto current_slot = initial_slot(g, k, hash); diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index 241ef480d..d66c6cf4a 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -38,22 +38,16 @@ namespace cuco { -/** - * @brief Possible reduction operations that can be performed by a `static_reduction_map`. - * - * `GENERIC` allows for any associative binary reduction operation, but may have worse performance - * compared to one of the native operations. - * - */ -enum class reduction_op { - SUM, ///< Addition - SUB, ///< Subtraction - MIN, ///< Minimum value - MAX, ///< Maximum value - AND, ///< Bitwise AND - OR, ///< Bitwise OR - XOR, ///< Bitwise XOR - GENERIC ///< User-defined, associative binary operation +template +struct reduce_add { + using value_type = T; + static constexpr T identity = 0; + + template + T apply(cuda::atomic& slot, T2 const& value) + { + return slot.fetch_add(value); + } }; /** @@ -122,7 +116,8 @@ enum class reduction_op { * individual threads. * @tparam Allocator Type of allocator used for device storage */ -template > @@ -171,8 +166,8 @@ class static_reduction_map { */ static_reduction_map(std::size_t capacity, Key empty_key_sentinel, - Value empty_value_sentinel, - Allocator const& alloc = Allocator{}); + ReductionOp reduction_op = {}, + Allocator const& alloc = Allocator{}); /** * @brief Destroys the map and frees its contents. 
@@ -270,16 +265,18 @@ class static_reduction_map { std::size_t capacity_{}; ///< Total number of slots Key empty_key_sentinel_{}; ///< Key value that represents an empty slot Value empty_value_sentinel_{}; ///< Initial Value of empty slot + ReductionOp op_{}; ///< Binary operation reduction function object protected: __host__ __device__ device_view_base(pair_atomic_type* slots, std::size_t capacity, Key empty_key_sentinel, - Value empty_value_sentinel) noexcept + ReductionOp reduction_op) noexcept : slots_{slots}, capacity_{capacity}, empty_key_sentinel_{empty_key_sentinel}, - empty_value_sentinel_{empty_value_sentinel} + empty_value_sentinel_{ReductionOp::identity}, + op_{reduction_op} { } @@ -552,8 +549,8 @@ class static_reduction_map { __host__ __device__ device_mutable_view(pair_atomic_type* slots, std::size_t capacity, Key empty_key_sentinel, - Value empty_value_sentinel) noexcept - : device_view_base{slots, capacity, empty_key_sentinel, empty_value_sentinel} + ReductionOp reduction_op = {}) noexcept + : device_view_base{slots, capacity, empty_key_sentinel, reduction_op} { } @@ -574,9 +571,9 @@ class static_reduction_map { */ template , typename KeyEqual = thrust::equal_to> - __device__ bool insert(value_type const& insert_pair, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}) noexcept; + __device__ Value insert(value_type const& insert_pair, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; /** * @brief Inserts the specified key/value pair into the map. 
* @@ -637,8 +634,8 @@ class static_reduction_map { __host__ __device__ device_view(pair_atomic_type* slots, std::size_t capacity, Key empty_key_sentinel, - Value empty_value_sentinel) noexcept - : device_view_base{slots, capacity, empty_key_sentinel, empty_value_sentinel} + ReductionOp reduction_op = {}) noexcept + : device_view_base{slots, capacity, empty_key_sentinel, reduction_op} { } @@ -922,6 +919,7 @@ class static_reduction_map { Key empty_key_sentinel_{}; ///< Key value that represents an empty slot Value empty_value_sentinel_{}; ///< Initial value of empty slot atomic_ctr_type* num_successes_{}; ///< Number of successfully inserted keys on insert + ReductionOp op_{}; ///< Binary operation reduction function object slot_allocator_type slot_allocator_{}; ///< Allocator used to allocate slots }; } // namespace cuco diff --git a/tests/static_reduction_map/static_reduction_map_test.cu b/tests/static_reduction_map/static_reduction_map_test.cu index d69d581fc..9d709a6c6 100644 --- a/tests/static_reduction_map/static_reduction_map_test.cu +++ b/tests/static_reduction_map/static_reduction_map_test.cu @@ -90,266 +90,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", using Key = T; using Value = T; - constexpr std::size_t num_keys{50'000'000}; - cuco::static_reduction_map map{100'000'000, -1, -1}; + constexpr std::size_t num_slots{50'000'000}; + cuco::static_reduction_map, Key, Value> map{num_slots, -1}; - auto m_view = map.get_device_mutable_view(); - auto view = map.get_device_view(); - - std::vector h_keys(num_keys); - std::vector h_values(num_keys); - std::vector> h_pairs(num_keys); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - h_values[i] = val; - } - - thrust::device_vector d_keys(h_keys); - thrust::device_vector d_values(h_values); - thrust::device_vector> d_pairs(h_pairs); - thrust::device_vector 
d_results(num_keys); - thrust::device_vector d_contained(num_keys); - - // bulk function test cases - SECTION("All inserted keys-value pairs should be correctly recovered during find") - { - map.insert(d_pairs.begin(), d_pairs.end()); - map.find(d_keys.begin(), d_keys.end(), d_results.begin()); - auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), d_values.begin())); - - REQUIRE(all_of(zip, zip + num_keys, [] __device__(auto const& p) { - return thrust::get<0>(p) == thrust::get<1>(p); - })); - } - - SECTION("All inserted keys-value pairs should be contained") - { - map.insert(d_pairs.begin(), d_pairs.end()); - map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); - - REQUIRE( - all_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); - } - - SECTION("Non-inserted keys-value pairs should not be contained") - { - map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); - - REQUIRE( - none_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); - } - - SECTION("Inserting unique keys should return insert success.") - { - if (Dist == dist_type::UNIQUE) { - REQUIRE(all_of(d_pairs.begin(), - d_pairs.end(), - [m_view] __device__(cuco::pair_type const& pair) mutable { - return m_view.insert(pair); - })); - } - } - - SECTION("Cannot find any key in an empty hash map with non-const view") - { - SECTION("non-const view") - { - REQUIRE(all_of(d_pairs.begin(), - d_pairs.end(), - [view] __device__(cuco::pair_type const& pair) mutable { - return view.find(pair.first) == view.end(); - })); - } - SECTION("const view") - { - REQUIRE(all_of( - d_pairs.begin(), d_pairs.end(), [view] __device__(cuco::pair_type const& pair) { - return view.find(pair.first) == view.end(); - })); - } - } - - SECTION("Keys are all found after inserting many keys.") - { - // Bulk insert keys - thrust::for_each(thrust::device, - d_pairs.begin(), - d_pairs.end(), - [m_view] __device__(cuco::pair_type 
const& pair) mutable { - m_view.insert(pair); - }); - - SECTION("non-const view") - { - // All keys should be found - REQUIRE(all_of(d_pairs.begin(), - d_pairs.end(), - [view] __device__(cuco::pair_type const& pair) mutable { - auto const found = view.find(pair.first); - return (found != view.end()) and (found->first.load() == pair.first and - found->second.load() == pair.second); - })); - } - SECTION("const view") - { - // All keys should be found - REQUIRE(all_of( - d_pairs.begin(), d_pairs.end(), [view] __device__(cuco::pair_type const& pair) { - auto const found = view.find(pair.first); - return (found != view.end()) and - (found->first.load() == pair.first and found->second.load() == pair.second); - })); - } - } } - -template -__global__ void shared_memory_test_kernel( - typename MapType::device_view const* const device_views, - typename MapType::device_view::key_type const* const insterted_keys, - typename MapType::device_view::mapped_type const* const inserted_values, - const size_t number_of_elements, - bool* const keys_exist, - bool* const keys_and_values_correct) -{ - // Each block processes one map - const size_t map_id = blockIdx.x; - const size_t offset = map_id * number_of_elements; - - __shared__ typename MapType::pair_atomic_type sm_buffer[CAPACITY]; - - auto g = cg::this_thread_block(); - typename MapType::device_view sm_device_view = - MapType::device_view::make_copy(g, sm_buffer, device_views[map_id]); - - for (int i = g.thread_rank(); i < number_of_elements; i += g.size()) { - auto found_pair_it = sm_device_view.find(insterted_keys[offset + i]); - - if (found_pair_it != sm_device_view.end()) { - keys_exist[offset + i] = true; - if (found_pair_it->first == insterted_keys[offset + i] and - found_pair_it->second == inserted_values[offset + i]) { - keys_and_values_correct[offset + i] = true; - } else { - keys_and_values_correct[offset + i] = false; - } - } else { - keys_exist[offset + i] = false; - keys_and_values_correct[offset + i] = true; - } - } 
-} - -TEMPLATE_TEST_CASE_SIG("Shared memory static map", - "", - ((typename T, dist_type Dist), T, Dist), - (int32_t, dist_type::UNIQUE), - (int64_t, dist_type::UNIQUE), - (int32_t, dist_type::UNIFORM), - (int64_t, dist_type::UNIFORM), - (int32_t, dist_type::GAUSSIAN), - (int64_t, dist_type::GAUSSIAN)) -{ - using KeyType = T; - using ValueType = T; - using MapType = cuco::static_reduction_map; - using DeviceViewType = typename MapType::device_view; - using DeviceViewIteratorType = typename DeviceViewType::iterator; - - constexpr std::size_t number_of_maps = 1000; - constexpr std::size_t elements_in_map = 500; - constexpr std::size_t map_capacity = 2 * elements_in_map; - - // one array for all maps, first elements_in_map element belong to map 0, second to map 1 and so - // on - std::vector h_keys(number_of_maps * elements_in_map); - std::vector h_values(number_of_maps * elements_in_map); - std::vector> h_pairs(number_of_maps * elements_in_map); - - // using std::unique_ptr because static_reduction_map does not have copy/move - // constructor/assignment operator yet - std::vector> maps; - - for (std::size_t map_id = 0; map_id < number_of_maps; ++map_id) { - const std::size_t offset = map_id * elements_in_map; - - generate_keys(h_keys.begin() + offset, - h_keys.begin() + offset + elements_in_map); - - for (std::size_t i = 0; i < elements_in_map; ++i) { - KeyType key = h_keys[offset + i]; - ValueType val = key < std::numeric_limits::max() ? 
key + 1 : 0; - h_values[offset + i] = val; - h_pairs[offset + i].first = key; - h_pairs[offset + i].second = val; - } - - maps.push_back(std::make_unique(map_capacity, -1, -1)); - } - - thrust::device_vector d_keys(h_keys); - thrust::device_vector d_values(h_values); - thrust::device_vector> d_pairs(h_pairs); - - SECTION("Keys are all found after insertion.") - { - std::vector h_device_views; - for (std::size_t map_id = 0; map_id < number_of_maps; ++map_id) { - const std::size_t offset = map_id * elements_in_map; - - MapType* map = maps[map_id].get(); - map->insert(d_pairs.begin() + offset, d_pairs.begin() + offset + elements_in_map); - h_device_views.push_back(map->get_device_view()); - } - thrust::device_vector d_device_views(h_device_views); - - thrust::device_vector d_keys_exist(number_of_maps * elements_in_map); - thrust::device_vector d_keys_and_values_correct(number_of_maps * elements_in_map); - - shared_memory_test_kernel - <<>>(d_device_views.data().get(), - d_keys.data().get(), - d_values.data().get(), - elements_in_map, - d_keys_exist.data().get(), - d_keys_and_values_correct.data().get()); - - REQUIRE(d_keys_exist.size() == d_keys_and_values_correct.size()); - auto zip = thrust::make_zip_iterator( - thrust::make_tuple(d_keys_exist.begin(), d_keys_and_values_correct.begin())); - - REQUIRE(all_of(zip, zip + d_keys_exist.size(), [] __device__(auto const& z) { - return thrust::get<0>(z) and thrust::get<1>(z); - })); - } - - SECTION("No key is found before insertion.") - { - std::vector h_device_views; - for (std::size_t map_id = 0; map_id < number_of_maps; ++map_id) { - h_device_views.push_back(maps[map_id].get()->get_device_view()); - } - thrust::device_vector d_device_views(h_device_views); - - thrust::device_vector d_keys_exist(number_of_maps * elements_in_map); - thrust::device_vector d_keys_and_values_correct(number_of_maps * elements_in_map); - - shared_memory_test_kernel - <<>>(d_device_views.data().get(), - d_keys.data().get(), - 
d_values.data().get(), - elements_in_map, - d_keys_exist.data().get(), - d_keys_and_values_correct.data().get()); - - REQUIRE(none_of(d_keys_exist.begin(), d_keys_exist.end(), [] __device__(const bool key_found) { - return key_found; - })); - } -} \ No newline at end of file From fd3b98f981d5742ea0dd98c3faa10e7eb7d6bb15 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Mon, 4 Jan 2021 15:19:33 -0600 Subject: [PATCH 03/69] Fix static_assert for ReductionOp::value_type. --- include/cuco/static_reduction_map.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index d66c6cf4a..a33de7026 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -123,6 +123,8 @@ template > class static_reduction_map { static_assert(std::is_arithmetic::value, "Unsupported, non-arithmetic key type."); + static_assert(std::is_same::value, + "Type mismatch between ReductionOp::value_type and Value"); public: using value_type = cuco::pair_type; From a3678fbd9417787f0c7818a992d4d1e6284eace6 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Mon, 4 Jan 2021 21:49:02 -0600 Subject: [PATCH 04/69] CG reduction insert implementation. 
--- include/cuco/detail/static_reduction_map.inl | 106 ++++++++----------- include/cuco/static_reduction_map.cuh | 2 +- 2 files changed, 46 insertions(+), 62 deletions(-) diff --git a/include/cuco/detail/static_reduction_map.inl b/include/cuco/detail/static_reduction_map.inl index be28e0f28..140c728a4 100644 --- a/include/cuco/detail/static_reduction_map.inl +++ b/include/cuco/detail/static_reduction_map.inl @@ -156,7 +156,7 @@ static_reduction_map::device_mutable_ slot_key.compare_exchange_strong(expected_key, insert_pair.first, memory_order_relaxed); if (key_success or key_equal(insert_pair.first, expected_key)) { - // return do_op{}(slot_value, insert_pair.second); + return op_.apply(slot_value, insert_pair.second); } // if we couldn't insert the key, but it wasn't a duplicate, then there must @@ -171,77 +171,61 @@ template template -__device__ bool +__device__ void static_reduction_map::device_mutable_view::insert( CG g, value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept { auto current_slot = initial_slot(g, insert_pair.first, hash); + auto& slot_key = current_slot->first; + auto& slot_value = current_slot->second; while (true) { - key_type const existing_key = current_slot->first; + auto const current_key = slot_key.load(cuda::std::memory_order_relaxed); - // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as the - // sentinel is not a valid key value. Therefore, first check for the sentinel - auto const slot_is_empty = (existing_key == this->get_empty_key_sentinel()); + // The user provided `key_equal` should never be used to compare against `empty_key_sentinel` as + // the sentinel is not a valid key value. 
Therefore, first check for the sentinel + // TODO: Use memcmp + auto const slot_is_empty = (current_key == this->get_empty_key_sentinel()); - // the key we are trying to insert is already in the map, so we return with failure to insert - if (g.ballot(not slot_is_empty and key_equal(existing_key, insert_pair.first))) { - return false; - } + auto const key_exists = not slot_is_empty and key_equal(current_key, insert_pair.first); - auto const window_contains_empty = g.ballot(slot_is_empty); + // Key already exists, aggregate with it's value + if (key_exists) { op_.apply(slot_value, insert_pair.second); } - // we found an empty slot, but not the key we are inserting, so this must - // be an empty slot into which we can insert the key - if (window_contains_empty) { + // If key already exists in the CG window, all threads exit + if (g.ballot(key_exists)) { return; } + + auto const window_empty_mask = g.ballot(slot_is_empty); + + if (window_empty_mask) { // the first lane in the group with an empty slot will attempt the insert - insert_result status{insert_result::CONTINUE}; - uint32_t src_lane = __ffs(window_contains_empty) - 1; - - if (g.thread_rank() == src_lane) { - using cuda::std::memory_order_relaxed; - auto expected_key = this->get_empty_key_sentinel(); - auto expected_value = this->get_empty_value_sentinel(); - auto& slot_key = current_slot->first; - auto& slot_value = current_slot->second; - - bool key_success = - slot_key.compare_exchange_strong(expected_key, insert_pair.first, memory_order_relaxed); - bool value_success = slot_value.compare_exchange_strong( - expected_value, insert_pair.second, memory_order_relaxed); - - if (key_success) { - while (not value_success) { - value_success = - slot_value.compare_exchange_strong(expected_value = this->get_empty_value_sentinel(), - insert_pair.second, - memory_order_relaxed); + auto const src_lane = __ffs(window_empty_mask) - 1; + + auto const thread_success = [&]() { + if (g.thread_rank() == src_lane) { + auto 
expected_key = this->get_empty_key_sentinel(); + + auto const key_success = slot_key.compare_exchange_strong( + expected_key, insert_pair.first, cuda::memory_order_relaxed); + + if (key_success or key_equal(insert_pair.first, expected_key)) { + op_.apply(slot_value, insert_pair.second); + return true; } - status = insert_result::SUCCESS; - } else if (value_success) { - slot_value.store(this->get_empty_value_sentinel(), memory_order_relaxed); } + return false; + }(); - // our key was already present in the slot, so our key is a duplicate - if (key_equal(insert_pair.first, expected_key)) { status = insert_result::DUPLICATE; } - // another key was inserted in the slot we wanted to try - // so we need to try the next empty slot in the window - } + auto const src_success = g.shfl(thread_success, src_lane); - uint32_t res_status = g.shfl(static_cast(status), src_lane); - status = static_cast(res_status); + if (src_success) { return; } - // successful insert - if (status == insert_result::SUCCESS) { return true; } - // duplicate present during insert - if (status == insert_result::DUPLICATE) { return false; } // if we've gotten this far, a different key took our spot // before we could insert. We need to retry the insert on the // same window - } - // if there are no empty slots in the current window, - // we move onto the next window - else { + } else { + // if there are no empty slots in the current window, + // we move onto the next window current_slot = next_slot(g, current_slot); } } @@ -313,8 +297,8 @@ __device__ while (true) { auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); - // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as the - // sentinel is not a valid key value. Therefore, first check for the sentinel + // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as + // the sentinel is not a valid key value. 
Therefore, first check for the sentinel auto const slot_is_empty = (existing_key == this->get_empty_key_sentinel()); // the key we were searching for was found by one of the threads, @@ -353,8 +337,8 @@ __device__ typename static_reduction_mapfirst.load(cuda::std::memory_order_relaxed); - // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as the - // sentinel is not a valid key value. Therefore, first check for the sentinel + // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as + // the sentinel is not a valid key value. Therefore, first check for the sentinel auto const slot_is_empty = (existing_key == this->get_empty_key_sentinel()); // the key we were searching for was found by one of the threads, so we return an iterator to @@ -417,8 +401,8 @@ static_reduction_map::device_view::co while (true) { key_type const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); - // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as the - // sentinel is not a valid key value. Therefore, first check for the sentinel + // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as + // the sentinel is not a valid key value. 
Therefore, first check for the sentinel auto const slot_is_empty = (existing_key == this->get_empty_key_sentinel()); // the key we were searching for was found by one of the threads, so we return an iterator to @@ -428,8 +412,8 @@ static_reduction_map::device_view::co // we found an empty slot, meaning that the key we're searching for isn't present if (g.ballot(slot_is_empty)) { return false; } - // otherwise, all slots in the current window are full with other keys, so we move onto the next - // window + // otherwise, all slots in the current window are full with other keys, so we move onto the + // next window current_slot = next_slot(g, current_slot); } } diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index a33de7026..47b21d6f2 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -600,7 +600,7 @@ class static_reduction_map { template , typename KeyEqual = thrust::equal_to> - __device__ bool insert(CG g, + __device__ void insert(CG g, value_type const& insert_pair, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; From 5a65bf61674077971dd0a65bfe870f75bcfeb1da Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Tue, 5 Jan 2021 09:02:51 -0600 Subject: [PATCH 05/69] Cleanup of CG insert. 
--- include/cuco/detail/static_reduction_map.inl | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/include/cuco/detail/static_reduction_map.inl b/include/cuco/detail/static_reduction_map.inl index 140c728a4..d833d2650 100644 --- a/include/cuco/detail/static_reduction_map.inl +++ b/include/cuco/detail/static_reduction_map.inl @@ -201,7 +201,7 @@ static_reduction_map::device_mutable_ // the first lane in the group with an empty slot will attempt the insert auto const src_lane = __ffs(window_empty_mask) - 1; - auto const thread_success = [&]() { + auto const update_success = [&]() { if (g.thread_rank() == src_lane) { auto expected_key = this->get_empty_key_sentinel(); @@ -216,16 +216,12 @@ static_reduction_map::device_mutable_ return false; }(); - auto const src_success = g.shfl(thread_success, src_lane); + // If the update succeeded, the thread group exits + if (g.shfl(update_success, src_lane)) { return; } - if (src_success) { return; } - - // if we've gotten this far, a different key took our spot - // before we could insert. We need to retry the insert on the - // same window + // A different key took the current slot. Look for an empty slot in the current window } else { - // if there are no empty slots in the current window, - // we move onto the next window + // No empty slots in the current window, move onto the next window current_slot = next_slot(g, current_slot); } } From 28e09953cd2ceb1fadf6b8ca93aa379624887b9d Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Tue, 5 Jan 2021 14:49:16 -0600 Subject: [PATCH 06/69] Pass reduction op to device view ctors. 
--- include/cuco/static_reduction_map.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index 47b21d6f2..1a94b8270 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -900,7 +900,7 @@ class static_reduction_map { */ device_view get_device_view() const noexcept { - return device_view(slots_, capacity_, empty_key_sentinel_, empty_value_sentinel_); + return device_view(slots_, capacity_, empty_key_sentinel_, op_); } /** @@ -911,7 +911,7 @@ class static_reduction_map { */ device_mutable_view get_device_mutable_view() const noexcept { - return device_mutable_view(slots_, capacity_, empty_key_sentinel_, empty_value_sentinel_); + return device_mutable_view(slots_, capacity_, empty_key_sentinel_, op_); } private: From 8dc64ee9f4f06392e3c526a2507c2b8d7f1dd8dd Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Tue, 5 Jan 2021 14:50:19 -0600 Subject: [PATCH 07/69] Add pair ctor for constructing from two elements. --- include/cuco/detail/pair.cuh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/cuco/detail/pair.cuh b/include/cuco/detail/pair.cuh index 8bc6ec6b4..8ed10b32b 100644 --- a/include/cuco/detail/pair.cuh +++ b/include/cuco/detail/pair.cuh @@ -65,6 +65,10 @@ struct alignas(detail::pair_alignment()) pair { : first{p.first}, second{p.second} { } + __host__ __device__ constexpr pair(First const& f, Second const& s) noexcept + : first{f}, second{s} + { + } }; template From 573bce28f00b2fd57749537d684f39ad09d08148 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Tue, 5 Jan 2021 14:50:38 -0600 Subject: [PATCH 08/69] Allow bulk insert kernel to work on iterators over tuples. 
--- include/cuco/detail/static_reduction_map_kernels.cuh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/cuco/detail/static_reduction_map_kernels.cuh b/include/cuco/detail/static_reduction_map_kernels.cuh index 6ded5e99d..9849efb44 100644 --- a/include/cuco/detail/static_reduction_map_kernels.cuh +++ b/include/cuco/detail/static_reduction_map_kernels.cuh @@ -142,7 +142,10 @@ __global__ void insert( while (it < last) { // force conversion to value_type - typename viewT::value_type const insert_pair{*it}; + typename viewT::value_type const insert_pair{ + static_cast(thrust::get<0>(*it)), + static_cast(thrust::get<1>(*it))}; + if (view.insert(tile, insert_pair, hash, key_equal) && tile.thread_rank() == 0) { thread_num_successes++; } From d9236e588e1eb87051b891de707b1326624c8795 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Tue, 5 Jan 2021 15:22:16 -0600 Subject: [PATCH 09/69] Add device decorator to reduction op definition. --- include/cuco/static_reduction_map.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index 1a94b8270..bac8132ae 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -44,6 +44,7 @@ struct reduce_add { static constexpr T identity = 0; template + __device__ T apply(cuda::atomic& slot, T2 const& value) { return slot.fetch_add(value); From 89ed44e656e4ab16981127d4d7449788b1c390e1 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Tue, 5 Jan 2021 15:22:48 -0600 Subject: [PATCH 10/69] Add get_op function to allow accessing the op from the derived types. 
--- include/cuco/static_reduction_map.cuh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index bac8132ae..5f2f0341d 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -283,6 +283,12 @@ class static_reduction_map { { } + /** + * @brief Gets the binary op + * + */ + __device__ ReductionOp get_op() const { return op_; } + /** * @brief Gets slots array. * From e28db800db2afca8df262592f68e1a698cfb12e3 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Tue, 5 Jan 2021 15:23:18 -0600 Subject: [PATCH 11/69] Make insert return a bool after all. We need to return a bool so we can keep track of how many unique keys were inserted in a bulk insert. --- include/cuco/static_reduction_map.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index 5f2f0341d..c8e2ecc13 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -607,7 +607,7 @@ class static_reduction_map { template , typename KeyEqual = thrust::equal_to> - __device__ void insert(CG g, + __device__ bool insert(CG g, value_type const& insert_pair, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; From 0eeac206df5acfaf869bd611c66216230dc63fb8 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Tue, 5 Jan 2021 15:23:45 -0600 Subject: [PATCH 12/69] Use get_op in implementation. 
--- include/cuco/detail/static_reduction_map.inl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/cuco/detail/static_reduction_map.inl b/include/cuco/detail/static_reduction_map.inl index d833d2650..cf98e46b3 100644 --- a/include/cuco/detail/static_reduction_map.inl +++ b/include/cuco/detail/static_reduction_map.inl @@ -156,7 +156,7 @@ static_reduction_map::device_mutable_ slot_key.compare_exchange_strong(expected_key, insert_pair.first, memory_order_relaxed); if (key_success or key_equal(insert_pair.first, expected_key)) { - return op_.apply(slot_value, insert_pair.second); + return this->get_op().apply(slot_value, insert_pair.second); } // if we couldn't insert the key, but it wasn't a duplicate, then there must @@ -190,7 +190,7 @@ static_reduction_map::device_mutable_ auto const key_exists = not slot_is_empty and key_equal(current_key, insert_pair.first); // Key already exists, aggregate with it's value - if (key_exists) { op_.apply(slot_value, insert_pair.second); } + if (key_exists) { this->get_op().apply(slot_value, insert_pair.second); } // If key already exists in the CG window, all threads exit if (g.ballot(key_exists)) { return; } @@ -209,7 +209,7 @@ static_reduction_map::device_mutable_ expected_key, insert_pair.first, cuda::memory_order_relaxed); if (key_success or key_equal(insert_pair.first, expected_key)) { - op_.apply(slot_value, insert_pair.second); + this->get_op().apply(slot_value, insert_pair.second); return true; } } From fa31c8117abe243ff9ac55c9edda8ac69719ceef Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Tue, 5 Jan 2021 15:24:02 -0600 Subject: [PATCH 13/69] Make insert return a bool. 
--- include/cuco/detail/static_reduction_map.inl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/cuco/detail/static_reduction_map.inl b/include/cuco/detail/static_reduction_map.inl index cf98e46b3..2c1e434aa 100644 --- a/include/cuco/detail/static_reduction_map.inl +++ b/include/cuco/detail/static_reduction_map.inl @@ -171,7 +171,7 @@ template template -__device__ void +__device__ bool static_reduction_map::device_mutable_view::insert( CG g, value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept { @@ -193,7 +193,7 @@ static_reduction_map::device_mutable_ if (key_exists) { this->get_op().apply(slot_value, insert_pair.second); } // If key already exists in the CG window, all threads exit - if (g.ballot(key_exists)) { return; } + if (g.ballot(key_exists)) { return false; } auto const window_empty_mask = g.ballot(slot_is_empty); @@ -217,7 +217,7 @@ static_reduction_map::device_mutable_ }(); // If the update succeeded, the thread group exits - if (g.shfl(update_success, src_lane)) { return; } + if (g.shfl(update_success, src_lane)) { return true; } // A different key took the current slot. Look for an empty slot in the current window } else { From ab81b2b7d38ab4d414681b35b9b905685738f9d9 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Tue, 5 Jan 2021 16:07:52 -0600 Subject: [PATCH 14/69] Correct insert to return if the key was the first key inserted. 
--- include/cuco/detail/static_reduction_map.inl | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/include/cuco/detail/static_reduction_map.inl b/include/cuco/detail/static_reduction_map.inl index 2c1e434aa..9025ceefa 100644 --- a/include/cuco/detail/static_reduction_map.inl +++ b/include/cuco/detail/static_reduction_map.inl @@ -190,7 +190,9 @@ static_reduction_map::device_mutable_ auto const key_exists = not slot_is_empty and key_equal(current_key, insert_pair.first); // Key already exists, aggregate with it's value - if (key_exists) { this->get_op().apply(slot_value, insert_pair.second); } + if (key_exists) { + this->get_op().apply(slot_value, insert_pair.second); + } // If key already exists in the CG window, all threads exit if (g.ballot(key_exists)) { return false; } @@ -210,14 +212,16 @@ static_reduction_map::device_mutable_ if (key_success or key_equal(insert_pair.first, expected_key)) { this->get_op().apply(slot_value, insert_pair.second); - return true; + return key_success; } } return false; }(); // If the update succeeded, the thread group exits - if (g.shfl(update_success, src_lane)) { return true; } + if (g.shfl(update_success, src_lane)) { + return true; + } // A different key took the current slot. Look for an empty slot in the current window } else { From 46f9b73794a28cef4b4dfc7dd5489e0966741b7f Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Tue, 5 Jan 2021 16:08:02 -0600 Subject: [PATCH 15/69] First test verifying size passed. 
--- tests/static_reduction_map/static_reduction_map_test.cu | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/static_reduction_map/static_reduction_map_test.cu b/tests/static_reduction_map/static_reduction_map_test.cu index 9d709a6c6..958571129 100644 --- a/tests/static_reduction_map/static_reduction_map_test.cu +++ b/tests/static_reduction_map/static_reduction_map_test.cu @@ -93,4 +93,12 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", constexpr std::size_t num_slots{50'000'000}; cuco::static_reduction_map, Key, Value> map{num_slots, -1}; + SECTION("Inserting all the same key should sum all of their corresponding values") { + thrust::device_vector keys(100, 42); + thrust::device_vector values(keys.size(), 1); + auto zip = thrust::make_zip_iterator(thrust::make_tuple(keys.begin(), values.begin())); + auto zip_end = zip + keys.size(); + map.insert(zip, zip_end); + REQUIRE(map.get_size() == 1); + } } From 8aebabbbdf79a307d61f9e3a085fbd060362e5e2 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Wed, 6 Jan 2021 13:01:49 -0600 Subject: [PATCH 16/69] Update CG insert logic. The mapped value is updated in the case of a new insert or updating an existing key, but we need to track if the insert was the first time that key was inserted. 
--- include/cuco/detail/static_reduction_map.inl | 33 ++++++++++---------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/include/cuco/detail/static_reduction_map.inl b/include/cuco/detail/static_reduction_map.inl index 9025ceefa..f5e26ac58 100644 --- a/include/cuco/detail/static_reduction_map.inl +++ b/include/cuco/detail/static_reduction_map.inl @@ -190,9 +190,7 @@ static_reduction_map::device_mutable_ auto const key_exists = not slot_is_empty and key_equal(current_key, insert_pair.first); // Key already exists, aggregate with it's value - if (key_exists) { - this->get_op().apply(slot_value, insert_pair.second); - } + if (key_exists) { this->get_op().apply(slot_value, insert_pair.second); } // If key already exists in the CG window, all threads exit if (g.ballot(key_exists)) { return false; } @@ -203,24 +201,27 @@ static_reduction_map::device_mutable_ // the first lane in the group with an empty slot will attempt the insert auto const src_lane = __ffs(window_empty_mask) - 1; - auto const update_success = [&]() { - if (g.thread_rank() == src_lane) { - auto expected_key = this->get_empty_key_sentinel(); + auto const attempt_update = [&]() { + auto expected_key = this->get_empty_key_sentinel(); - auto const key_success = slot_key.compare_exchange_strong( - expected_key, insert_pair.first, cuda::memory_order_relaxed); + auto const key_success = slot_key.compare_exchange_strong( + expected_key, insert_pair.first, cuda::memory_order_relaxed); - if (key_success or key_equal(insert_pair.first, expected_key)) { - this->get_op().apply(slot_value, insert_pair.second); - return key_success; - } + if (key_success or key_equal(insert_pair.first, expected_key)) { + this->get_op().apply(slot_value, insert_pair.second); + return key_success ? insert_result::SUCCESS : insert_result::DUPLICATE; } - return false; - }(); + return insert_result::CONTINUE; + }; + + auto const update_result = + (g.thread_rank() == src_lane) ? 
attempt_update() : insert_result::CONTINUE; + + auto const window_result = g.shfl(update_result, src_lane); // If the update succeeded, the thread group exits - if (g.shfl(update_success, src_lane)) { - return true; + if (window_result != insert_result::CONTINUE) { + return (window_result == insert_result::SUCCESS); } // A different key took the current slot. Look for an empty slot in the current window From 9fb930ec216d6c2a074eb7d537a2572c5686a585 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Wed, 6 Jan 2021 13:01:57 -0600 Subject: [PATCH 17/69] Add more tests. --- .../static_reduction_map_test.cu | 44 +++++++++++++------ 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/tests/static_reduction_map/static_reduction_map_test.cu b/tests/static_reduction_map/static_reduction_map_test.cu index 958571129..144c02ec8 100644 --- a/tests/static_reduction_map/static_reduction_map_test.cu +++ b/tests/static_reduction_map/static_reduction_map_test.cu @@ -80,25 +80,41 @@ static void generate_keys(OutputIt output_begin, OutputIt output_end) TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", ((typename T, dist_type Dist), T, Dist), - (int32_t, dist_type::UNIQUE), - (int64_t, dist_type::UNIQUE), - (int32_t, dist_type::UNIFORM), - (int64_t, dist_type::UNIFORM), - (int32_t, dist_type::GAUSSIAN), - (int64_t, dist_type::GAUSSIAN)) + (int32_t, dist_type::UNIQUE)) { using Key = T; using Value = T; - constexpr std::size_t num_slots{50'000'000}; + constexpr std::size_t num_slots{200}; cuco::static_reduction_map, Key, Value> map{num_slots, -1}; - SECTION("Inserting all the same key should sum all of their corresponding values") { - thrust::device_vector keys(100, 42); - thrust::device_vector values(keys.size(), 1); - auto zip = thrust::make_zip_iterator(thrust::make_tuple(keys.begin(), values.begin())); - auto zip_end = zip + keys.size(); - map.insert(zip, zip_end); - REQUIRE(map.get_size() == 1); + SECTION("Inserting identical keys") + { + thrust::device_vector 
keys(100, 42); + thrust::device_vector values(keys.size(), 1); + auto zip = thrust::make_zip_iterator(thrust::make_tuple(keys.begin(), values.begin())); + auto zip_end = zip + keys.size(); + map.insert(zip, zip_end); + + SECTION("There should only be one key in the map") { REQUIRE(map.get_size() == 1); } + + SECTION("Map should contain the inserted key") + { + thrust::device_vector contained(keys.size()); + map.contains(keys.begin(), keys.end(), contained.begin()); + REQUIRE(all_of(contained.begin(), contained.end(), [] __device__(bool c) { return c; })); + } + + SECTION("Found value should equal aggregate of inserted values") + { + thrust::device_vector found(keys.size()); + map.find(keys.begin(), keys.end(), found.begin()); + auto const expected_aggregate = keys.size(); // All keys inserted "1", so the + // sum aggregate should be + // equal to the number of keys inserted + REQUIRE(all_of(found.begin(), found.end(), [expected_aggregate] __device__(Value v) { + return v == expected_aggregate; + })); + } } } From 24261b2a9b4e327409a26869804dea8abc78a993 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 7 Jan 2021 11:55:12 -0600 Subject: [PATCH 18/69] Add test for inserting all unique keys. 
--- .../static_reduction_map_test.cu | 113 +++++++++--------- 1 file changed, 55 insertions(+), 58 deletions(-) diff --git a/tests/static_reduction_map/static_reduction_map_test.cu b/tests/static_reduction_map/static_reduction_map_test.cu index 144c02ec8..084b24563 100644 --- a/tests/static_reduction_map/static_reduction_map_test.cu +++ b/tests/static_reduction_map/static_reduction_map_test.cu @@ -23,8 +23,6 @@ #include namespace { -namespace cg = cooperative_groups; - // Thrust logical algorithms (any_of/all_of/none_of) don't work with device // lambdas: See https://github.com/thrust/thrust/issues/1062 template @@ -47,74 +45,73 @@ bool none_of(Iterator begin, Iterator end, Predicate p) } } // namespace -enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; - -template -static void generate_keys(OutputIt output_begin, OutputIt output_end) +TEMPLATE_TEST_CASE_SIG("Insert all identical keys", + "", + ((typename Key, typename Value), Key, Value), + (int32_t, int32_t)) { - auto num_keys = std::distance(output_begin, output_end); + constexpr std::size_t num_slots{200}; + cuco::static_reduction_map, Key, Value> map{num_slots, -1}; - std::random_device rd; - std::mt19937 gen{rd()}; + thrust::device_vector keys(100, 42); + thrust::device_vector values(keys.size(), 1); + auto zip = thrust::make_zip_iterator(thrust::make_tuple(keys.begin(), values.begin())); + auto zip_end = zip + keys.size(); + map.insert(zip, zip_end); + + SECTION("There should only be one key in the map") { REQUIRE(map.get_size() == 1); } + + SECTION("Map should contain the inserted key") + { + thrust::device_vector contained(keys.size()); + map.contains(keys.begin(), keys.end(), contained.begin()); + REQUIRE(all_of(contained.begin(), contained.end(), [] __device__(bool c) { return c; })); + } - switch (Dist) { - case dist_type::UNIQUE: - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = i; - } - break; - case dist_type::UNIFORM: - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = 
std::abs(static_cast(gen())); - } - break; - case dist_type::GAUSSIAN: - std::normal_distribution<> dg{1e9, 1e7}; - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = std::abs(static_cast(dg(gen))); - } - break; + SECTION("Found value should equal aggregate of inserted values") + { + thrust::device_vector found(keys.size()); + map.find(keys.begin(), keys.end(), found.begin()); + auto const expected_aggregate = keys.size(); // All keys inserted "1", so the + // sum aggregate should be + // equal to the number of keys inserted + REQUIRE(all_of(found.begin(), found.end(), [expected_aggregate] __device__(Value v) { + return v == expected_aggregate; + })); } } -TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", +TEMPLATE_TEST_CASE_SIG("Insert all unique keys", "", - ((typename T, dist_type Dist), T, Dist), - (int32_t, dist_type::UNIQUE)) + ((typename Key, typename Value), Key, Value), + (int32_t, int32_t)) { - using Key = T; - using Value = T; - - constexpr std::size_t num_slots{200}; + constexpr std::size_t num_keys = 100; + constexpr std::size_t num_slots{num_keys * 3}; cuco::static_reduction_map, Key, Value> map{num_slots, -1}; - SECTION("Inserting identical keys") - { - thrust::device_vector keys(100, 42); - thrust::device_vector values(keys.size(), 1); - auto zip = thrust::make_zip_iterator(thrust::make_tuple(keys.begin(), values.begin())); - auto zip_end = zip + keys.size(); - map.insert(zip, zip_end); + auto keys_begin = thrust::make_counting_iterator(0); + auto values_begin = thrust::make_counting_iterator(0); + auto zip = thrust::make_zip_iterator(thrust::make_tuple(keys_begin, values_begin)); + auto zip_end = zip + num_keys; + map.insert(zip, zip_end); - SECTION("There should only be one key in the map") { REQUIRE(map.get_size() == 1); } + SECTION("Size of map should equal number of inserted keys") + { + REQUIRE(map.get_size() == num_keys); + } - SECTION("Map should contain the inserted key") - { - thrust::device_vector contained(keys.size()); - 
map.contains(keys.begin(), keys.end(), contained.begin()); - REQUIRE(all_of(contained.begin(), contained.end(), [] __device__(bool c) { return c; })); - } + SECTION("Map should contain the inserted keys") + { + thrust::device_vector contained(num_keys); + map.contains(keys_begin, keys_begin + num_keys, contained.begin()); + REQUIRE(all_of(contained.begin(), contained.end(), [] __device__(bool c) { return c; })); + } - SECTION("Found value should equal aggregate of inserted values") - { - thrust::device_vector found(keys.size()); - map.find(keys.begin(), keys.end(), found.begin()); - auto const expected_aggregate = keys.size(); // All keys inserted "1", so the - // sum aggregate should be - // equal to the number of keys inserted - REQUIRE(all_of(found.begin(), found.end(), [expected_aggregate] __device__(Value v) { - return v == expected_aggregate; - })); - } + SECTION("Found value should equal inserted value") + { + thrust::device_vector found(num_keys); + map.find(keys_begin, keys_begin + num_keys, found.begin()); + REQUIRE(thrust::equal(thrust::device, values_begin, values_begin + num_keys, found.begin())); } } From e635e3179d4f9d8186e033acbe21d8c5275b213c Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 7 Jan 2021 11:55:19 -0600 Subject: [PATCH 19/69] Use relaxed fetch_add. --- include/cuco/static_reduction_map.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index c8e2ecc13..34fa2cba0 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -47,7 +47,7 @@ struct reduce_add { __device__ T apply(cuda::atomic& slot, T2 const& value) { - return slot.fetch_add(value); + return slot.fetch_add(value, cuda::memory_order_relaxed); } }; From d749445c6b967384aaf23e27b419ebcab26883b1 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 7 Jan 2021 13:07:19 -0600 Subject: [PATCH 20/69] Update the slot references each iteration. 
--- include/cuco/detail/static_reduction_map.inl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/static_reduction_map.inl b/include/cuco/detail/static_reduction_map.inl index f5e26ac58..86b500e4e 100644 --- a/include/cuco/detail/static_reduction_map.inl +++ b/include/cuco/detail/static_reduction_map.inl @@ -176,10 +176,10 @@ static_reduction_map::device_mutable_ CG g, value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept { auto current_slot = initial_slot(g, insert_pair.first, hash); - auto& slot_key = current_slot->first; - auto& slot_value = current_slot->second; while (true) { + auto& slot_key = current_slot->first; + auto& slot_value = current_slot->second; auto const current_key = slot_key.load(cuda::std::memory_order_relaxed); // The user provided `key_equal` should never be used to compare against `empty_key_sentinel` as From ca9f7d6b4362cad46521e3f9bce1b6b9926f2345 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 7 Jan 2021 13:07:30 -0600 Subject: [PATCH 21/69] Increase size of unique key test. 
--- tests/static_reduction_map/static_reduction_map_test.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/static_reduction_map/static_reduction_map_test.cu b/tests/static_reduction_map/static_reduction_map_test.cu index 084b24563..9a61b8b4d 100644 --- a/tests/static_reduction_map/static_reduction_map_test.cu +++ b/tests/static_reduction_map/static_reduction_map_test.cu @@ -86,8 +86,8 @@ TEMPLATE_TEST_CASE_SIG("Insert all unique keys", ((typename Key, typename Value), Key, Value), (int32_t, int32_t)) { - constexpr std::size_t num_keys = 100; - constexpr std::size_t num_slots{num_keys * 3}; + constexpr std::size_t num_keys = 10000; + constexpr std::size_t num_slots{num_keys * 2}; cuco::static_reduction_map, Key, Value> map{num_slots, -1}; auto keys_begin = thrust::make_counting_iterator(0); From 9eebd172295416e1bb3a598b95da6418450ff19b Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 7 Jan 2021 14:33:20 -0600 Subject: [PATCH 22/69] Make map size function of number of keys. 
--- tests/static_reduction_map/static_reduction_map_test.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/static_reduction_map/static_reduction_map_test.cu b/tests/static_reduction_map/static_reduction_map_test.cu index 9a61b8b4d..bb57f0847 100644 --- a/tests/static_reduction_map/static_reduction_map_test.cu +++ b/tests/static_reduction_map/static_reduction_map_test.cu @@ -50,11 +50,12 @@ TEMPLATE_TEST_CASE_SIG("Insert all identical keys", ((typename Key, typename Value), Key, Value), (int32_t, int32_t)) { - constexpr std::size_t num_slots{200}; - cuco::static_reduction_map, Key, Value> map{num_slots, -1}; - thrust::device_vector keys(100, 42); thrust::device_vector values(keys.size(), 1); + + auto const num_slots{keys.size() * 2}; + cuco::static_reduction_map, Key, Value> map{num_slots, -1}; + auto zip = thrust::make_zip_iterator(thrust::make_tuple(keys.begin(), values.begin())); auto zip_end = zip + keys.size(); map.insert(zip, zip_end); From 212b8f6dbc212d11d48e9aee6d14cfd24bd83927 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 7 Jan 2021 16:51:18 -0600 Subject: [PATCH 23/69] Add other agg ops. 
--- include/cuco/static_reduction_map.cuh | 41 ++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index 34fa2cba0..470589724 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -45,12 +45,51 @@ struct reduce_add { template __device__ - T apply(cuda::atomic& slot, T2 const& value) + T apply(cuda::atomic& slot, T2 const& value) const { return slot.fetch_add(value, cuda::memory_order_relaxed); } }; +template +struct reduce_sub { + using value_type = T; + static constexpr T identity = 0; + + template + __device__ + T apply(cuda::atomic& slot, T2 const& value) const + { + return slot.fetch_sub(value, cuda::memory_order_relaxed); + } +}; + +template +struct reduce_min { + using value_type = T; + static constexpr T identity = std::numeric_limits::max(); + + template + __device__ + T apply(cuda::atomic& slot, T2 const& value) const + { + return slot.fetch_min(value, cuda::memory_order_relaxed); + } +}; + +template +struct reduce_max { + using value_type = T; + static constexpr T identity = std::numeric_limits::lowest(); + + template + __device__ + T apply(cuda::atomic& slot, T2 const& value) const + { + return slot.fetch_max(value, cuda::memory_order_relaxed); + } +}; + /** * @brief A GPU-accelerated, unordered, associative container of key-value * pairs with unique keys. From cda527a52043c5e34115d71c854851fcbb97a542 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 7 Jan 2021 17:01:12 -0600 Subject: [PATCH 24/69] Add custom binary op. 
--- include/cuco/static_reduction_map.cuh | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index 470589724..c8dca31b0 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -44,8 +44,7 @@ struct reduce_add { static constexpr T identity = 0; template - __device__ - T apply(cuda::atomic& slot, T2 const& value) const + __device__ T apply(cuda::atomic& slot, T2 const& value) const { return slot.fetch_add(value, cuda::memory_order_relaxed); } @@ -57,8 +56,7 @@ struct reduce_sub { static constexpr T identity = 0; template - __device__ - T apply(cuda::atomic& slot, T2 const& value) const + __device__ T apply(cuda::atomic& slot, T2 const& value) const { return slot.fetch_sub(value, cuda::memory_order_relaxed); } @@ -70,8 +68,7 @@ struct reduce_min { static constexpr T identity = std::numeric_limits::max(); template - __device__ - T apply(cuda::atomic& slot, T2 const& value) const + __device__ T apply(cuda::atomic& slot, T2 const& value) const { return slot.fetch_min(value, cuda::memory_order_relaxed); } @@ -83,13 +80,27 @@ struct reduce_max { static constexpr T identity = std::numeric_limits::lowest(); template - __device__ - T apply(cuda::atomic& slot, T2 const& value) const + __device__ T apply(cuda::atomic& slot, T2 const& value) const { return slot.fetch_max(value, cuda::memory_order_relaxed); } }; +template +struct custom_op { + using value_type = T; + static constexpr T identity = Identity; + + Op op; + + template + __device__ T apply(cuda::atomic& slot, T2 const& value) const + { + auto old = slot.load(cuda::memory_order_relaxed); + while (not slot.compare_exchange_strong(old, op(old, value), cuda::memory_order_relaxed)) {} + } +}; + /** * @brief A GPU-accelerated, unordered, associative container of key-value * pairs with unique keys. 
From 7c1af0f4f42f3565b16dd17aefcd194568c38624 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 7 Jan 2021 17:17:00 -0600 Subject: [PATCH 25/69] Return old value in custom op. --- include/cuco/static_reduction_map.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index c8dca31b0..b13c1f5af 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -98,6 +98,7 @@ struct custom_op { { auto old = slot.load(cuda::memory_order_relaxed); while (not slot.compare_exchange_strong(old, op(old, value), cuda::memory_order_relaxed)) {} + return old; } }; From 3f1b59d9362f0ad517199c539c947d5e32692f81 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 8 Apr 2021 09:55:38 -0500 Subject: [PATCH 26/69] reduction map benchmarks. --- benchmarks/CMakeLists.txt | 7 +- benchmarks/hash_table/static_map_bench.cu | 132 +++++++++--------- .../hash_table/static_reduction_map_bench.cu | 130 +++++++++++++++++ examples/CMakeLists.txt | 2 +- tests/CMakeLists.txt | 2 +- 5 files changed, 206 insertions(+), 67 deletions(-) create mode 100644 benchmarks/hash_table/static_reduction_map_bench.cu diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 45b02848d..f9464a6eb 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -28,6 +28,8 @@ if("${GPU_ARCHS}" STREQUAL "") evaluate_gpu_archs(GPU_ARCHS) endif() +message("GPU_ARCHS = ${GPU_ARCHS}") + ################################################################################################### # - compiler function ----------------------------------------------------------------------------- @@ -35,7 +37,7 @@ function(ConfigureBench BENCH_NAME BENCH_SRC) add_executable(${BENCH_NAME} "${BENCH_SRC}") set_target_properties(${BENCH_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON - CUDA_ARCHITECTURES ${GPU_ARCHS} + CUDA_ARCHITECTURES "${GPU_ARCHS}" RUNTIME_OUTPUT_DIRECTORY 
"${CMAKE_BINARY_DIR}/gbenchmarks") target_include_directories(${BENCH_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") @@ -58,6 +60,9 @@ ConfigureBench(DYNAMIC_MAP_BENCH "${DYNAMIC_MAP_BENCH_SRC}") set(STATIC_MAP_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_map_bench.cu") ConfigureBench(STATIC_MAP_BENCH "${STATIC_MAP_BENCH_SRC}") +################################################################################################### +ConfigureBench(STATIC_REDUCTION_MAP_BENCH "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_reduction_map_bench.cu") + ################################################################################################### set(RBK_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reduce_by_key/reduce_by_key.cu") ConfigureBench(RBK_BENCH "${RBK_BENCH_SRC}") diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu index 165465518..563769df6 100644 --- a/benchmarks/hash_table/static_map_bench.cu +++ b/benchmarks/hash_table/static_map_bench.cu @@ -15,40 +15,38 @@ */ #include -#include "cuco/static_map.cuh" -#include #include -#include +#include +#include #include +#include #include +#include "cuco/static_map.cuh" -enum class dist_type { - UNIQUE, - UNIFORM, - GAUSSIAN -}; +enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; -template -static void generate_keys(OutputIt output_begin, OutputIt output_end) { +template +static void generate_keys(OutputIt output_begin, OutputIt output_end) +{ auto num_keys = std::distance(output_begin, output_end); - + std::random_device rd; std::mt19937 gen{rd()}; - switch(Dist) { + switch (Dist) { case dist_type::UNIQUE: - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = i; } break; case dist_type::UNIFORM: - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = std::abs(static_cast(gen())); } break; case dist_type::GAUSSIAN: std::normal_distribution<> dg{1e9, 1e7}; - for(auto i = 0; i < 
num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = std::abs(static_cast(dg(gen))); } break; @@ -59,88 +57,84 @@ static void generate_keys(OutputIt output_begin, OutputIt output_end) { * @brief Generates input sizes and hash table occupancies * */ -static void generate_size_and_occupancy(benchmark::internal::Benchmark* b) { - for (auto size = 100'000'000; size <= 100'000'000; size *= 10) { - for (auto occupancy = 10; occupancy <= 90; occupancy += 10) { +static void generate_size_and_occupancy(benchmark::internal::Benchmark* b) +{ + for (auto size = 4096; size <= 1 << 28; size *= 2) { + for (auto occupancy = 60; occupancy <= 60; occupancy += 10) { b->Args({size, occupancy}); } } } - - template -static void BM_static_map_insert(::benchmark::State& state) { +static void BM_static_map_insert(::benchmark::State& state) +{ using map_type = cuco::static_map; - + std::size_t num_keys = state.range(0); - float occupancy = state.range(1) / float{100}; - std::size_t size = num_keys / occupancy; + float occupancy = state.range(1) / float{100}; + std::size_t size = num_keys / occupancy; + + std::vector h_keys(num_keys); + std::vector> h_pairs(num_keys); - std::vector h_keys( num_keys ); - std::vector> h_pairs( num_keys ); - generate_keys(h_keys.begin(), h_keys.end()); - - for(auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; + + for (auto i = 0; i < num_keys; ++i) { + Key key = h_keys[i]; + Value val = h_keys[i]; + h_pairs[i].first = key; h_pairs[i].second = val; } - thrust::device_vector> d_pairs( h_pairs ); + thrust::device_vector> d_pairs(h_pairs); - for(auto _ : state) { - state.ResumeTiming(); - state.PauseTiming(); + for (auto _ : state) { map_type map{size, -1, -1}; - state.ResumeTiming(); - - map.insert(d_pairs.begin(), d_pairs.end()); - state.PauseTiming(); + { + cuda_event_timer raii{state}; + map.insert(d_pairs.begin(), d_pairs.end()); + } } - state.SetBytesProcessed((sizeof(Key) + 
sizeof(Value)) * - int64_t(state.iterations()) * + state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * int64_t(state.range(0))); } - - template -static void BM_static_map_search_all(::benchmark::State& state) { +static void BM_static_map_search_all(::benchmark::State& state) +{ using map_type = cuco::static_map; - + std::size_t num_keys = state.range(0); - float occupancy = state.range(1) / float{100}; - std::size_t size = num_keys / occupancy; + float occupancy = state.range(1) / float{100}; + std::size_t size = num_keys / occupancy; map_type map{size, -1, -1}; auto view = map.get_device_mutable_view(); - std::vector h_keys( num_keys ); - std::vector h_values( num_keys ); - std::vector> h_pairs ( num_keys ); - std::vector h_results (num_keys); + std::vector h_keys(num_keys); + std::vector h_values(num_keys); + std::vector> h_pairs(num_keys); + std::vector h_results(num_keys); generate_keys(h_keys.begin(), h_keys.end()); - - for(auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; + + for (auto i = 0; i < num_keys; ++i) { + Key key = h_keys[i]; + Value val = h_keys[i]; + h_pairs[i].first = key; h_pairs[i].second = val; } - thrust::device_vector d_keys( h_keys ); - thrust::device_vector d_results( num_keys); - thrust::device_vector> d_pairs( h_pairs ); + thrust::device_vector d_keys(h_keys); + thrust::device_vector d_results(num_keys); + thrust::device_vector> d_pairs(h_pairs); map.insert(d_pairs.begin(), d_pairs.end()); - - for(auto _ : state) { + + for (auto _ : state) { map.find(d_keys.begin(), d_keys.end(), d_results.begin()); } @@ -148,52 +142,62 @@ static void BM_static_map_search_all(::benchmark::State& state) { int64_t(state.range(0))); } - - BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, 
dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); \ No newline at end of file diff --git a/benchmarks/hash_table/static_reduction_map_bench.cu b/benchmarks/hash_table/static_reduction_map_bench.cu new file mode 100644 index 
000000000..92a2ab788 --- /dev/null +++ b/benchmarks/hash_table/static_reduction_map_bench.cu @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "cuco/static_reduction_map.cuh" + +enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; + +template +static void generate_keys(OutputIt output_begin, OutputIt output_end) +{ + auto num_keys = std::distance(output_begin, output_end); + + std::random_device rd; + std::mt19937 gen{rd()}; + + switch (Dist) { + case dist_type::UNIQUE: + for (auto i = 0; i < num_keys; ++i) { + output_begin[i] = i; + } + break; + case dist_type::UNIFORM: + for (auto i = 0; i < num_keys; ++i) { + output_begin[i] = std::abs(static_cast(gen())); + } + break; + case dist_type::GAUSSIAN: + std::normal_distribution<> dg{1e9, 1e7}; + for (auto i = 0; i < num_keys; ++i) { + output_begin[i] = std::abs(static_cast(dg(gen))); + } + break; + } +} + +/** + * @brief Generates input sizes and hash table occupancies + * + */ +static void generate_size_and_occupancy(benchmark::internal::Benchmark* b) +{ + for (auto size = 4096; size <= 1 << 28; size *= 2) { + for (auto occupancy = 60; occupancy <= 60; occupancy += 10) { + b->Args({size, occupancy}); + } + } +} + +template typename ReductionOp> +static void BM_static_map_insert(::benchmark::State& state) +{ + using map_type = cuco::static_reduction_map, Key, 
Value>; + + std::size_t num_keys = state.range(0); + float occupancy = state.range(1) / float{100}; + std::size_t size = num_keys / occupancy; + + std::vector h_keys(num_keys); + std::vector> h_pairs(num_keys); + + generate_keys(h_keys.begin(), h_keys.end()); + + thrust::device_vector d_keys(h_keys); + thrust::device_vector d_values(h_keys); + + auto pairs_begin = + thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); + auto pairs_end = pairs_begin + num_keys; + + for (auto _ : state) { + map_type map{size, -1}; + { + cuda_event_timer raii{state}; + map.insert(pairs_begin, pairs_end); + } + } + + state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * + int64_t(state.range(0))); +} + +BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIQUE, cuco::reduce_add) + ->Unit(benchmark::kMillisecond) + ->UseManualTime() + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIFORM, cuco::reduce_add) + ->Unit(benchmark::kMillisecond) + ->UseManualTime() + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::GAUSSIAN, cuco::reduce_add) + ->Unit(benchmark::kMillisecond) + ->UseManualTime() + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIQUE, cuco::reduce_add) + ->Unit(benchmark::kMillisecond) + ->UseManualTime() + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIFORM, cuco::reduce_add) + ->Unit(benchmark::kMillisecond) + ->UseManualTime() + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::GAUSSIAN, cuco::reduce_add) + ->Unit(benchmark::kMillisecond) + ->UseManualTime() + ->Apply(generate_size_and_occupancy); \ No newline at end of file diff --git a/examples/CMakeLists.txt 
b/examples/CMakeLists.txt index a70b53da8..e840e1905 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -15,7 +15,7 @@ endif() function(ConfigureExample EXAMPLE_NAME EXAMPLE_SRC) add_executable(${EXAMPLE_NAME} "${EXAMPLE_SRC}") set_target_properties(${EXAMPLE_NAME} PROPERTIES - CUDA_ARCHITECTURES ${GPU_ARCHS} + CUDA_ARCHITECTURES "${GPU_ARCHS}" RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/examples") target_include_directories(${EXAMPLE_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 32d77b2a8..471a15a7a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -39,7 +39,7 @@ function(ConfigureTest TEST_NAME TEST_SRC) $) # Link in the CatchMain object file target_link_libraries(${TEST_NAME} Catch2::Catch2 cuco) set_target_properties(${TEST_NAME} PROPERTIES - CUDA_ARCHITECTURES ${GPU_ARCHS} + CUDA_ARCHITECTURES "${GPU_ARCHS}" RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests") target_compile_options(${TEST_NAME} PRIVATE --expt-extended-lambda --expt-relaxed-constexpr) catch_discover_tests(${TEST_NAME}) From 2a38d70c4fc01d98a324d627b41bff739b8f5a66 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 13 May 2021 11:57:30 -0500 Subject: [PATCH 27/69] Remove redundant ctor. 
--- include/cuco/detail/pair.cuh | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/include/cuco/detail/pair.cuh b/include/cuco/detail/pair.cuh index de75ad680..dfdf7632e 100644 --- a/include/cuco/detail/pair.cuh +++ b/include/cuco/detail/pair.cuh @@ -68,8 +68,8 @@ struct is_thrust_pair_like_impl : std::false_type { template struct is_thrust_pair_like_impl(std::declval())), - decltype(thrust::get<1>(std::declval()))>> + std::void_t(std::declval())), + decltype(thrust::get<1>(std::declval()))>> : std::conditional_t::value == 2, std::true_type, std::false_type> { }; @@ -116,10 +116,6 @@ struct alignas(detail::pair_alignment()) pair { thrust::get<1>(thrust::raw_reference_cast(t))} { } - __host__ __device__ constexpr pair(First const& f, Second const& s) noexcept - : first{f}, second{s} - { - } }; template From f2d1a2607c9e36b56bbac5baea47dc5066d78327 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 13 May 2021 11:57:59 -0500 Subject: [PATCH 28/69] Add initial static_reduction_map example. --- examples/CMakeLists.txt | 2 + examples/static_reduction_map.cu | 82 ++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 examples/static_reduction_map.cu diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e840e1905..be1a760e6 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -28,3 +28,5 @@ endfunction(ConfigureExample) ################################################################################################### ConfigureExample(STATIC_MAP_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/static_map_example.cu") + +ConfigureExample(STATIC_REDUCTION_MAP_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_reduction_map.cu") diff --git a/examples/static_reduction_map.cu b/examples/static_reduction_map.cu new file mode 100644 index 000000000..c3921ad10 --- /dev/null +++ b/examples/static_reduction_map.cu @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +/** + * @file host_bulk_example.cu + * @brief Demonstrates usage of the static_map "bulk" host APIs. + * + * The bulk APIs are only invocable from the host and are used for doing operations like insert or + * find on a set of keys. + * + */ + +int main(void) +{ + using Key = int; + using Value = int; + + // Empty slots are represented by reserved "sentinel" values. These values should be selected such + // that they never occur in your input data. + Key const empty_key_sentinel = -1; + + // Number of key/value pairs to be inserted + std::size_t num_keys = 50'000; + + // Compute capacity based on a 50% load factor + auto const load_factor = 0.5; + std::size_t const capacity = std::ceil(num_keys / load_factor); + + // Constructs a map each key with "capacity" slots using -1 as the + // empty key sentinel. The initial payload value for empty slots is determined by the identity of + // the reduction operation. By using the `reduce_add` operation, all values associated with a + // given key will be summed. 
+ cuco::static_reduction_map, Key, Value> map{capacity, empty_key_sentinel}; + + // Create a sequence of random keys in `[0, num_keys/2]` + thrust::device_vector insert_keys(num_keys); + thrust::transform(thrust::device, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(insert_keys.size()), + insert_keys.begin(), + [=] __device__(auto i) { + thrust::default_random_engine rng(i); + thrust::uniform_int_distribution dist{std::size_t{0}, num_keys/2}; + return dist(rng); + }); + + // Insert each key with a payload of `1` to count the number of times each key was inserted by + // using the `reduce_add` op + auto zipped = thrust::make_zip_iterator( + thrust::make_tuple(insert_keys.begin(), thrust::make_constant_iterator(1))); + + // Inserts all pairs into the map, accumulating the payloads with the `reduce_add` operation + map.insert(zipped, zipped + insert_keys.size()); + + std::cout << "Num unique keys: " << map.get_size() << std::endl; + +} \ No newline at end of file From 3c797013e3504a3ce9396430f553be91d7a3bec0 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 13 May 2021 11:58:15 -0500 Subject: [PATCH 29/69] Remove cuda_memcmp header. --- include/cuco/static_reduction_map.cuh | 1 - 1 file changed, 1 deletion(-) diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index b13c1f5af..ebddaebc7 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -24,7 +24,6 @@ #include #include -#include #ifndef CUDART_VERSION #error CUDART_VERSION Undefined! #elif (CUDART_VERSION >= 11000) // including with CUDA 10.2 leads to compilation errors From 8261d939e6f962b10093b37fb58410c8a9d7223e Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Wed, 19 May 2021 12:06:54 -0500 Subject: [PATCH 30/69] Add unsafe accessors to raw slots via reinterpret_cast. 
--- include/cuco/static_reduction_map.cuh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index ebddaebc7..3f98737c8 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -972,6 +972,19 @@ class static_reduction_map { } private: + /// Unsafe access to the slots stripping away their atomic-ness to allow non-atomic access. This + /// is a temporary solution until we have atomic_ref + value_type* raw_slots_begin() noexcept { return reinterpret_cast(slots_); } + + value_type const* raw_slots_begin() const noexcept + { + return reinterpret_cast(slots_); + } + + value_type* raw_slots_end() noexcept { return raw_slots_begin() + get_capacity(); } + + value_type const* raw_slots_end() const noexcept { return raw_slots_begin() + get_capacity(); } + pair_atomic_type* slots_{nullptr}; ///< Pointer to flat slots storage std::size_t capacity_{}; ///< Total number of slots std::size_t size_{}; ///< Number of keys in map From c6daa09029a9305203c7438376039a817185cad9 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Wed, 19 May 2021 12:07:20 -0500 Subject: [PATCH 31/69] Add retrieve_all implementation.
--- include/cuco/detail/static_reduction_map.inl | 39 ++++++++++++++++++++ include/cuco/static_reduction_map.cuh | 23 ++++++++++++ 2 files changed, 62 insertions(+) diff --git a/include/cuco/detail/static_reduction_map.inl b/include/cuco/detail/static_reduction_map.inl index 86b500e4e..bd9907ebc 100644 --- a/include/cuco/detail/static_reduction_map.inl +++ b/include/cuco/detail/static_reduction_map.inl @@ -112,6 +112,45 @@ void static_reduction_map::find( CUCO_CUDA_TRY(cudaDeviceSynchronize()); } +namespace detail { +template +struct slot_to_tuple { + template + __device__ thrust::tuple operator()(S const& s) + { + return thrust::tuple(s.first, s.second); + } +}; + +template +struct slot_is_filled { + Key empty_key_sentinel; + template + __device__ bool operator()(S const& s) + { + return thrust::get<0>(s) != empty_key_sentinel; + } +}; +} // namespace detail + +template +template +void static_reduction_map::retrieve_all( + KeyOut keys_out, ValueOut values_out) +{ + // Convert pair_type to thrust::tuple to allow assigning to a zip iterator + auto begin = thrust::make_transform_iterator(raw_slots_begin(), detail::slot_to_tuple{}); + auto end = begin + get_capacity(); + auto filled = detail::slot_is_filled{get_empty_key_sentinel()}; + auto zipped_out = thrust::make_zip_iterator(thrust::make_tuple(keys_out, values_out)); + + thrust::copy_if(thrust::device, begin, end, zipped_out, filled); +} + template #include #include +#include +#include +#include +#include #include #ifndef CUDART_VERSION @@ -276,6 +280,25 @@ class static_reduction_map { Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; + /** + * @brief Retrieves all of the keys and their associated values. + * + * The order in which keys are returned is implementation defined and not guaranteed to be + * consistent between subsequent calls to `retrieve_all`. 
+ * + * Behavior is undefined if the range beginning at `keys_out` or `values_out` is not large enough + * to contain the number of keys in the map. + * + * @tparam KeyOut Device accessible random access output iterator whose `value_type` is + * convertible from `key_type`. + * @tparam ValueOut Device accessible random access output iterator whose `value_type` is + * convertible from `mapped_type`. + * @param keys_out Beginning output iterator for keys + * @param values_out Beginning output iterator for values + */ + template + void retrieve_all(KeyOut keys_out, ValueOut values_out); + /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. * From 62a99ab6fb9f28614624879168eba468dcc648de Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Wed, 19 May 2021 12:07:33 -0500 Subject: [PATCH 32/69] Add retrieve_all to example. --- examples/static_reduction_map.cu | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/examples/static_reduction_map.cu b/examples/static_reduction_map.cu index c3921ad10..f152ceb78 100644 --- a/examples/static_reduction_map.cu +++ b/examples/static_reduction_map.cu @@ -45,7 +45,7 @@ int main(void) Key const empty_key_sentinel = -1; // Number of key/value pairs to be inserted - std::size_t num_keys = 50'000; + std::size_t num_keys = 257; // Compute capacity based on a 50% load factor auto const load_factor = 0.5; @@ -64,8 +64,9 @@ int main(void) thrust::make_counting_iterator(insert_keys.size()), insert_keys.begin(), [=] __device__(auto i) { - thrust::default_random_engine rng(i); - thrust::uniform_int_distribution dist{std::size_t{0}, num_keys/2}; + thrust::default_random_engine rng; + thrust::uniform_int_distribution dist{0, 10}; + rng.discard(i); return dist(rng); }); @@ -79,4 +80,12 @@ int main(void) std::cout << "Num unique keys: " << map.get_size() << std::endl; + thrust::device_vector unique_keys(map.get_size()); + thrust::device_vector count_per_key(map.get_size()); + +
map.retrieve_all(unique_keys.begin(), count_per_key.begin()); + + for (int i = 0; i < unique_keys.size(); ++i) { + std::cout << "Key: " << unique_keys[i] << " Count: " << count_per_key[i] << std::endl; + } } \ No newline at end of file From c1fe449e1aaf68c15393e777774032a0740859de Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Sun, 1 Aug 2021 21:07:17 +0000 Subject: [PATCH 33/69] Sync static_reduction_map with latest changes in static_map. --- examples/static_reduction_map.cu | 18 +- include/cuco/detail/static_reduction_map.inl | 67 +++--- .../detail/static_reduction_map_kernels.cuh | 43 ++-- include/cuco/detail/traits.hpp | 54 +++++ include/cuco/static_reduction_map.cuh | 195 ++++++++++++------ 5 files changed, 256 insertions(+), 121 deletions(-) create mode 100644 include/cuco/detail/traits.hpp diff --git a/examples/static_reduction_map.cu b/examples/static_reduction_map.cu index f152ceb78..8d3839658 100644 --- a/examples/static_reduction_map.cu +++ b/examples/static_reduction_map.cu @@ -27,14 +27,12 @@ #include /** - * @file host_bulk_example.cu - * @brief Demonstrates usage of the static_map "bulk" host APIs. + * @brief Demonstrates usage of the static_reduction_map "bulk" host APIs. * * The bulk APIs are only invocable from the host and are used for doing operations like insert or * find on a set of keys. * */ - int main(void) { using Key = int; @@ -45,11 +43,14 @@ int main(void) Key const empty_key_sentinel = -1; // Number of key/value pairs to be inserted - std::size_t num_keys = 257; + std::size_t const num_elems = 256; + + // average number of values per distinct key + std::size_t const multiplicity = 4; // Compute capacity based on a 50% load factor auto const load_factor = 0.5; - std::size_t const capacity = std::ceil(num_keys / load_factor); + std::size_t const capacity = std::ceil(num_elems / load_factor); // Constructs a map each key with "capacity" slots using -1 as the // empty key sentinel. 
The initial payload value for empty slots is determined by the identity of @@ -57,15 +58,16 @@ int main(void) // given key will be summed. cuco::static_reduction_map, Key, Value> map{capacity, empty_key_sentinel}; - // Create a sequence of random keys in `[0, num_keys/2]` - thrust::device_vector insert_keys(num_keys); + // Create a sequence of random keys + thrust::device_vector insert_keys(num_elems); thrust::transform(thrust::device, thrust::make_counting_iterator(0), thrust::make_counting_iterator(insert_keys.size()), insert_keys.begin(), [=] __device__(auto i) { thrust::default_random_engine rng; - thrust::uniform_int_distribution dist{0, 10}; + thrust::uniform_int_distribution dist( + Key{1}, static_cast(num_elems / multiplicity)); rng.discard(i); return dist(rng); }); diff --git a/include/cuco/detail/static_reduction_map.inl b/include/cuco/detail/static_reduction_map.inl index bd9907ebc..dcb385ae0 100644 --- a/include/cuco/detail/static_reduction_map.inl +++ b/include/cuco/detail/static_reduction_map.inl @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include + namespace cuco { /**---------------------------------------------------------------------------* @@ -23,7 +25,7 @@ namespace cuco { enum class insert_result { CONTINUE, ///< Insert did not succeed, continue trying to insert SUCCESS, ///< New pair inserted successfully - DUPLICATE ///< Insert did not succeed, key is already present + DUPLICATE ///< Key is already present }; template ::static_reductio op_{reduction_op}, slot_allocator_{alloc} { - slots_ = std::allocator_traits::allocate(slot_allocator_, capacity); + slots_ = std::allocator_traits::allocate(slot_allocator_, capacity_); auto constexpr block_size = 256; auto constexpr stride = 4; - auto const grid_size = (capacity + stride * block_size - 1) / (stride * block_size); - detail::initialize<<>>( + auto const grid_size = (capacity_ + stride * block_size - 1) / (stride * block_size); + detail::initialize<<>>( slots_, get_empty_key_sentinel(), get_empty_value_sentinel(), get_capacity()); CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type))); @@ -58,7 +60,7 @@ template ::~static_reduction_map() { std::allocator_traits::deallocate(slot_allocator_, slots_, capacity_); - CUCO_CUDA_TRY(cudaFree(num_successes_)); + CUCO_ASSERT_CUDA_SUCCESS(cudaFree(num_successes_)); } template ::insert(Inp Hash hash, KeyEqual key_equal) { - auto num_keys = std::distance(first, last); + auto num_keys = std::distance(first, last); + if (num_keys == 0) { return; } + auto const block_size = 128; auto const stride = 1; auto const tile_size = 4; @@ -98,9 +102,11 @@ template template void static_reduction_map::find( - InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) noexcept + InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) { - auto num_keys = std::distance(first, last); + auto num_keys = std::distance(first, last); + if (num_keys == 0) { return; } + auto const block_size = 128; auto const stride = 1; auto const tile_size = 4; @@ -143,7 
+149,8 @@ void static_reduction_map::retrieve_a KeyOut keys_out, ValueOut values_out) { // Convert pair_type to thrust::tuple to allow assigning to a zip iterator - auto begin = thrust::make_transform_iterator(raw_slots_begin(), detail::slot_to_tuple{}); + auto begin = + thrust::make_transform_iterator(raw_slots_begin(), detail::slot_to_tuple{}); auto end = begin + get_capacity(); auto filled = detail::slot_is_filled{get_empty_key_sentinel()}; auto zipped_out = thrust::make_zip_iterator(thrust::make_tuple(keys_out, values_out)); @@ -158,9 +165,11 @@ template template void static_reduction_map::contains( - InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) noexcept + InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) { - auto num_keys = std::distance(first, last); + auto num_keys = std::distance(first, last); + if (num_keys == 0) { return; } + auto const block_size = 128; auto const stride = 1; auto const tile_size = 4; @@ -178,7 +187,7 @@ template template -__device__ Value +__device__ bool static_reduction_map::device_mutable_view::insert( value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept { @@ -195,7 +204,10 @@ static_reduction_map::device_mutable_ slot_key.compare_exchange_strong(expected_key, insert_pair.first, memory_order_relaxed); if (key_success or key_equal(insert_pair.first, expected_key)) { - return this->get_op().apply(slot_value, insert_pair.second); + this->get_op().apply(slot_value, insert_pair.second); + + // only return true if a new has been inserted + return key_success; } // if we couldn't insert the key, but it wasn't a duplicate, then there must @@ -212,7 +224,7 @@ template __device__ bool static_reduction_map::device_mutable_view::insert( - CG g, value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept + CG const& g, value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept { auto current_slot = initial_slot(g, insert_pair.first, 
hash); @@ -224,7 +236,7 @@ static_reduction_map::device_mutable_ // The user provided `key_equal` should never be used to compare against `empty_key_sentinel` as // the sentinel is not a valid key value. Therefore, first check for the sentinel // TODO: Use memcmp - auto const slot_is_empty = (current_key == this->get_empty_key_sentinel()); + auto const slot_is_empty = detail::bitwise_compare(current_key, this->get_empty_key_sentinel()); auto const key_exists = not slot_is_empty and key_equal(current_key, insert_pair.first); @@ -287,7 +299,9 @@ __device__ while (true) { auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); // Key doesn't exist, return end() - if (existing_key == this->get_empty_key_sentinel()) { return this->end(); } + if (detail::bitwise_compare(existing_key, this->get_empty_key_sentinel())) { + return this->end(); + } // Key exists, return iterator to location if (key_equal(existing_key, k)) { return current_slot; } @@ -312,7 +326,9 @@ __device__ typename static_reduction_mapfirst.load(cuda::std::memory_order_relaxed); // Key doesn't exist, return end() - if (existing_key == this->get_empty_key_sentinel()) { return this->end(); } + if (detail::bitwise_compare(existing_key, this->get_empty_key_sentinel())) { + return this->end(); + } // Key exists, return iterator to location if (key_equal(existing_key, k)) { return current_slot; } @@ -330,7 +346,7 @@ template __device__ typename static_reduction_map::device_view::iterator static_reduction_map::device_view::find( - CG g, Key const& k, Hash hash, KeyEqual key_equal) noexcept + CG const& g, Key const& k, Hash hash, KeyEqual key_equal) noexcept { auto current_slot = initial_slot(g, k, hash); @@ -339,7 +355,8 @@ __device__ // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as // the sentinel is not a valid key value. 
Therefore, first check for the sentinel - auto const slot_is_empty = (existing_key == this->get_empty_key_sentinel()); + auto const slot_is_empty = + detail::bitwise_compare(existing_key, this->get_empty_key_sentinel()); // the key we were searching for was found by one of the threads, // so we return an iterator to the entry @@ -370,7 +387,7 @@ template __device__ typename static_reduction_map::device_view:: const_iterator static_reduction_map::device_view::find( - CG g, Key const& k, Hash hash, KeyEqual key_equal) const noexcept + CG const& g, Key const& k, Hash hash, KeyEqual key_equal) const noexcept { auto current_slot = initial_slot(g, k, hash); @@ -379,7 +396,8 @@ __device__ typename static_reduction_mapget_empty_key_sentinel()); + auto const slot_is_empty = + detail::bitwise_compare(existing_key, this->get_empty_key_sentinel()); // the key we were searching for was found by one of the threads, so we return an iterator to // the entry @@ -418,7 +436,7 @@ static_reduction_map::device_view::co while (true) { auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); - if (existing_key == empty_key_sentinel_) { return false; } + if (detail::bitwise_compare(existing_key, empty_key_sentinel_)) { return false; } if (key_equal(existing_key, k)) { return true; } @@ -434,7 +452,7 @@ template __device__ bool static_reduction_map::device_view::contains( - CG g, Key const& k, Hash hash, KeyEqual key_equal) noexcept + CG const& g, Key const& k, Hash hash, KeyEqual key_equal) noexcept { auto current_slot = initial_slot(g, k, hash); @@ -443,7 +461,8 @@ static_reduction_map::device_view::co // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as // the sentinel is not a valid key value. 
Therefore, first check for the sentinel - auto const slot_is_empty = (existing_key == this->get_empty_key_sentinel()); + auto const slot_is_empty = + detail::bitwise_compare(existing_key, this->get_empty_key_sentinel()); // the key we were searching for was found by one of the threads, so we return an iterator to // the entry diff --git a/include/cuco/detail/static_reduction_map_kernels.cuh b/include/cuco/detail/static_reduction_map_kernels.cuh index 9849efb44..93c86d5ff 100644 --- a/include/cuco/detail/static_reduction_map_kernels.cuh +++ b/include/cuco/detail/static_reduction_map_kernels.cuh @@ -34,18 +34,19 @@ namespace cg = cooperative_groups; * @param v Value to which all values in `slots` are initialized * @param size Size of the storage pointed to by `slots` */ -template __global__ void initialize(pair_atomic_type* const slots, Key k, Value v, std::size_t size) { - auto tid = threadIdx.x + blockIdx.x * blockDim.x; + auto tid = block_size * blockIdx.x + threadIdx.x; while (tid < size) { new (&slots[tid].first) atomic_key_type{k}; new (&slots[tid].second) atomic_mapped_type{v}; - tid += gridDim.x * blockDim.x; + tid += gridDim.x * block_size; } } @@ -69,7 +70,7 @@ __global__ void initialize(pair_atomic_type* const slots, Key k, Value v, std::s * @param hash The unary function to apply to hash each key * @param key_equal The binary function used to compare two keys for equality */ -template (cg::this_thread_block()); - auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto tid = block_size * blockIdx.x + threadIdx.x; auto it = first + tid / tile_size; while (it < last) { @@ -149,7 +150,7 @@ __global__ void insert( if (view.insert(tile, insert_pair, hash, key_equal) && tile.thread_rank() == 0) { thread_num_successes++; } - it += (gridDim.x * blockDim.x) / tile_size; + it += (gridDim.x * block_size) / tile_size; } // compute number of successfully inserted elements for each block @@ -179,7 +180,7 @@ __global__ void insert( * @param hash The unary function to 
apply to hash each key * @param key_equal The binary function to compare two keys for equality */ -template second.load(cuda::std::memory_order_relaxed); __syncthreads(); *(output_begin + key_idx) = writeBuffer[threadIdx.x]; - key_idx += gridDim.x * blockDim.x; + key_idx += gridDim.x * block_size; } } @@ -237,7 +238,7 @@ __global__ void find( * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality */ -template (cg::this_thread_block()); - auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto tid = block_size * blockIdx.x + threadIdx.x; auto key_idx = tid / tile_size; __shared__ Value writeBuffer[block_size]; @@ -271,7 +272,7 @@ __global__ void find( if (tile.thread_rank() == 0) { *(output_begin + key_idx) = writeBuffer[threadIdx.x / tile_size]; } - key_idx += (gridDim.x * blockDim.x) / tile_size; + key_idx += (gridDim.x * block_size) / tile_size; } } @@ -295,7 +296,7 @@ __global__ void find( * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality */ -template (cg::this_thread_block()); - auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto tid = block_size * blockIdx.x + threadIdx.x; auto key_idx = tid / tile_size; __shared__ bool writeBuffer[block_size]; @@ -381,7 +382,7 @@ __global__ void contains( if (tile.thread_rank() == 0) { *(output_begin + key_idx) = writeBuffer[threadIdx.x / tile_size]; } - key_idx += (gridDim.x * blockDim.x) / tile_size; + key_idx += (gridDim.x * block_size) / tile_size; } } diff --git a/include/cuco/detail/traits.hpp b/include/cuco/detail/traits.hpp new file mode 100644 index 000000000..53ef38433 --- /dev/null +++ b/include/cuco/detail/traits.hpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + */ + +#pragma once + +namespace cuco { +/** + * @brief Customization point that can be specialized to indicate that it is safe to perform bitwise + * equality comparisons on objects of type `T`. + * + * By default, only types where `std::has_unique_object_representations_v` is true are safe for + * bitwise equality. However, this can be too restrictive for some types, e.g., floating point + * types. + * + * User-defined specializations of `is_bitwise_comparable` are allowed, but it is the users + * responsibility to ensure values do not occur that would lead to unexpected behavior. For example, + * if a `NaN` bit pattern were used as the empty sentinel value, it may not compare bitwise equal to + * other `NaN` bit patterns. + * + */ +template +struct is_bitwise_comparable : std::false_type { +}; + +/// By default, only types with unique object representations are allowed +template +struct is_bitwise_comparable>> + : std::true_type { +}; + +/** + * @brief Declares that a type `Type` is bitwise comparable. 
+ * + */ +#define CUCO_DECLARE_BITWISE_COMPARABLE(Type) \ + namespace cuco { \ + template <> \ + struct is_bitwise_comparable : std::true_type { \ + }; \ + } + +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index c66958fb8..56c62bde8 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -17,20 +17,24 @@ #pragma once #include +#include #include #include -#include -#include -#include -#include #include #include #include +#include +#include +#include #include -#ifndef CUDART_VERSION -#error CUDART_VERSION Undefined! -#elif (CUDART_VERSION >= 11000) // including with CUDA 10.2 leads to compilation errors + +#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11000) && defined(__CUDA_ARCH__) && \ + (__CUDA_ARCH__ >= 700) +#define CUCO_HAS_CUDA_BARRIER +#endif + +#if defined(CUCO_HAS_CUDA_BARRIER) #include #endif @@ -38,6 +42,7 @@ #include #include #include +#include namespace cuco { @@ -107,17 +112,18 @@ struct custom_op { /** * @brief A GPU-accelerated, unordered, associative container of key-value - * pairs with unique keys. + * pairs that reduces the values associated to the same key according to a + * functor. * * Allows constant time concurrent inserts or concurrent find operations (not * concurrent insert and find) from threads in device code. 
* * Current limitations: - * - Requires keys that are Arithmetic + * - Requires key types where `cuco::is_bitwise_comparable::value` is true * - Does not support erasing keys * - Capacity is fixed and will not grow automatically - * - Requires the user to specify sentinel values for both key and mapped value - * to indicate empty slots + * - Requires the user to specify sentinel value for the key to indicate empty + * slots * - Does not support concurrent insert and find operations * * The `static_reduction_map` supports two types of operations: @@ -129,7 +135,7 @@ struct custom_op { * in the map. For example, given a range of keys specified by device-accessible * iterators, the bulk `insert` function will insert all keys into the map. * - * The singular device-side operations allow individual threads to to perform + * The singular device-side operations allow individual threads to perform * independent insert or find/contains operations from device code. These * operations are accessed through non-owning, trivially copyable "view" types: * `device_view` and `mutable_device_view`. The `device_view` class is an @@ -138,26 +144,51 @@ struct custom_op { * The two types are separate to prevent erroneous concurrent insert/find * operations. * - * Example: - * \code{.cpp} - * int empty_key_sentinel = -1; - * int empty_value_sentine = -1; + * Example: + * \code{.cpp} + * + * // Empty slots are represented by reserved "sentinel" values. These values should be selected + * such + * // that they never occur in your input data. 
+ * int const empty_key_sentinel = -1; + * + * // Number of key/value pairs to be inserted + * std::size_t const num_elems = 256; + * + * // average number of values per distinct key + * std::size_t const multiplicity = 4; + * + * // Compute capacity based on a 50% load factor + * auto const load_factor = 0.5; + * std::size_t const capacity = std::ceil(num_elems / load_factor); * - * // Constructs a map with 100,000 slots using -1 and -1 as the empty key/value - * // sentinels. Note the capacity is chosen knowing we will insert 50,000 keys, - * // for an load factor of 50%. - * static_reduction_map m{100'000, empty_key_sentinel, empty_value_sentinel}; + * // Constructs a map each key with "capacity" slots using -1 as the + * // empty key sentinel. The initial payload value for empty slots is determined by the identity of + * // the reduction operation. By using the `reduce_add` operation, all values associated with a + * // given key will be summed. + * cuco::static_reduction_map, int, int> map{capacity, empty_key_sentinel}; * - * // Create a sequence of pairs {{0,0}, {1,1}, ... 
{i,i}} - * thrust::device_vector> pairs(50,000); - * thrust::transform(thrust::make_counting_iterator(0), - * thrust::make_counting_iterator(pairs.size()), - * pairs.begin(), - * []__device__(auto i){ return thrust::make_pair(i,i); }; + * // Create a sequence of random keys + * thrust::device_vector insert_keys(num_elems); + * thrust::transform(thrust::device, + * thrust::make_counting_iterator(0), + * thrust::make_counting_iterator(insert_keys.size()), + * insert_keys.begin(), + * [=] __device__(auto i) { + * thrust::default_random_engine rng; + * thrust::uniform_int_distribution dist( + * int{1}, static_cast(num_elems / multiplicity)); + * rng.discard(i); + * return dist(rng); + * }); * + * // Insert each key with a payload of `1` to count the number of times each key was inserted by + * // using the `reduce_add` op + * auto zipped = thrust::make_zip_iterator( + * thrust::make_tuple(insert_keys.begin(), thrust::make_constant_iterator(1))); * - * // Inserts all pairs into the map - * m.insert(pairs.begin(), pairs.end()); + * // Inserts all pairs into the map, accumulating the payloads with the `reduce_add` operation + * map.insert(zipped, zipped + insert_keys.size()); * * // Get a `device_view` and passes it to a kernel where threads may perform * // `find/contains` lookups @@ -177,7 +208,11 @@ template > class static_reduction_map { - static_assert(std::is_arithmetic::value, "Unsupported, non-arithmetic key type."); + static_assert( + is_bitwise_comparable::value, + "Key type must have unique object representations or have been explicitly declared as safe for " + "bitwise comparison via specialization of cuco::is_bitwise_comparable."); + static_assert(std::is_same::value, "Type mismatch between ReductionOp::value_type and Value"); @@ -193,32 +228,38 @@ class static_reduction_map { using slot_allocator_type = typename std::allocator_traits::rebind_alloc; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700) + static_assert(atomic_key_type::is_always_lock_free, + "A 
key type larger than 8B is supported for only sm_70 and up."); + static_assert(atomic_mapped_type::is_always_lock_free, + "A value type larger than 8B is supported for only sm_70 and up."); +#endif + static_reduction_map(static_reduction_map const&) = delete; static_reduction_map(static_reduction_map&&) = delete; static_reduction_map& operator=(static_reduction_map const&) = delete; static_reduction_map& operator=(static_reduction_map&&) = delete; /** - * @brief Construct a fixed-size map with the specified capacity and sentinel values. + * @brief Construct a fixed-size map with the specified capacity and sentinel key. * @brief Construct a statically sized map with the specified number of slots - * and sentinel values. + * and sentinel key. * * The capacity of the map is fixed. Insert operations will not automatically * grow the map. Attempting to insert more unique keys than the capacity of - * the map results in undefined behavior. + * the map results in undefined behavior (there should be at least one empty slot). * * Performance begins to degrade significantly beyond a load factor of ~70%. * For best performance, choose a capacity that will keep the load factor * below 70%. E.g., if inserting `N` unique keys, choose a capacity of * `N * (1/0.7)`. * - * The `empty_key_sentinel` and `empty_value_sentinel` values are reserved and - * undefined behavior results from attempting to insert any key/value pair - * that contains either. + * The `empty_key_sentinel` is reserved and undefined behaviour results from + * attempting to insert said key. 
* * @param capacity The total number of slots in the map * @param empty_key_sentinel The reserved key value for empty slots - * @param empty_value_sentinel The reserved mapped value for empty slots + * @param reduction_op Reduction operator * @param alloc Allocator used for allocating device storage */ static_reduction_map(std::size_t capacity, @@ -235,9 +276,6 @@ class static_reduction_map { /** * @brief Inserts all key/value pairs in the range `[first, last)`. * - * If multiple keys in `[first, last)` compare equal, it is unspecified which - * element is inserted. - * * @tparam InputIt Device accessible input iterator whose `value_type` is * convertible to the map's `value_type` * @tparam Hash Unary callable type @@ -278,7 +316,7 @@ class static_reduction_map { InputIt last, OutputIt output_begin, Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}) noexcept; + KeyEqual key_equal = KeyEqual{}); /** * @brief Retrieves all of the keys and their associated values. @@ -324,7 +362,7 @@ class static_reduction_map { InputIt last, OutputIt output_begin, Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}) noexcept; + KeyEqual key_equal = KeyEqual{}); private: class device_view_base { @@ -360,7 +398,7 @@ class static_reduction_map { * @brief Gets the binary op * */ - __device__ ReductionOp get_op() const { return op_; } + __device__ ReductionOp get_op() const noexcept { return op_; } /** * @brief Gets slots array. 
@@ -417,7 +455,7 @@ class static_reduction_map { * @return Pointer to the initial slot for `k` */ template - __device__ iterator initial_slot(CG g, Key const& k, Hash hash) noexcept + __device__ iterator initial_slot(CG const& g, Key const& k, Hash hash) noexcept { return &slots_[(hash(k) + g.thread_rank()) % capacity_]; } @@ -435,7 +473,7 @@ class static_reduction_map { * @return Pointer to the initial slot for `k` */ template - __device__ const_iterator initial_slot(CG g, Key const& k, Hash hash) const noexcept + __device__ const_iterator initial_slot(CG const& g, Key const& k, Hash hash) const noexcept { return &slots_[(hash(k) + g.thread_rank()) % capacity_]; } @@ -475,7 +513,7 @@ class static_reduction_map { * @return The next slot after `s` */ template - __device__ iterator next_slot(CG g, iterator s) noexcept + __device__ iterator next_slot(CG const& g, iterator s) noexcept { uint32_t index = s - slots_; return &slots_[(index + g.size()) % capacity_]; @@ -493,7 +531,7 @@ class static_reduction_map { * @return The next slot after `s` */ template - __device__ const_iterator next_slot(CG g, const_iterator s) const noexcept + __device__ const_iterator next_slot(CG const& g, const_iterator s) const noexcept { uint32_t index = s - slots_; return &slots_[(index + g.size()) % capacity_]; @@ -599,7 +637,7 @@ class static_reduction_map { * * Example: * \code{.cpp} - * cuco::static_reduction_map m{100'000, -1, -1}; + * cuco::static_reduction_mapint,int> m{100'000, -1}; * * // Inserts a sequence of pairs {{0,0}, {1,1}, ... 
{i,i}} * thrust::for_each(thrust::make_counting_iterator(0), @@ -635,7 +673,17 @@ class static_reduction_map { : device_view_base{slots, capacity, empty_key_sentinel, reduction_op} { } - + template + __device__ static device_mutable_view make_from_uninitialized_slots( + CG const& g, + pair_atomic_type* slots, + std::size_t capacity, + Key empty_key_sentinel, + ReductionOp reduction_op) noexcept + { + device_view_base::initialize_slots(g, slots, capacity, empty_key_sentinel, reduction_op); + return device_mutable_view{slots, capacity, empty_key_sentinel, reduction_op}; + } /** * @brief Inserts the specified key/value pair into the map. * @@ -649,13 +697,13 @@ class static_reduction_map { * @param hash The unary callable used to hash the key * @param key_equal The binary callable used to compare two keys for * equality - * @return `true` if the insert was successful, `false` otherwise. + * @return `true` if the insert (of a new key) was successful, `false` otherwise. */ template , typename KeyEqual = thrust::equal_to> - __device__ Value insert(value_type const& insert_pair, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}) noexcept; + __device__ bool insert(value_type const& insert_pair, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; /** * @brief Inserts the specified key/value pair into the map. * @@ -666,7 +714,7 @@ class static_reduction_map { * significant boost in throughput compared to the non Cooperative Group * `insert` at moderate to high load factors. * - * @tparam Cooperative Group type + * @tparam CG Cooperative Group type * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type * @@ -675,16 +723,15 @@ class static_reduction_map { * @param hash The unary callable used to hash the key * @param key_equal The binary callable used to compare two keys for * equality - * @return `true` if the insert was successful, `false` otherwise. + * @return `true` if the insert (of a new key) was successful, `false` otherwise. 
*/ template , typename KeyEqual = thrust::equal_to> - __device__ bool insert(CG g, + __device__ bool insert(CG const& g, value_type const& insert_pair, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; - }; // class device mutable view /** @@ -710,8 +757,7 @@ class static_reduction_map { * @param capacity The number of slots viewed by this object * @param empty_key_sentinel The reserved value for keys to represent empty * slots - * @param empty_value_sentinel The reserved value for mapped values to - * represent empty slots + * @param reduction_op The reduction functor */ __host__ __device__ device_view(pair_atomic_type* slots, std::size_t capacity, @@ -721,6 +767,19 @@ class static_reduction_map { { } + /** + * @brief Construct a `device_view` from a `device_mutable_view` object + * + * @param mutable_map object of type `device_mutable_view` + */ + __host__ __device__ explicit device_view(device_mutable_view mutable_map) + : device_view_base{mutable_map.get_slots(), + mutable_map.get_capacity(), + mutable_map.get_empty_key_sentinel(), + mutable_map.get_op()} + { + } + /** * @brief Makes a copy of given `device_view` using non-owned memory. * @@ -752,20 +811,18 @@ class static_reduction_map { * @endcode * * @tparam CG The type of the cooperative thread group - * @param g The ooperative thread group used to copy the slots + * @param g The cooperative thread group used to copy the slots * @param source_device_view `device_view` to copy from * @param memory_to_use Array large enough to support `capacity` elements. Object does not take * the ownership of the memory * @return Copy of passed `device_view` */ template - __device__ static device_view make_copy(CG g, + __device__ static device_view make_copy(CG const& g, pair_atomic_type* const memory_to_use, device_view source_device_view) noexcept { -#ifndef CUDART_VERSION -#error CUDART_VERSION Undefined! 
-#elif (CUDART_VERSION >= 11000) +#if defined(CUDA_HAS_CUDA_BARRIER) __shared__ cuda::barrier barrier; if (g.thread_rank() == 0) { init(&barrier, g.size()); } g.sync(); @@ -791,7 +848,7 @@ class static_reduction_map { return device_view(memory_to_use, source_device_view.get_capacity(), source_device_view.get_empty_key_sentinel(), - source_device_view.get_empty_value_sentinel()); + source_device_view.get_op()); } /** @@ -859,7 +916,7 @@ class static_reduction_map { typename Hash = cuco::detail::MurmurHash3_32, typename KeyEqual = thrust::equal_to> __device__ iterator - find(CG g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; + find(CG const& g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; /** * @brief Finds the value corresponding to the key `k`. @@ -884,8 +941,10 @@ class static_reduction_map { template , typename KeyEqual = thrust::equal_to> - __device__ const_iterator - find(CG g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) const noexcept; + __device__ const_iterator find(CG const& g, + Key const& k, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) const noexcept; /** * @brief Indicates whether the key `k` was inserted into the map. @@ -931,7 +990,7 @@ class static_reduction_map { template , typename KeyEqual = thrust::equal_to> - __device__ bool contains(CG g, + __device__ bool contains(CG const& g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; From fb9c0ecf7f09cf9d5f526f8315236656e476fa28 Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Sun, 1 Aug 2021 21:09:07 +0000 Subject: [PATCH 34/69] Tests for static_reduction_map added. 
--- tests/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 40bd2b30a..45435b14e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -45,3 +45,8 @@ set(DYNAMIC_MAP_TEST_SRC ConfigureTest(DYNAMIC_MAP_TEST "${DYNAMIC_MAP_TEST_SRC}") #################################################################################################### +set(STATIC_REDUCTION_MAP_TEST_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/static_reduction_map/static_reduction_map_test.cu") + +ConfigureTest(STATIC_REDUCTION_MAP_TEST "${STATIC_REDUCTION_MAP_TEST_SRC}") +#################################################################################################### From e8e54611c8909be966bfb7db526594fd30a370fc Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Sun, 1 Aug 2021 21:15:02 +0000 Subject: [PATCH 35/69] Benchmarks for static_reduction_map added + reduce-by-key performance comparison (cuco, Thrust, CUB). --- benchmarks/CMakeLists.txt | 48 ++++- .../hash_table/static_reduction_map_bench.cu | 202 +++++++++--------- benchmarks/key_generator.hpp | 164 ++++++++++++++ .../reduce_by_key/cub_reduce_by_key_bench.cu | 119 +++++++++++ .../reduce_by_key/cuco_reduce_by_key_bench.cu | 151 +++++++++++++ benchmarks/reduce_by_key/reduce_by_key.cu | 88 -------- .../thrust_reduce_by_key_bench.cu | 107 ++++++++++ benchmarks/util.hpp | 40 ++++ 8 files changed, 728 insertions(+), 191 deletions(-) create mode 100644 benchmarks/key_generator.hpp create mode 100644 benchmarks/reduce_by_key/cub_reduce_by_key_bench.cu create mode 100644 benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu delete mode 100644 benchmarks/reduce_by_key/reduce_by_key.cu create mode 100644 benchmarks/reduce_by_key/thrust_reduce_by_key_bench.cu create mode 100644 benchmarks/util.hpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 467893be6..24932576c 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -12,10 +12,19 @@ 
CPMAddPackage( "RUN_HAVE_STD_REGEX 0" # ) -if (benchmark_ADDED) - # patch google benchmark target - set_target_properties(benchmark PROPERTIES CXX_STANDARD 14) -endif() +#if (benchmark_ADDED) +# # patch google benchmark target +# set_target_properties(benchmark PROPERTIES CXX_STANDARD 14) +#endif() + +CPMAddPackage( + NAME nvbench + GITHUB_REPOSITORY NVIDIA/nvbench + GIT_TAG main + GIT_SHALLOW TRUE +) + +set_target_properties(benchmark PROPERTIES CXX_STANDARD 17) ################################################################################################### # - compiler function ----------------------------------------------------------------------------- @@ -35,6 +44,22 @@ function(ConfigureBench BENCH_NAME BENCH_SRC) CUDA::cudart) endfunction(ConfigureBench) +################################################################################################### +function(ConfigureNVBench BENCH_NAME BENCH_SRC) + add_executable(${BENCH_NAME} "${BENCH_SRC}") + set_target_properties(${BENCH_NAME} PROPERTIES + POSITION_INDEPENDENT_CODE ON + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/nvbenchmarks") + target_include_directories(${BENCH_NAME} PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}") + #"${NVBench_SOURCE_DIR}") + target_compile_options(${BENCH_NAME} PRIVATE --expt-extended-lambda --expt-relaxed-constexpr) + target_link_libraries(${BENCH_NAME} PRIVATE + nvbench::main + pthread + cuco) +endfunction(ConfigureNVBench) + ################################################################################################### ### test sources ################################################################################## ################################################################################################### @@ -48,8 +73,17 @@ set(STATIC_MAP_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_map_benc ConfigureBench(STATIC_MAP_BENCH "${STATIC_MAP_BENCH_SRC}") ################################################################################################### 
-ConfigureBench(STATIC_REDUCTION_MAP_BENCH "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_reduction_map_bench.cu") +set(STATIC_REDUCTION_MAP_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_reduction_map_bench.cu") +ConfigureNVBench(STATIC_REDUCTION_MAP_BENCH "${STATIC_REDUCTION_MAP_BENCH_SRC}") + +################################################################################################### +set(CUCO_RBK_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reduce_by_key/cuco_reduce_by_key_bench.cu") +ConfigureNVBench(CUCO_RBK_BENCH "${CUCO_RBK_BENCH_SRC}") + +################################################################################################### +set(THRUST_RBK_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reduce_by_key/thrust_reduce_by_key_bench.cu") +ConfigureNVBench(THRUST_RBK_BENCH "${THRUST_RBK_BENCH_SRC}") ################################################################################################### -set(RBK_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reduce_by_key/reduce_by_key.cu") -ConfigureBench(RBK_BENCH "${RBK_BENCH_SRC}") +set(CUB_RBK_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reduce_by_key/cub_reduce_by_key_bench.cu") +ConfigureNVBench(CUB_RBK_BENCH "${CUB_RBK_BENCH_SRC}") diff --git a/benchmarks/hash_table/static_reduction_map_bench.cu b/benchmarks/hash_table/static_reduction_map_bench.cu index 92a2ab788..ea6580554 100644 --- a/benchmarks/hash_table/static_reduction_map_bench.cu +++ b/benchmarks/hash_table/static_reduction_map_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,117 +14,127 @@ * limitations under the License. 
*/ -#include -#include #include -#include -#include -#include -#include -#include "cuco/static_reduction_map.cuh" +#include +#include +#include +#include +#include -enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; +/** + * @brief Enum representation for reduction operators + */ +enum class op_type { REDUCE_ADD, CUSTOM_OP }; + +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + // Enum type: + op_type, + // Callable to generate input strings: + // Short identifier used for tables, command-line args, etc. + // Used when context is available to figure out the enum type. + [](op_type o) { + switch (o) { + case op_type::REDUCE_ADD: return "REDUCE_ADD"; + case op_type::CUSTOM_OP: return "CUSTOM_OP"; + default: return "ERROR"; + } + }, + // Callable to generate descriptions: + // If non-empty, these are used in `--list` to describe values. + // Used when context may not be available to figure out the type from the + // input string. + // Just use `[](auto) { return std::string{}; }` if you don't want these. 
+ [](auto) { return std::string{}; }) -template -static void generate_keys(OutputIt output_begin, OutputIt output_end) -{ - auto num_keys = std::distance(output_begin, output_end); - - std::random_device rd; - std::mt19937 gen{rd()}; - - switch (Dist) { - case dist_type::UNIQUE: - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = i; - } - break; - case dist_type::UNIFORM: - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = std::abs(static_cast(gen())); - } - break; - case dist_type::GAUSSIAN: - std::normal_distribution<> dg{1e9, 1e7}; - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = std::abs(static_cast(dg(gen))); - } - break; - } -} +/** + * @brief Maps the enum value of a cuco reduction operator to its actual type + */ +template +struct op_type_map { +}; + +template <> +struct op_type_map { + template + using type = cuco::reduce_add; +}; + +template <> +struct op_type_map { + template + using type = cuco::custom_op>; // sum reduction with CAS loop +}; /** - * @brief Generates input sizes and hash table occupancies - * + * @brief A benchmark evaluating insert performance. 
*/ -static void generate_size_and_occupancy(benchmark::internal::Benchmark* b) +template +void nvbench_cuco_static_reduction_map_insert( + nvbench::state& state, nvbench::type_list>) { - for (auto size = 4096; size <= 1 << 28; size *= 2) { - for (auto occupancy = 60; occupancy <= 60; occupancy += 10) { - b->Args({size, occupancy}); - } - } -} + using map_type = cuco::static_reduction_map::type, Key, Value>; -template typename ReductionOp> -static void BM_static_map_insert(::benchmark::State& state) -{ - using map_type = cuco::static_reduction_map, Key, Value>; + auto const num_elems = state.get_int64("NumInputs"); + auto const occupancy = state.get_float64("Occupancy"); + auto const dist = state.get_string("Distribution"); + auto const multiplicity = state.get_int64_or_default("Multiplicity", 8); - std::size_t num_keys = state.range(0); - float occupancy = state.range(1) / float{100}; - std::size_t size = num_keys / occupancy; + std::vector h_keys(num_elems); + std::vector h_values(num_elems); - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); + generate_keys(state, dist, h_keys.begin(), h_keys.end(), multiplicity); - generate_keys(h_keys.begin(), h_keys.end()); + // generate uniform random values + generate_keys(state, "UNIFORM", h_values.begin(), h_values.end(), 1); + + // the size of the hash table under a given target occupancy depends on the + // number of unique keys in the input + std::size_t const unique = count_unique(h_keys.begin(), h_keys.end()); + std::size_t const capacity = std::ceil(SDIV(unique, occupancy)); + + // alternative occupancy calculation based on the total number of inputs + // std::size_t const capacity = num_elems / occupancy; thrust::device_vector d_keys(h_keys); - thrust::device_vector d_values(h_keys); + thrust::device_vector d_values(h_values); - auto pairs_begin = + auto d_pairs_begin = thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); - auto pairs_end = pairs_begin + num_keys; + auto 
d_pairs_end = d_pairs_begin + num_elems; - for (auto _ : state) { - map_type map{size, -1}; - { - cuda_event_timer raii{state}; - map.insert(pairs_begin, pairs_end); - } - } + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + map_type map{capacity, -1}; - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * - int64_t(state.range(0))); + timer.start(); + // TODO use CUDA stream provided by nvbench::launch + map.insert(d_pairs_begin, d_pairs_end); + timer.stop(); + }); } -BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIQUE, cuco::reduce_add) - ->Unit(benchmark::kMillisecond) - ->UseManualTime() - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIFORM, cuco::reduce_add) - ->Unit(benchmark::kMillisecond) - ->UseManualTime() - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::GAUSSIAN, cuco::reduce_add) - ->Unit(benchmark::kMillisecond) - ->UseManualTime() - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIQUE, cuco::reduce_add) - ->Unit(benchmark::kMillisecond) - ->UseManualTime() - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIFORM, cuco::reduce_add) - ->Unit(benchmark::kMillisecond) - ->UseManualTime() - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::GAUSSIAN, cuco::reduce_add) - ->Unit(benchmark::kMillisecond) - ->UseManualTime() - ->Apply(generate_size_and_occupancy); \ No newline at end of file +// type parameter dimensions for benchmark +using key_type_range = nvbench::type_list; +using value_type_range = nvbench::type_list; +using op_type_range = nvbench::enum_type_list; + +// benchmark setups 
+NVBENCH_BENCH_TYPES(nvbench_cuco_static_reduction_map_insert, + NVBENCH_TYPE_AXES(key_type_range, value_type_range, op_type_range)) + .set_name("cuco_static_reduction_map_insert_occupancy") + .set_type_axes_names({"Key", "Value", "ReductionOp"}) + .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. + .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs + .add_float64_axis("Occupancy", nvbench::range(0.5, 0.9, 0.1)) // occupancy range + .add_int64_axis("Multiplicity", {8}) // only applies to uniform distribution + .add_string_axis("Distribution", {"GAUSSIAN", "UNIFORM", "UNIQUE", "SAME"}); + +NVBENCH_BENCH_TYPES(nvbench_cuco_static_reduction_map_insert, + NVBENCH_TYPE_AXES(key_type_range, value_type_range, op_type_range)) + .set_name("cuco_static_reduction_map_insert_multiplicity") + .set_type_axes_names({"Key", "Value", "ReductionOp"}) + .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. + .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs + .add_float64_axis("Occupancy", {0.8}) // fixed occupancy + .add_int64_axis("Multiplicity", {1, 10, 100, 1'000, 10'000, 100'000}) // key multiplicity range + .add_string_axis("Distribution", {"UNIFORM"}); \ No newline at end of file diff --git a/benchmarks/key_generator.hpp b/benchmarks/key_generator.hpp new file mode 100644 index 000000000..c16015866 --- /dev/null +++ b/benchmarks/key_generator.hpp @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +enum class dist_type { GAUSSIAN, GEOMETRIC, UNIFORM, UNIQUE, SAME }; + +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + // Enum type: + dist_type, + // Callable to generate input strings: + // Short identifier used for tables, command-line args, etc. + // Used when context is available to figure out the enum type. + [](dist_type d) { + switch (d) { + case dist_type::GAUSSIAN: return "GAUSSIAN"; + case dist_type::GEOMETRIC: return "GEOMETRIC"; + case dist_type::UNIFORM: return "UNIFORM"; + case dist_type::UNIQUE: return "UNIQUE"; + case dist_type::SAME: return "SAME"; + default: return "ERROR"; + } + }, + // Callable to generate descriptions: + // If non-empty, these are used in `--list` to describe values. + // Used when context may not be available to figure out the type from the + // input string. + // Just use `[](auto) { return std::string{}; }` if you don't want these. 
+ [](auto) { return std::string{}; }) + +template +static void generate_keys(nvbench::state& state, + dist_type dist, + OutputIt output_begin, + OutputIt output_end, + std::size_t multiplicity = 8) +{ + auto const num_keys = std::distance(output_begin, output_end); + + std::random_device rd; + std::mt19937 gen{rd()}; + + switch (dist) { + case dist_type::GAUSSIAN: { + auto const mean = static_cast(num_keys / 2); + auto const dev = static_cast(num_keys / 5); + + std::normal_distribution<> distribution{mean, dev}; + + for (auto i = 0; i < num_keys; ++i) { + auto k = distribution(gen); + while (k >= num_keys) { + k = distribution(gen); + } + output_begin[i] = k; + } + break; + } + case dist_type::GEOMETRIC: { + auto const max = std::numeric_limits::max(); + auto const coeff = static_cast(num_keys) / static_cast(max); + // Random sampling in range [0, INT32_MAX] + std::geometric_distribution distribution{1e-9}; + + for (auto i = 0; i < num_keys; ++i) { + output_begin[i] = distribution(gen) * coeff; + } + break; + } + case dist_type::UNIFORM: { + std::uniform_int_distribution distribution{1, static_cast(num_keys / multiplicity)}; + + for (auto i = 0; i < num_keys; ++i) { + output_begin[i] = distribution(gen); + } + break; + } + case dist_type::UNIQUE: { + // 3 because some HT implementations use 0, 1 as sentinels + for (auto i = 2; i < num_keys + 2; ++i) { + output_begin[i] = i; + } + std::random_shuffle(output_begin, output_end); + break; + } + case dist_type::SAME: { + std::fill(output_begin, output_end, Key(42)); + break; + } + default: { + state.skip("unknown distribution type"); + break; + } + } // switch +} + +template +static void generate_keys(nvbench::state& state, + std::string const& dist, + OutputIt output_begin, + OutputIt output_end, + std::size_t multiplicity = 8) +{ + dist_type enum_value{}; + + if (dist == "GAUSSIAN") { + enum_value = dist_type::GAUSSIAN; + } else if (dist == "GEOMETRIC") { + enum_value = dist_type::GEOMETRIC; + } else if (dist == 
"UNIFORM") { + enum_value = dist_type::UNIFORM; + } else if (dist == "UNIQUE") { + enum_value = dist_type::UNIQUE; + } else if (dist == "SAME") { + enum_value = dist_type::SAME; + } else { + state.skip("unknown distribution type"); + return; + } + + generate_keys(state, enum_value, output_begin, output_end, multiplicity); +} + +template +static void generate_prob_keys(double const matching_rate, + OutputIt output_begin, + OutputIt output_end) +{ + auto const num_keys = std::distance(output_begin, output_end); + auto const max = std::numeric_limits::max() - 2; + + std::random_device rd; + std::mt19937 gen{rd()}; + + std::uniform_real_distribution rate_dist(0.0, 1.0); + std::uniform_int_distribution non_match_dist{static_cast(num_keys + 2), max}; + + for (auto i = 0; i < num_keys; ++i) { + auto const tmp_rate = rate_dist(gen); + + if (tmp_rate > matching_rate) { output_begin[i] = non_match_dist(gen); } + } + + std::random_shuffle(output_begin, output_end); +} \ No newline at end of file diff --git a/benchmarks/reduce_by_key/cub_reduce_by_key_bench.cu b/benchmarks/reduce_by_key/cub_reduce_by_key_bench.cu new file mode 100644 index 000000000..0f481c158 --- /dev/null +++ b/benchmarks/reduce_by_key/cub_reduce_by_key_bench.cu @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +/** + * @brief A benchmark evaluating reduce-by-key performance. 
+ */ +template +void nvbench_cub_reduce_by_key(nvbench::state& state, nvbench::type_list) +{ + auto const num_elems_in = state.get_int64("NumInputs"); + auto const dist = state.get_string("Distribution"); + auto const multiplicity = state.get_int64_or_default("Multiplicity", 8); + + std::vector h_keys(num_elems_in); + std::vector h_values(num_elems_in); + + generate_keys(state, dist, h_keys.begin(), h_keys.end(), multiplicity); + + // generate uniform random values + generate_keys(state, "UNIFORM", h_values.begin(), h_values.end(), 1); + + // double buffer (ying/yang) + thrust::device_vector d_keys_ying(h_keys); + thrust::device_vector d_values_ying(h_values); + + thrust::device_vector d_keys_yang(num_elems_in); + thrust::device_vector d_values_yang(num_elems_in); + + // CUB requires a dry-run in order to determine the size of required temp memory + std::size_t temp_bytes_sort = 0; + cub::DeviceRadixSort::SortPairs(nullptr, + temp_bytes_sort, + d_keys_ying.data().get(), + d_keys_yang.data().get(), + d_values_ying.data().get(), + d_values_yang.data().get(), + num_elems_in); + + thrust::device_vector d_num_elems_out(1); + + std::size_t temp_bytes_reduce = 0; + cub::DeviceReduce::ReduceByKey(nullptr, + temp_bytes_reduce, + d_keys_yang.data().get(), + d_keys_ying.data().get(), + d_values_yang.data().get(), + d_values_ying.data().get(), + d_num_elems_out.data().get(), + cub::Sum(), + num_elems_in); + + thrust::device_vector d_temp(std::max(temp_bytes_sort, temp_bytes_reduce)); + + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + timer.start(); + cub::DeviceRadixSort::SortPairs(d_temp.data().get(), + temp_bytes_sort, + d_keys_ying.data().get(), + d_keys_yang.data().get(), + d_values_ying.data().get(), + d_values_yang.data().get(), + num_elems_in, + 0, + sizeof(Key) * 8, + launch.get_stream()); + + cub::DeviceReduce::ReduceByKey(d_temp.data().get(), + temp_bytes_reduce, + d_keys_yang.data().get(), + 
d_keys_ying.data().get(), + d_values_yang.data().get(), + d_values_ying.data().get(), + d_num_elems_out.data().get(), + cub::Sum(), + num_elems_in, + launch.get_stream()); + timer.stop(); + }); +} + +// type parameter dimensions for benchmark +using key_type_range = nvbench::type_list; +using value_type_range = nvbench::type_list; + +// benchmark setups +NVBENCH_BENCH_TYPES(nvbench_cub_reduce_by_key, NVBENCH_TYPE_AXES(key_type_range, value_type_range)) + .set_name("nvbench_cub_reduce_by_key_distribution") + .set_type_axes_names({"Key", "Value"}) + .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. + .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs + .add_int64_axis("Multiplicity", {8}) // only applies to uniform distribution + .add_string_axis("Distribution", {"GAUSSIAN", "UNIFORM", "UNIQUE", "SAME"}); + +NVBENCH_BENCH_TYPES(nvbench_cub_reduce_by_key, NVBENCH_TYPE_AXES(key_type_range, value_type_range)) + .set_name("nvbench_cub_reduce_by_key_multiplicity") + .set_type_axes_names({"Key", "Value"}) + .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. + .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs + .add_int64_axis("Multiplicity", {1, 10, 100, 1'000, 10'000, 100'000}) // key multiplicity range + .add_string_axis("Distribution", {"UNIFORM"}); \ No newline at end of file diff --git a/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu b/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu new file mode 100644 index 000000000..f67896833 --- /dev/null +++ b/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +/** + * @brief Enum representation for reduction operators + */ +enum class op_type { REDUCE_ADD, CUSTOM_OP }; + +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + // Enum type: + op_type, + // Callable to generate input strings: + // Short identifier used for tables, command-line args, etc. + // Used when context is available to figure out the enum type. + [](op_type o) { + switch (o) { + case op_type::REDUCE_ADD: return "REDUCE_ADD"; + case op_type::CUSTOM_OP: return "CUSTOM_OP"; + default: return "ERROR"; + } + }, + // Callable to generate descriptions: + // If non-empty, these are used in `--list` to describe values. + // Used when context may not be available to figure out the type from the + // input string. + // Just use `[](auto) { return std::string{}; }` if you don't want these. + [](auto) { return std::string{}; }) + +/** + * @brief Maps the enum value of a cuco reduction operator to its actual type + */ +template +struct op_type_map {}; + +template <> +struct op_type_map { + template + using type = cuco::reduce_add; +}; + +template <> +struct op_type_map { + template + using type = cuco::custom_op>; // sum reduction with CAS loop +}; + +/** + * @brief A benchmark evaluating reduce-by-key performance. 
+ */ +template < + typename Key, + typename Value, + op_type Op> +void nvbench_cuco_static_reduction_map_reduce_by_key( + nvbench::state& state, + nvbench::type_list< + Key, + Value, + nvbench::enum_type>) +{ + using map_type = + cuco::static_reduction_map::type, Key, Value>; + + auto const num_elems = state.get_int64("NumInputs"); + auto const occupancy = state.get_float64("Occupancy"); + auto const dist = state.get_string("Distribution"); + auto const multiplicity = state.get_int64_or_default("Multiplicity", 8); + + std::vector h_keys(num_elems); + std::vector h_values(num_elems); + + generate_keys(state, dist, h_keys.begin(), h_keys.end(), multiplicity); + + // generate uniform random values + generate_keys(state, "UNIFORM", h_values.begin(), h_values.end(), 1); + + // the size of the hash table under a given target occupancy depends on the + // number of unique keys in the input + std::size_t const unique = count_unique(h_keys.begin(), h_keys.end()); + std::size_t const capacity = std::ceil(SDIV(unique, occupancy)); + + // alternative occupancy calculation based on the total number of inputs + // std::size_t const capacity = num_elems / occupancy; + + thrust::device_vector d_keys(h_keys); + thrust::device_vector d_values(h_values); + + auto d_pairs_begin = + thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); + auto d_pairs_end = d_pairs_begin + num_elems; + + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + map_type map{capacity, -1}; + + timer.start(); + // TODO use CUDA stream provided by nvbench::launch + map.insert(d_pairs_begin, d_pairs_end); + map.retrieve_all(d_keys.begin(), d_values.begin()); + timer.stop(); + }); +} + +// type parameter dimensions for benchmark +using key_type_range = nvbench::type_list; +using value_type_range = nvbench::type_list; +using op_type_range = nvbench::enum_type_list; + 
+NVBENCH_BENCH_TYPES(nvbench_cuco_static_reduction_map_reduce_by_key, + NVBENCH_TYPE_AXES(key_type_range, + value_type_range, + op_type_range)) + .set_name("cuco_static_reduction_map_reduce_by_key_occupancy") + .set_type_axes_names({"Key", "Value", "ReductionOp"}) + .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. + .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs + .add_float64_axis("Occupancy", nvbench::range(0.5, 0.9, 0.1)) // occupancy range + .add_int64_axis("Multiplicity", {8}) // only applies to uniform distribution + .add_string_axis("Distribution", {"GAUSSIAN", "UNIFORM", "UNIQUE", "SAME"}); + +NVBENCH_BENCH_TYPES(nvbench_cuco_static_reduction_map_reduce_by_key, + NVBENCH_TYPE_AXES(key_type_range, + value_type_range, + op_type_range)) + .set_name("cuco_static_reduction_map_reduce_by_key_multiplicity") + .set_type_axes_names({"Key", "Value", "ReductionOp"}) + .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. + .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs + .add_float64_axis("Occupancy", {0.8}) // fixed occupancy + .add_int64_axis("Multiplicity", {1, 10, 100, 1'000, 10'000, 100'000}) // key multiplicity range + .add_string_axis("Distribution", {"UNIFORM"}); \ No newline at end of file diff --git a/benchmarks/reduce_by_key/reduce_by_key.cu b/benchmarks/reduce_by_key/reduce_by_key.cu deleted file mode 100644 index 0ca08144f..000000000 --- a/benchmarks/reduce_by_key/reduce_by_key.cu +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include - -/** - * @brief Generates input sizes and number of unique keys - * - */ -static void generate_size_and_num_unique(benchmark::internal::Benchmark* b) -{ - for (auto num_unique = 64; num_unique <= 1 << 20; num_unique <<= 1) { - for (auto size = 10'000'000; size <= 10'000'000; size *= 10) { - b->Args({size, num_unique}); - } - } -} - -template -void thrust_reduce_by_key(KeyRandomIterator keys_begin, - KeyRandomIterator keys_end, - ValueRandomIterator values_begin) -{ - using Key = typename thrust::iterator_traits::value_type; - using Value = typename thrust::iterator_traits::value_type; - - // Exact size of output is unknown (number of unique keys), but upper bounded - // by the number of keys - auto maximum_output_size = thrust::distance(keys_begin, keys_end); - thrust::device_vector output_keys(maximum_output_size); - thrust::device_vector output_values(maximum_output_size); - - thrust::sort_by_key(thrust::device, keys_begin, keys_end, values_begin); - thrust::reduce_by_key( - thrust::device, keys_begin, keys_end, values_begin, output_keys.begin(), output_values.end()); -} - -template -static void BM_thrust(::benchmark::State& state) -{ - auto const num_unique_keys = state.range(1); - for (auto _ : state) { - state.PauseTiming(); - thrust::device_vector keys(state.range(0)); - auto begin = thrust::make_counting_iterator(0); - thrust::transform( - begin, begin + state.range(0), keys.begin(), [num_unique_keys] __device__(auto i) { - return i % num_unique_keys; 
- }); - - thrust::device_vector values(state.range(0)); - state.ResumeTiming(); - thrust_reduce_by_key(keys.begin(), keys.end(), values.begin()); - cudaDeviceSynchronize(); - } -} -BENCHMARK_TEMPLATE(BM_thrust, int32_t, int32_t) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_num_unique); - -BENCHMARK_TEMPLATE(BM_thrust, int64_t, int64_t) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_num_unique); - -// TODO: Hash based reduce by key benchmark - - diff --git a/benchmarks/reduce_by_key/thrust_reduce_by_key_bench.cu b/benchmarks/reduce_by_key/thrust_reduce_by_key_bench.cu new file mode 100644 index 000000000..ad1c77058 --- /dev/null +++ b/benchmarks/reduce_by_key/thrust_reduce_by_key_bench.cu @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * @brief Reduce-by-key implementation in Thrust. 
+ */ +template +void thrust_reduce_by_key(KeyRandomIterator keys_begin, + KeyRandomIterator keys_end, + ValueRandomIterator values_begin) +{ + using Key = typename thrust::iterator_traits::value_type; + using Value = typename thrust::iterator_traits::value_type; + + // Exact size of output is unknown (number of unique keys), but upper-bounded + // by the number of keys + auto maximum_output_size = thrust::distance(keys_begin, keys_end); + thrust::device_vector output_keys(maximum_output_size); + thrust::device_vector output_values(maximum_output_size); + + thrust::sort_by_key(thrust::device, keys_begin, keys_end, values_begin); + thrust::reduce_by_key( + thrust::device, keys_begin, keys_end, values_begin, output_keys.begin(), output_values.begin()); +} + +/** + * @brief A benchmark evaluating reduce-by-key performance. + */ +template < + typename Key, + typename Value> +void nvbench_thrust_reduce_by_key( + nvbench::state& state, + nvbench::type_list< + Key, + Value>) +{ + auto const num_elems = state.get_int64("NumInputs"); + auto const dist = state.get_string("Distribution"); + auto const multiplicity = state.get_int64_or_default("Multiplicity", 8); + + std::vector h_keys(num_elems); + std::vector h_values(num_elems); + + generate_keys(state, dist, h_keys.begin(), h_keys.end(), multiplicity); + + // generate uniform random values + generate_keys(state, "UNIFORM", h_values.begin(), h_values.end(), 1); + + thrust::device_vector d_keys(h_keys); + thrust::device_vector d_values(h_values); + + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + timer.start(); + // TODO use CUDA stream provided by nvbench::launch + thrust_reduce_by_key(d_keys.begin(), d_keys.end(), d_values.begin()); + timer.stop(); + }); +} + +// type parameter dimensions for benchmark +using key_type_range = nvbench::type_list; +using value_type_range = nvbench::type_list; + +// benchmark setups 
+NVBENCH_BENCH_TYPES(nvbench_thrust_reduce_by_key, + NVBENCH_TYPE_AXES(key_type_range, + value_type_range)) + .set_name("nvbench_thrust_reduce_by_key_distribution") + .set_type_axes_names({"Key", "Value"}) + .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. + .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs + .add_int64_axis("Multiplicity", {8}) // only applies to uniform distribution + .add_string_axis("Distribution", {"GAUSSIAN", "UNIFORM", "UNIQUE", "SAME"}); + +NVBENCH_BENCH_TYPES(nvbench_thrust_reduce_by_key, + NVBENCH_TYPE_AXES(key_type_range, + value_type_range)) + .set_name("nvbench_thrust_reduce_by_key_multiplicity") + .set_type_axes_names({"Key", "Value"}) + .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. + .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs + .add_int64_axis("Multiplicity", {1, 10, 100, 1'000, 10'000, 100'000}) // key multiplicity range + .add_string_axis("Distribution", {"UNIFORM"}); \ No newline at end of file diff --git a/benchmarks/util.hpp b/benchmarks/util.hpp new file mode 100644 index 000000000..bfc115743 --- /dev/null +++ b/benchmarks/util.hpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +/** + * @brief Count the number of unique elements within a range + */ +template +std::size_t count_unique(Iter begin, Iter end) { + using value_type = typename std::iterator_traits::value_type; + + const auto size = std::distance(begin, end); + std::vector v(size); + std::copy(begin, end, v.begin()); + std::sort(v.begin(), v.end()); + + return std::distance(v.begin(), std::unique(v.begin(), v.end())); +} + +// safe division +#ifndef SDIV + #define SDIV(x,y)(((x)+(y)-1)/(y)) +#endif \ No newline at end of file From 1d97a6fed59f7bdcf7bb926e44bb128904550207 Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Sun, 1 Aug 2021 22:32:45 +0000 Subject: [PATCH 36/69] Added CUDA stream support for static_reduction_map. --- .../hash_table/static_reduction_map_bench.cu | 3 +- .../reduce_by_key/cuco_reduce_by_key_bench.cu | 66 ++++++++---------- include/cuco/detail/static_reduction_map.inl | 68 ++++++++++++------- include/cuco/static_reduction_map.cuh | 40 +++++++---- 4 files changed, 97 insertions(+), 80 deletions(-) diff --git a/benchmarks/hash_table/static_reduction_map_bench.cu b/benchmarks/hash_table/static_reduction_map_bench.cu index ea6580554..93eac5aed 100644 --- a/benchmarks/hash_table/static_reduction_map_bench.cu +++ b/benchmarks/hash_table/static_reduction_map_bench.cu @@ -107,8 +107,7 @@ void nvbench_cuco_static_reduction_map_insert( map_type map{capacity, -1}; timer.start(); - // TODO use CUDA stream provided by nvbench::launch - map.insert(d_pairs_begin, d_pairs_end); + map.insert(d_pairs_begin, d_pairs_end, launch.get_stream()); timer.stop(); }); } diff --git a/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu b/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu index f67896833..e5ee1a7a2 100644 --- a/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu +++ b/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu @@ -16,10 +16,10 @@ #include #include -#include +#include #include +#include #include -#include /** 
* @brief Enum representation for reduction operators @@ -35,7 +35,7 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( [](op_type o) { switch (o) { case op_type::REDUCE_ADD: return "REDUCE_ADD"; - case op_type::CUSTOM_OP: return "CUSTOM_OP"; + case op_type::CUSTOM_OP: return "CUSTOM_OP"; default: return "ERROR"; } }, @@ -50,7 +50,8 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( * @brief Maps the enum value of a cuco reduction operator to its actual type */ template -struct op_type_map {}; +struct op_type_map { +}; template <> struct op_type_map { @@ -61,25 +62,17 @@ struct op_type_map { template <> struct op_type_map { template - using type = cuco::custom_op>; // sum reduction with CAS loop + using type = cuco::custom_op>; // sum reduction with CAS loop }; /** * @brief A benchmark evaluating reduce-by-key performance. */ -template < - typename Key, - typename Value, - op_type Op> +template void nvbench_cuco_static_reduction_map_reduce_by_key( - nvbench::state& state, - nvbench::type_list< - Key, - Value, - nvbench::enum_type>) + nvbench::state& state, nvbench::type_list>) { - using map_type = - cuco::static_reduction_map::type, Key, Value>; + using map_type = cuco::static_reduction_map::type, Key, Value>; auto const num_elems = state.get_int64("NumInputs"); auto const occupancy = state.get_float64("Occupancy"); @@ -102,50 +95,45 @@ void nvbench_cuco_static_reduction_map_reduce_by_key( // alternative occupancy calculation based on the total number of inputs // std::size_t const capacity = num_elems / occupancy; - thrust::device_vector d_keys(h_keys); + thrust::device_vector d_keys(h_keys); thrust::device_vector d_values(h_values); auto d_pairs_begin = thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); auto d_pairs_end = d_pairs_begin + num_elems; - state.exec( - nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { - map_type map{capacity, -1}; + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + 
[&](nvbench::launch& launch, auto& timer) { + map_type map{capacity, -1}; - timer.start(); - // TODO use CUDA stream provided by nvbench::launch - map.insert(d_pairs_begin, d_pairs_end); - map.retrieve_all(d_keys.begin(), d_values.begin()); - timer.stop(); - }); + timer.start(); + map.insert(d_pairs_begin, d_pairs_end, launch.get_stream()); + map.retrieve_all(d_keys.begin(), d_values.begin(), launch.get_stream()); + timer.stop(); + }); } // type parameter dimensions for benchmark using key_type_range = nvbench::type_list; using value_type_range = nvbench::type_list; -using op_type_range = nvbench::enum_type_list; +using op_type_range = nvbench::enum_type_list; NVBENCH_BENCH_TYPES(nvbench_cuco_static_reduction_map_reduce_by_key, - NVBENCH_TYPE_AXES(key_type_range, - value_type_range, - op_type_range)) + NVBENCH_TYPE_AXES(key_type_range, value_type_range, op_type_range)) .set_name("cuco_static_reduction_map_reduce_by_key_occupancy") .set_type_axes_names({"Key", "Value", "ReductionOp"}) .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs - .add_float64_axis("Occupancy", nvbench::range(0.5, 0.9, 0.1)) // occupancy range - .add_int64_axis("Multiplicity", {8}) // only applies to uniform distribution + .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs + .add_float64_axis("Occupancy", nvbench::range(0.5, 0.9, 0.1)) // occupancy range + .add_int64_axis("Multiplicity", {8}) // only applies to uniform distribution .add_string_axis("Distribution", {"GAUSSIAN", "UNIFORM", "UNIQUE", "SAME"}); NVBENCH_BENCH_TYPES(nvbench_cuco_static_reduction_map_reduce_by_key, - NVBENCH_TYPE_AXES(key_type_range, - value_type_range, - op_type_range)) + NVBENCH_TYPE_AXES(key_type_range, value_type_range, op_type_range)) .set_name("cuco_static_reduction_map_reduce_by_key_multiplicity") .set_type_axes_names({"Key", "Value", "ReductionOp"}) .set_max_noise(3) // Custom noise: 3%. 
By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs - .add_float64_axis("Occupancy", {0.8}) // fixed occupancy - .add_int64_axis("Multiplicity", {1, 10, 100, 1'000, 10'000, 100'000}) // key multiplicity range + .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs + .add_float64_axis("Occupancy", {0.8}) // fixed occupancy + .add_int64_axis("Multiplicity", {1, 10, 100, 1'000, 10'000, 100'000}) // key multiplicity range .add_string_axis("Distribution", {"UNIFORM"}); \ No newline at end of file diff --git a/include/cuco/detail/static_reduction_map.inl b/include/cuco/detail/static_reduction_map.inl index dcb385ae0..7ec7a676e 100644 --- a/include/cuco/detail/static_reduction_map.inl +++ b/include/cuco/detail/static_reduction_map.inl @@ -39,7 +39,8 @@ static_reduction_map::static_reductio empty_key_sentinel_{empty_key_sentinel}, empty_value_sentinel_{ReductionOp::identity}, op_{reduction_op}, - slot_allocator_{alloc} + slot_allocator_{alloc}, + counter_allocator_{alloc} { slots_ = std::allocator_traits::allocate(slot_allocator_, capacity_); @@ -48,8 +49,6 @@ static_reduction_map::static_reductio auto const grid_size = (capacity_ + stride * block_size - 1) / (stride * block_size); detail::initialize<<>>( slots_, get_empty_key_sentinel(), get_empty_value_sentinel(), get_capacity()); - - CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type))); } template ::~static_reduction_map() { std::allocator_traits::deallocate(slot_allocator_, slots_, capacity_); - CUCO_ASSERT_CUDA_SUCCESS(cudaFree(num_successes_)); } template template -void static_reduction_map::insert(InputIt first, - InputIt last, - Hash hash, - KeyEqual key_equal) +void static_reduction_map::insert( + InputIt first, InputIt last, cudaStream_t stream, Hash hash, KeyEqual key_equal) { auto num_keys = std::distance(first, last); if (num_keys == 0) { return; } @@ -83,16 +79,29 @@ void static_reduction_map::insert(Inp auto const 
grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); auto view = get_device_mutable_view(); - *num_successes_ = 0; - int device_id; - CUCO_CUDA_TRY(cudaGetDevice(&device_id)); - CUCO_CUDA_TRY(cudaMemPrefetchAsync(num_successes_, sizeof(atomic_ctr_type), device_id)); + atomic_ctr_type *h_num_successes, *d_num_successes; + CUCO_CUDA_TRY(cudaMallocHost(&h_num_successes, sizeof(atomic_ctr_type))); + + auto tmp_counter_allocator = counter_allocator_; + d_num_successes = + std::allocator_traits::allocate(tmp_counter_allocator, 1); + + h_num_successes->store(static_cast(0), cuda::std::memory_order_relaxed); + CUCO_CUDA_TRY(cudaMemcpyAsync( + d_num_successes, h_num_successes, sizeof(atomic_ctr_type), cudaMemcpyHostToDevice, stream)); + + detail::insert<<>>( + first, first + num_keys, d_num_successes, view, hash, key_equal); + + CUCO_CUDA_TRY(cudaMemcpyAsync( + h_num_successes, d_num_successes, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); - detail::insert - <<>>(first, first + num_keys, num_successes_, view, hash, key_equal); - CUCO_CUDA_TRY(cudaDeviceSynchronize()); + size_ += h_num_successes->load(cuda::std::memory_order_relaxed); - size_ += num_successes_->load(cuda::std::memory_order_relaxed); + CUCO_CUDA_TRY(cudaFreeHost(h_num_successes)); + std::allocator_traits::deallocate( + tmp_counter_allocator, d_num_successes, 1); } template template -void static_reduction_map::find( - InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) +void static_reduction_map::find(InputIt first, + InputIt last, + OutputIt output_begin, + cudaStream_t stream, + Hash hash, + KeyEqual key_equal) { auto num_keys = std::distance(first, last); if (num_keys == 0) { return; } @@ -114,8 +127,8 @@ void static_reduction_map::find( auto view = get_device_view(); detail::find - <<>>(first, last, output_begin, view, hash, key_equal); - CUCO_CUDA_TRY(cudaDeviceSynchronize()); + 
<<>>(first, last, output_begin, view, hash, key_equal); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); } namespace detail { @@ -146,7 +159,7 @@ template template void static_reduction_map::retrieve_all( - KeyOut keys_out, ValueOut values_out) + KeyOut keys_out, ValueOut values_out, cudaStream_t stream) { // Convert pair_type to thrust::tuple to allow assigning to a zip iterator auto begin = @@ -155,7 +168,7 @@ void static_reduction_map::retrieve_a auto filled = detail::slot_is_filled{get_empty_key_sentinel()}; auto zipped_out = thrust::make_zip_iterator(thrust::make_tuple(keys_out, values_out)); - thrust::copy_if(thrust::device, begin, end, zipped_out, filled); + thrust::copy_if(thrust::cuda::par.on(stream), begin, end, zipped_out, filled); } template template void static_reduction_map::contains( - InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) + InputIt first, + InputIt last, + OutputIt output_begin, + cudaStream_t stream, + Hash hash, + KeyEqual key_equal) { auto num_keys = std::distance(first, last); if (num_keys == 0) { return; } @@ -177,8 +195,8 @@ void static_reduction_map::contains( auto view = get_device_view(); detail::contains - <<>>(first, last, output_begin, view, hash, key_equal); - CUCO_CUDA_TRY(cudaDeviceSynchronize()); + <<>>(first, last, output_begin, view, hash, key_equal); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); } template ::rebind_alloc; + using counter_allocator_type = + typename std::allocator_traits::rebind_alloc; #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700) static_assert(atomic_key_type::is_always_lock_free, @@ -282,13 +284,18 @@ class static_reduction_map { * @tparam KeyEqual Binary callable type * @param first Beginning of the sequence of key/value pairs * @param last End of the sequence of key/value pairs + * @param stream CUDA stream used for insert * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality */ 
template , typename KeyEqual = thrust::equal_to> - void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}); + void insert(InputIt first, + InputIt last, + cudaStream_t stream = 0, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}); /** * @brief Finds the values corresponding to all keys in the range `[first, last)`. @@ -305,6 +312,7 @@ class static_reduction_map { * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of values retrieved for each key + * @param stream CUDA stream used for this operation * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality */ @@ -315,8 +323,9 @@ class static_reduction_map { void find(InputIt first, InputIt last, OutputIt output_begin, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}); + cudaStream_t stream = 0, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}); /** * @brief Retrieves all of the keys and their associated values. @@ -333,9 +342,10 @@ class static_reduction_map { * convertible from `mapped_type`. * @param keys_out Beginning output iterator for keys * @param values_out Beginning output iterator for values + * @param stream CUDA stream used for this operation */ template - void retrieve_all(KeyOut keys_out, ValueOut values_out); + void retrieve_all(KeyOut keys_out, ValueOut values_out, cudaStream_t stream = 0); /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. 
@@ -351,6 +361,7 @@ class static_reduction_map { * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream CUDA stream used * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality */ @@ -361,8 +372,9 @@ class static_reduction_map { void contains(InputIt first, InputIt last, OutputIt output_begin, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}); + cudaStream_t stream = 0, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}); private: class device_view_base { @@ -1067,14 +1079,14 @@ class static_reduction_map { value_type const* raw_slots_end() const noexcept { return raw_slots_begin() + get_capacity(); } - pair_atomic_type* slots_{nullptr}; ///< Pointer to flat slots storage - std::size_t capacity_{}; ///< Total number of slots - std::size_t size_{}; ///< Number of keys in map - Key empty_key_sentinel_{}; ///< Key value that represents an empty slot - Value empty_value_sentinel_{}; ///< Initial value of empty slot - atomic_ctr_type* num_successes_{}; ///< Number of successfully inserted keys on insert - ReductionOp op_{}; ///< Binary operation reduction function object - slot_allocator_type slot_allocator_{}; ///< Allocator used to allocate slots + pair_atomic_type* slots_{nullptr}; ///< Pointer to flat slots storage + std::size_t capacity_{}; ///< Total number of slots + std::size_t size_{}; ///< Number of keys in map + Key empty_key_sentinel_{}; ///< Key value that represents an empty slot + Value empty_value_sentinel_{}; ///< Initial value of empty slot + ReductionOp op_{}; ///< Binary operation reduction function object + slot_allocator_type slot_allocator_{}; ///< Allocator used to allocate slots + counter_allocator_type counter_allocator_{}; ///< Allocator used to allocate counters }; } // namespace cuco From 
b4351fc24affe048becc40bab34e4d1f6feb0b65 Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Mon, 2 Aug 2021 02:12:33 +0000 Subject: [PATCH 37/69] Fix custom reduction op implementation and add exponential backoff strategy. --- include/cuco/static_reduction_map.cuh | 51 +++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index cde8cc6ec..06b9ce454 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -38,6 +38,7 @@ #include #endif +#include #include #include #include @@ -46,6 +47,12 @@ namespace cuco { +/** + * @brief `+` reduction functor that internally uses an atomic fetch-and-add + * operation. + * + * @tparam T The data type used for reduction + */ template struct reduce_add { using value_type = T; @@ -58,6 +65,12 @@ struct reduce_add { } }; +/** + * @brief `-` reduction functor that internally uses an atomic fetch-and-sub + * operation. + * + * @tparam T The data type used for reduction + */ template struct reduce_sub { using value_type = T; @@ -70,6 +83,12 @@ struct reduce_sub { } }; +/** + * @brief `min` reduction functor that internally uses an atomic fetch-and-min + * operation. + * + * @tparam T The data type used for reduction + */ template struct reduce_min { using value_type = T; @@ -82,6 +101,12 @@ struct reduce_min { } }; +/** + * @brief `max` reduction functor that internally uses an atomic fetch-and-max + * operation. + * + * @tparam T The data type used for reduction + */ template struct reduce_max { using value_type = T; @@ -94,7 +119,19 @@ struct reduce_max { } }; -template +/** + * @brief Wrapper for a user-defined custom reduction operator. + * Internally uses an atomic compare-and-swap loop. 
+ * + * @tparam T The data type used for reduction + * @tparam Identity Neutral element under the given reduction group + * @tparam Op Commutative and associative binary operator + */ +template struct custom_op { using value_type = T; static constexpr T identity = Identity; @@ -104,8 +141,18 @@ struct custom_op { template __device__ T apply(cuda::atomic& slot, T2 const& value) const { + [[maybe_unused]] unsigned ns = BackoffBaseDelay; + auto old = slot.load(cuda::memory_order_relaxed); - while (not slot.compare_exchange_strong(old, op(old, value), cuda::memory_order_relaxed)) {} + while (not slot.compare_exchange_strong(old, op(old, value), cuda::memory_order_relaxed)) { +#if __CUDA_ARCH__ >= 700 + // exponential backoff strategy to reduce atomic contention + if (true) { + asm volatile("nanosleep.u32 %0;" ::"r"((unsigned)ns) :); + if (ns < BackoffMaxDelay) { ns *= 2; } + } +#endif + } return old; } }; From 80ef0eef34805a28b8253310f0a7f43daaa20188 Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Mon, 2 Aug 2021 02:14:12 +0000 Subject: [PATCH 38/69] Parameter grid search for CAS loop backoff added. 
--- benchmarks/CMakeLists.txt | 4 + .../hash_table/static_reduction_map_bench.cu | 72 ++++++++++++++ .../static_reduction_map_param_grid_search.cu | 97 +++++++++++++++++++ 3 files changed, 173 insertions(+) create mode 100644 benchmarks/hash_table/static_reduction_map_param_grid_search.cu diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 24932576c..5f16ca5bf 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -87,3 +87,7 @@ ConfigureNVBench(THRUST_RBK_BENCH "${THRUST_RBK_BENCH_SRC}") ################################################################################################### set(CUB_RBK_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reduce_by_key/cub_reduce_by_key_bench.cu") ConfigureNVBench(CUB_RBK_BENCH "${CUB_RBK_BENCH_SRC}") + +################################################################################################### +set(STATIC_REDUCTION_MAP_PARAM_GRID_SEARCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_reduction_map_param_grid_search.cu") +ConfigureNVBench(STATIC_REDUCTION_MAP_PARAM_GRID_SEARCH "${STATIC_REDUCTION_MAP_PARAM_GRID_SEARCH_SRC}") \ No newline at end of file diff --git a/benchmarks/hash_table/static_reduction_map_bench.cu b/benchmarks/hash_table/static_reduction_map_bench.cu index 93eac5aed..0d651139c 100644 --- a/benchmarks/hash_table/static_reduction_map_bench.cu +++ b/benchmarks/hash_table/static_reduction_map_bench.cu @@ -112,10 +112,68 @@ void nvbench_cuco_static_reduction_map_insert( }); } +/** + * @brief A benchmark evaluating insert performance. 
+ */ +template +void nvbench_cuco_static_reduction_map_custom_op_insert( + nvbench::state& state, + nvbench::type_list, + nvbench::enum_type>) +{ + using custom_op_type = + cuco::custom_op, BackoffBaseDelay, BackoffMaxDelay>; + using map_type = cuco::static_reduction_map; + + auto const num_elems = state.get_int64("NumInputs"); + auto const occupancy = state.get_float64("Occupancy"); + auto const dist = state.get_string("Distribution"); + auto const multiplicity = state.get_int64_or_default("Multiplicity", 8); + + std::vector h_keys(num_elems); + std::vector h_values(num_elems); + + generate_keys(state, dist, h_keys.begin(), h_keys.end(), multiplicity); + + // generate uniform random values + generate_keys(state, "UNIFORM", h_values.begin(), h_values.end(), 1); + + // the size of the hash table under a given target occupancy depends on the + // number of unique keys in the input + std::size_t const unique = count_unique(h_keys.begin(), h_keys.end()); + std::size_t const capacity = std::ceil(SDIV(unique, occupancy)); + + // alternative occupancy calculation based on the total number of inputs + // std::size_t const capacity = num_elems / occupancy; + + thrust::device_vector d_keys(h_keys); + thrust::device_vector d_values(h_values); + + auto d_pairs_begin = + thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); + auto d_pairs_end = d_pairs_begin + num_elems; + + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + map_type map{capacity, -1}; + + timer.start(); + map.insert(d_pairs_begin, d_pairs_end, launch.get_stream()); + timer.stop(); + }); +} + // type parameter dimensions for benchmark using key_type_range = nvbench::type_list; using value_type_range = nvbench::type_list; using op_type_range = nvbench::enum_type_list; +using base_delay_range = nvbench::enum_type_list<0, 8, 16, 32, 64, 128, 256>; +using max_delay_range = nvbench::enum_type_list<2048, 4096, 8192>; // benchmark 
setups NVBENCH_BENCH_TYPES(nvbench_cuco_static_reduction_map_insert, @@ -136,4 +194,18 @@ NVBENCH_BENCH_TYPES(nvbench_cuco_static_reduction_map_insert, .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs .add_float64_axis("Occupancy", {0.8}) // fixed occupancy .add_int64_axis("Multiplicity", {1, 10, 100, 1'000, 10'000, 100'000}) // key multiplicity range + .add_string_axis("Distribution", {"UNIFORM"}); + +NVBENCH_BENCH_TYPES(nvbench_cuco_static_reduction_map_custom_op_insert, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + base_delay_range, + max_delay_range)) + .set_name("cuco_static_reduction_map_custom_op_insert_contention") + .set_type_axes_names({"Key", "Value", "BackoffBaseDelay", "BackoffMaxDelay"}) + .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. + .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs + .add_float64_axis("Occupancy", {0.8}) // fixed occupancy + .add_int64_axis("Multiplicity", + {1, 10, 100, 1'000, 10'000, 100'000, 200'000}) // key multiplicity range .add_string_axis("Distribution", {"UNIFORM"}); \ No newline at end of file diff --git a/benchmarks/hash_table/static_reduction_map_param_grid_search.cu b/benchmarks/hash_table/static_reduction_map_param_grid_search.cu new file mode 100644 index 000000000..41baaa872 --- /dev/null +++ b/benchmarks/hash_table/static_reduction_map_param_grid_search.cu @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +/** + * @brief Grid search evaluating backoff delay params for cuco::custom_op + */ +template +void nvbench_cuco_static_reduction_map_custom_op_backoff_delay( + nvbench::state& state, + nvbench::type_list, + nvbench::enum_type>) +{ + using custom_op_type = + cuco::custom_op, BackoffBaseDelay, BackoffMaxDelay>; + using map_type = cuco::static_reduction_map; + + auto const num_elems = state.get_int64("NumInputs"); + auto const occupancy = state.get_float64("Occupancy"); + auto const dist = state.get_string("Distribution"); + auto const multiplicity = state.get_int64_or_default("Multiplicity", 8); + + std::vector h_keys(num_elems); + std::vector h_values(num_elems); + + generate_keys(state, dist, h_keys.begin(), h_keys.end(), multiplicity); + + // generate uniform random values + generate_keys(state, "UNIFORM", h_values.begin(), h_values.end(), 1); + + // the size of the hash table under a given target occupancy depends on the + // number of unique keys in the input + std::size_t const unique = count_unique(h_keys.begin(), h_keys.end()); + std::size_t const capacity = std::ceil(SDIV(unique, occupancy)); + + // alternative occupancy calculation based on the total number of inputs + // std::size_t const capacity = num_elems / occupancy; + + thrust::device_vector d_keys(h_keys); + thrust::device_vector d_values(h_values); + + auto d_pairs_begin = + thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); + auto d_pairs_end = d_pairs_begin + num_elems; + + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + map_type map{capacity, -1}; + + timer.start(); + map.insert(d_pairs_begin, d_pairs_end, launch.get_stream()); + timer.stop(); + }); +} + +// type parameter dimensions for benchmark +using key_type_range = 
nvbench::type_list; +using value_type_range = nvbench::type_list; +using base_delay_range = nvbench::enum_type_list<4, 8, 16, 32, 64, 128, 256, 512>; +using max_delay_range = nvbench::enum_type_list<2'048, 4'096, 8'192, 16'384>; + +// benchmark setups +NVBENCH_BENCH_TYPES( + nvbench_cuco_static_reduction_map_custom_op_backoff_delay, + NVBENCH_TYPE_AXES(key_type_range, value_type_range, base_delay_range, max_delay_range)) + .set_name("cuco_static_reduction_map_custom_op_backoff_delay") + .set_type_axes_names({"Key", "Value", "BackoffBaseDelay", "BackoffMaxDelay"}) + .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. + .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs + .add_float64_axis("Occupancy", {0.8}) // fixed occupancy + .add_int64_axis("Multiplicity", + {1, 10, 100, 1'000, 10'000, 100'000, 1'000'000}) // key multiplicity range + .add_string_axis("Distribution", {"UNIFORM"}); \ No newline at end of file From 54e2022c91615c807583c5a38db881c574299e59 Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Wed, 4 Aug 2021 21:09:19 +0000 Subject: [PATCH 39/69] Reduce-by-key performance analysis. 
--- benchmarks/analysis/notebooks/rbk_bench.ipynb | 716 ++++++++++++++++++ .../reduce_by_key/cub_reduce_by_key_bench.cu | 3 +- .../reduce_by_key/cuco_reduce_by_key_bench.cu | 7 +- .../thrust_reduce_by_key_bench.cu | 50 +- 4 files changed, 743 insertions(+), 33 deletions(-) create mode 100644 benchmarks/analysis/notebooks/rbk_bench.ipynb diff --git a/benchmarks/analysis/notebooks/rbk_bench.ipynb b/benchmarks/analysis/notebooks/rbk_bench.ipynb new file mode 100644 index 000000000..82dd5c5e9 --- /dev/null +++ b/benchmarks/analysis/notebooks/rbk_bench.ipynb @@ -0,0 +1,716 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Preparation" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 1, + "source": [ + "!pip3 install pandas\n", + "!pip3 install matplotlib\n", + "\n", + "# Import libraries\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib\n", + "from collections import namedtuple\n", + "\n", + "#plt.style.use('seaborn-white')" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: pandas in /home/djuenger/miniconda3/lib/python3.9/site-packages (1.3.1)\n", + "Requirement already satisfied: numpy>=1.17.3 in /home/djuenger/miniconda3/lib/python3.9/site-packages (from pandas) (1.21.1)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /home/djuenger/miniconda3/lib/python3.9/site-packages (from pandas) (2.8.2)\n", + "Requirement already satisfied: pytz>=2017.3 in /home/djuenger/miniconda3/lib/python3.9/site-packages (from pandas) (2021.1)\n", + "Requirement already satisfied: six>=1.5 in /home/djuenger/miniconda3/lib/python3.9/site-packages (from python-dateutil>=2.7.3->pandas) (1.16.0)\n", + "Requirement already satisfied: matplotlib in /home/djuenger/miniconda3/lib/python3.9/site-packages (3.4.2)\n", + "Requirement already satisfied: pillow>=6.2.0 in /home/djuenger/miniconda3/lib/python3.9/site-packages (from 
matplotlib) (8.3.1)\n", + "Requirement already satisfied: cycler>=0.10 in /home/djuenger/miniconda3/lib/python3.9/site-packages (from matplotlib) (0.10.0)\n", + "Requirement already satisfied: pyparsing>=2.2.1 in /home/djuenger/miniconda3/lib/python3.9/site-packages (from matplotlib) (2.4.7)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /home/djuenger/miniconda3/lib/python3.9/site-packages (from matplotlib) (1.3.1)\n", + "Requirement already satisfied: numpy>=1.16 in /home/djuenger/miniconda3/lib/python3.9/site-packages (from matplotlib) (1.21.1)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /home/djuenger/miniconda3/lib/python3.9/site-packages (from matplotlib) (2.8.2)\n", + "Requirement already satisfied: six in /home/djuenger/miniconda3/lib/python3.9/site-packages (from cycler>=0.10->matplotlib) (1.16.0)\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "# helper functions\n", + "\n", + "style_ = namedtuple(\"style_\", [\"color\", \"marker\", \"linestyle\"])\n", + "\n", + "styles = {\n", + " \"THRUST\" : style_('r', 'v', '-'),\n", + " \"CUB\" : style_('b', 'o', '-'),\n", + " \"CUCO\" : style_('g', 'x', '-'),\n", + " \"CUCO α=50%\" : style_('g', 'x', '-'),\n", + " \"CUCO α=80%\" : style_('g', 'x', '--')}\n", + "\n", + "def load_csv_files(csv_files):\n", + " dfs = {}\n", + " for key, fname in csv_files.items():\n", + " df = pd.read_csv(fname)\n", + " dfs[key] = df[df[\"Skipped\"] == \"No\"]\n", + " return dfs\n", + "\n", + "def filter_bench(dfs, query):\n", + " if isinstance(dfs, dict):\n", + " filtered_dfs = {}\n", + " for key in dfs.keys():\n", + " filtered_dfs[key] = dfs[key].query(query)\n", + " return filtered_dfs\n", + " else:\n", + " return dfs.query(query)\n", + "\n", + "def plot_bench(dfs, xlabel, show_legend=True, title=None, ofname=None, show_xlabel=True, show_ylabel=True, log_xscale=False, log_yscale=False, styles=styles, font_size=14):\n", + " fig, ax = plt.subplots(1, 
1)\n", + "\n", + " ax.tick_params(labelsize=font_size)\n", + " if(show_ylabel):\n", + " ax.set_xlabel(xlabel, fontsize=font_size)\n", + " if(show_ylabel):\n", + " ax.set_ylabel(\"Operations per second\", fontsize=font_size)\n", + " if(log_xscale):\n", + " ax.set_xscale('log')\n", + " if(log_yscale):\n", + " ax.set_yscale('log')\n", + " ax.set_title(title, fontsize=font_size)\n", + " ax.grid()\n", + "\n", + " for key, df in dfs.items(): \n", + " style = styles[key]\n", + "\n", + " Y = df[\"NumInputs\"].unique()[0]/df[\"GPU Time (sec)\"]\n", + "\n", + " if xlabel in df.columns:\n", + " X = df[xlabel]\n", + " \n", + " ax.plot(X, Y, label=key, color=style.color, marker=style.marker, linestyle=style.linestyle)\n", + " ax.scatter(X, Y, color=style.color, marker=style.marker, linestyle=style.linestyle)\n", + " else:\n", + " ax.axhline(y=Y.iloc[0], label=key, color=style.color, linestyle=style.linestyle)\n", + "\n", + " if(show_legend):\n", + " plt.legend(fontsize=font_size - 4)\n", + "\n", + " if(ofname):\n", + " plt.savefig(ofname, dpi=1200, format='pdf', bbox_inches='tight')\n", + "\n", + " plt.show()" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 3, + "source": [ + "v100_dfs = load_csv_files({\n", + " \"CUCO\" : \"../results/cuco_rbk_v100.csv\",\n", + " \"CUB\" : \"../results/cub_rbk_v100.csv\",\n", + " \"THRUST\" : \"../results/thrust_rbk_v100.csv\"})\n", + "\n", + "a100_dfs = load_csv_files({\n", + " \"CUCO\" : \"../results/cuco_rbk_a100.csv\",\n", + " \"CUB\" : \"../results/cub_rbk_a100.csv\",\n", + " \"THRUST\" : \"../results/thrust_rbk_a100.csv\"})" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "# for CUCO, show distinct traces for load factors of 50% and 80%, respectively\n", + "query = 'Distribution == \"UNIFORM\" and\\\n", + " Benchmark.str.contains(\"multiplicity\")'\n", + "\n", + "v100_dfs_mult = filter_bench(v100_dfs, query)\n", + 
"v100_dfs_mult['CUCO α=50%'] = filter_bench(v100_dfs_mult['CUCO'], 'Occupancy == 0.5')\n", + "v100_dfs_mult['CUCO α=80%'] = filter_bench(v100_dfs_mult['CUCO'], 'Occupancy == 0.8')\n", + "del v100_dfs_mult['CUCO']\n", + "\n", + "a100_dfs_mult = filter_bench(a100_dfs, query)\n", + "a100_dfs_mult['CUCO α=50%'] = filter_bench(a100_dfs_mult['CUCO'], 'Occupancy == 0.5')\n", + "a100_dfs_mult['CUCO α=80%'] = filter_bench(a100_dfs_mult['CUCO'], 'Occupancy == 0.8')\n", + "del a100_dfs_mult['CUCO']\n", + "\n", + "\n", + "#### RBK\n", + "### V100\n", + "## Multiplicity\n", + "# I32/I32\n", + "print(\"V100 I32/I32 UNIFORM\")\n", + "query = 'Key == \"I32\" and\\\n", + " Value == \"I32\"'\n", + "plot_bench(filter_bench(v100_dfs_mult, query), \"Multiplicity\", log_xscale=True)\n", + "\n", + "# I64/I64\n", + "print(\"V100 I64/I64 UNIFORM\")\n", + "query = 'Key == \"I64\" and\\\n", + " Value == \"I64\"'\n", + "plot_bench(filter_bench(v100_dfs_mult, query), \"Multiplicity\", log_xscale=True, show_legend=False)\n", + "\n", + "###- A100\n", + "# I32/I32\n", + "print(\"A100 I32/I32 UNIFORM\")\n", + "query = 'Key == \"I32\" and\\\n", + " Value == \"I32\"'\n", + "plot_bench(filter_bench(a100_dfs_mult, query), \"Multiplicity\", log_xscale=True, show_legend=False)\n", + "\n", + "# I64/I64\n", + "print(\"A100 I64/I64 UNIFORM\")\n", + "query = 'Key == \"I64\" and\\\n", + " Value == \"I64\"'\n", + "plot_bench(filter_bench(a100_dfs_mult, query), \"Multiplicity\", log_xscale=True, show_legend=False)" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "V100 I32/I32 UNIFORM\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": { + "needs_background": "light" + } + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "V100 I64/I64 UNIFORM\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": { + "needs_background": "light" + } + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "A100 I32/I32 UNIFORM\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": { + "needs_background": "light" + } + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "A100 I64/I64 UNIFORM\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": { + "needs_background": "light" + } + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 5, + "source": [ + "#### RBK\n", + "### V100\n", + "## Occupancy\n", + "# I32/I32\n", + "print(\"V100 I32/I32 UNIFORM\")\n", + "query = 'Distribution == \"UNIFORM\" and\\\n", + " Key == \"I32\" and\\\n", + " Value == \"I32\" and\\\n", + " Benchmark.str.contains(\"occupancy|distribution\")'\n", + "plot_bench(filter_bench(v100_dfs, query), \"Occupancy\")\n", + "\n", + "print(\"V100 I32/I32 GAUSSIAN\")\n", + "query = 'Distribution == \"GAUSSIAN\" and\\\n", + " Key == \"I32\" and\\\n", + " Value == \"I32\"'\n", + "plot_bench(filter_bench(v100_dfs, query), \"Occupancy\")\n", + "\n", + "print(\"V100 I32/I32 UNIQUE\")\n", + "query = 'Distribution == \"UNIQUE\" and\\\n", + " Key == \"I32\" and\\\n", + " Value == \"I32\"'\n", + "plot_bench(filter_bench(v100_dfs, query), \"Occupancy\")\n", + "\n", + "print(\"V100 I32/I32 SAME\")\n", + "query = 'Distribution == \"SAME\" and\\\n", + " Key == \"I32\" and\\\n", + " Value == \"I32\"'\n", + "plot_bench(filter_bench(v100_dfs, query), \"Occupancy\")\n", + "\n", + "# I64/I64\n", + "print(\"V100 I64/I64 UNIFORM\")\n", + "query = 'Distribution == \"UNIFORM\" and\\\n", + " Key == \"I64\" and\\\n", + " Value == \"I64\" and\\\n", + " Benchmark.str.contains(\"occupancy|distribution\")'\n", + "plot_bench(filter_bench(v100_dfs, query), \"Occupancy\")\n", + "\n", + "print(\"V100 I64/I64 GAUSSIAN\")\n", + "query = 'Distribution == \"GAUSSIAN\" and\\\n", + " Key == \"I64\" and\\\n", + " Value == \"I64\"'\n", + "plot_bench(filter_bench(v100_dfs, query), \"Occupancy\")\n", + "\n", + "print(\"V100 I64/I64 UNIQUE\")\n", + "query = 'Distribution == \"UNIQUE\" and\\\n", + " Key == \"I64\" and\\\n", + " Value == \"I64\"'\n", + "plot_bench(filter_bench(v100_dfs, query), \"Occupancy\")\n", + "\n", + "print(\"V100 I64/I64 SAME\")\n", + "query = 'Distribution == \"SAME\" and\\\n", + " Key == \"I64\" 
and\\\n", + " Value == \"I64\"'\n", + "plot_bench(filter_bench(v100_dfs, query), \"Occupancy\")" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "V100 I32/I32 UNIFORM\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": { + "needs_background": "light" + } + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "V100 I32/I32 GAUSSIAN\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": { + "needs_background": "light" + } + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "V100 I32/I32 UNIQUE\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": { + "needs_background": "light" + } + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "V100 I32/I32 SAME\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": { + "needs_background": "light" + } + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "V100 I64/I64 UNIFORM\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": { + "needs_background": "light" + } + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "V100 I64/I64 GAUSSIAN\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": { + "needs_background": "light" + } + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "V100 I64/I64 UNIQUE\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": { + "needs_background": "light" + } + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "V100 I64/I64 SAME\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": { + "needs_background": "light" + } + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 6, + "source": [ + "#### RBK\n", + "### A100\n", + "## Occupancy\n", + "# I32/I32\n", + "print(\"A100 I32/I32 UNIFORM\")\n", + "query = 'Distribution == \"UNIFORM\" and\\\n", + " Key == \"I32\" and\\\n", + " Value == \"I32\" and\\\n", + " Benchmark.str.contains(\"occupancy|distribution\")'\n", + "plot_bench(filter_bench(a100_dfs, query), \"Occupancy\")\n", + "\n", + "print(\"A100 I32/I32 GAUSSIAN\")\n", + "query = 'Distribution == \"GAUSSIAN\" and\\\n", + " Key == \"I32\" and\\\n", + " Value == \"I32\"'\n", + "plot_bench(filter_bench(a100_dfs, query), \"Occupancy\")\n", + "\n", + "print(\"A100 I32/I32 UNIQUE\")\n", + "query = 'Distribution == \"UNIQUE\" and\\\n", + " Key == \"I32\" and\\\n", + " Value == \"I32\"'\n", + "plot_bench(filter_bench(a100_dfs, query), \"Occupancy\")\n", + "\n", + "print(\"A100 I32/I32 SAME\")\n", + "query = 'Distribution == \"SAME\" and\\\n", + " Key == \"I32\" and\\\n", + " Value == \"I32\"'\n", + "plot_bench(filter_bench(a100_dfs, query), \"Occupancy\")\n", + "\n", + "# I64/I64\n", + "print(\"A100 I64/I64 UNIFORM\")\n", + "query = 'Distribution == \"UNIFORM\" and\\\n", + " Key == \"I64\" and\\\n", + " Value == \"I64\" and\\\n", + " Benchmark.str.contains(\"occupancy|distribution\")'\n", + "plot_bench(filter_bench(a100_dfs, query), \"Occupancy\")\n", + "\n", + "print(\"A100 I64/I64 GAUSSIAN\")\n", + "query = 'Distribution == \"GAUSSIAN\" and\\\n", + " Key == \"I64\" and\\\n", + " Value == \"I64\"'\n", + "plot_bench(filter_bench(a100_dfs, query), \"Occupancy\")\n", + "\n", + "print(\"A100 I64/I64 UNIQUE\")\n", + "query = 'Distribution == \"UNIQUE\" and\\\n", + " Key == \"I64\" and\\\n", + " Value == \"I64\"'\n", + "plot_bench(filter_bench(a100_dfs, query), \"Occupancy\")\n", + "\n", + "print(\"A100 I64/I64 SAME\")\n", + "query = 'Distribution == \"SAME\" and\\\n", + " Key == \"I64\" 
and\\\n", + " Value == \"I64\"'\n", + "plot_bench(filter_bench(a100_dfs, query), \"Occupancy\")" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "A100 I32/I32 UNIFORM\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": { + "needs_background": "light" + } + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "A100 I32/I32 GAUSSIAN\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": { + "needs_background": "light" + } + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "A100 I32/I32 UNIQUE\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": { + "needs_background": "light" + } + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "A100 I32/I32 SAME\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": { + "needs_background": "light" + } + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "A100 I64/I64 UNIFORM\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": { + "needs_background": "light" + } + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "A100 I64/I64 GAUSSIAN\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": { + "needs_background": "light" + } + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "A100 I64/I64 UNIQUE\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": { + "needs_background": "light" + } + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "A100 I64/I64 SAME\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "" + }, + "metadata": { + "needs_background": "light" + } + } + ], + "metadata": {} + } + ], + "metadata": { + "interpreter": { + "hash": "fab55a90acef312968e5bff70ae91c3267a5b896b51d076af77c4418fdb5d582" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.9.5 64-bit ('base': conda)" + }, + "language_info": { + "name": "python", + "version": "3.9.5", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/benchmarks/reduce_by_key/cub_reduce_by_key_bench.cu b/benchmarks/reduce_by_key/cub_reduce_by_key_bench.cu index 0f481c158..20553238b 100644 --- a/benchmarks/reduce_by_key/cub_reduce_by_key_bench.cu +++ b/benchmarks/reduce_by_key/cub_reduce_by_key_bench.cu @@ -115,5 +115,6 @@ NVBENCH_BENCH_TYPES(nvbench_cub_reduce_by_key, NVBENCH_TYPE_AXES(key_type_range, .set_type_axes_names({"Key", "Value"}) .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. 
.add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs - .add_int64_axis("Multiplicity", {1, 10, 100, 1'000, 10'000, 100'000}) // key multiplicity range + .add_int64_axis("Multiplicity", + {1, 10, 100, 1'000, 10'000, 100'000, 1'000'000}) // key multiplicity range .add_string_axis("Distribution", {"UNIFORM"}); \ No newline at end of file diff --git a/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu b/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu index e5ee1a7a2..ecbdd06e4 100644 --- a/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu +++ b/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu @@ -116,7 +116,7 @@ void nvbench_cuco_static_reduction_map_reduce_by_key( // type parameter dimensions for benchmark using key_type_range = nvbench::type_list; using value_type_range = nvbench::type_list; -using op_type_range = nvbench::enum_type_list; +using op_type_range = nvbench::enum_type_list; NVBENCH_BENCH_TYPES(nvbench_cuco_static_reduction_map_reduce_by_key, NVBENCH_TYPE_AXES(key_type_range, value_type_range, op_type_range)) @@ -134,6 +134,7 @@ NVBENCH_BENCH_TYPES(nvbench_cuco_static_reduction_map_reduce_by_key, .set_type_axes_names({"Key", "Value", "ReductionOp"}) .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. 
.add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs - .add_float64_axis("Occupancy", {0.8}) // fixed occupancy - .add_int64_axis("Multiplicity", {1, 10, 100, 1'000, 10'000, 100'000}) // key multiplicity range + .add_float64_axis("Occupancy", {0.5, 0.8}) // fixed occupancy + .add_int64_axis("Multiplicity", + {1, 10, 100, 1'000, 10'000, 100'000, 1'000'000}) // key multiplicity range .add_string_axis("Distribution", {"UNIFORM"}); \ No newline at end of file diff --git a/benchmarks/reduce_by_key/thrust_reduce_by_key_bench.cu b/benchmarks/reduce_by_key/thrust_reduce_by_key_bench.cu index ad1c77058..acd8d9f8d 100644 --- a/benchmarks/reduce_by_key/thrust_reduce_by_key_bench.cu +++ b/benchmarks/reduce_by_key/thrust_reduce_by_key_bench.cu @@ -15,20 +15,20 @@ */ #include +#include +#include #include #include #include -#include -#include -#include #include +#include /** * @brief Reduce-by-key implementation in Thrust. */ template -void thrust_reduce_by_key(KeyRandomIterator keys_begin, - KeyRandomIterator keys_end, +void thrust_reduce_by_key(KeyRandomIterator keys_begin, + KeyRandomIterator keys_end, ValueRandomIterator values_begin) { using Key = typename thrust::iterator_traits::value_type; @@ -48,14 +48,8 @@ void thrust_reduce_by_key(KeyRandomIterator keys_begin, /** * @brief A benchmark evaluating reduce-by-key performance. 
*/ -template < - typename Key, - typename Value> -void nvbench_thrust_reduce_by_key( - nvbench::state& state, - nvbench::type_list< - Key, - Value>) +template +void nvbench_thrust_reduce_by_key(nvbench::state& state, nvbench::type_list) { auto const num_elems = state.get_int64("NumInputs"); auto const dist = state.get_string("Distribution"); @@ -69,16 +63,15 @@ void nvbench_thrust_reduce_by_key( // generate uniform random values generate_keys(state, "UNIFORM", h_values.begin(), h_values.end(), 1); - thrust::device_vector d_keys(h_keys); + thrust::device_vector d_keys(h_keys); thrust::device_vector d_values(h_values); - state.exec( - nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { - timer.start(); - // TODO use CUDA stream provided by nvbench::launch - thrust_reduce_by_key(d_keys.begin(), d_keys.end(), d_values.begin()); - timer.stop(); - }); + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + timer.start(); + thrust_reduce_by_key(d_keys.begin(), d_keys.end(), d_values.begin()); + timer.stop(); + }); } // type parameter dimensions for benchmark @@ -87,21 +80,20 @@ using value_type_range = nvbench::type_list; // benchmark setups NVBENCH_BENCH_TYPES(nvbench_thrust_reduce_by_key, - NVBENCH_TYPE_AXES(key_type_range, - value_type_range)) + NVBENCH_TYPE_AXES(key_type_range, value_type_range)) .set_name("nvbench_thrust_reduce_by_key_distribution") .set_type_axes_names({"Key", "Value"}) .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. 
- .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs - .add_int64_axis("Multiplicity", {8}) // only applies to uniform distribution + .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs + .add_int64_axis("Multiplicity", {8}) // only applies to uniform distribution .add_string_axis("Distribution", {"GAUSSIAN", "UNIFORM", "UNIQUE", "SAME"}); NVBENCH_BENCH_TYPES(nvbench_thrust_reduce_by_key, - NVBENCH_TYPE_AXES(key_type_range, - value_type_range)) + NVBENCH_TYPE_AXES(key_type_range, value_type_range)) .set_name("nvbench_thrust_reduce_by_key_multiplicity") .set_type_axes_names({"Key", "Value"}) .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs - .add_int64_axis("Multiplicity", {1, 10, 100, 1'000, 10'000, 100'000}) // key multiplicity range + .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs + .add_int64_axis("Multiplicity", + {1, 10, 100, 1'000, 10'000, 100'000, 1'000'000}) // key multiplicity range .add_string_axis("Distribution", {"UNIFORM"}); \ No newline at end of file From cc853c3447a27a49b9b3564dce1bf8d5221ec42e Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Wed, 4 Aug 2021 21:22:09 +0000 Subject: [PATCH 40/69] Additional benchmark setups for static_reduction_map. 
--- .../hash_table/static_reduction_map_bench.cu | 132 ++++++------------ 1 file changed, 44 insertions(+), 88 deletions(-) diff --git a/benchmarks/hash_table/static_reduction_map_bench.cu b/benchmarks/hash_table/static_reduction_map_bench.cu index 0d651139c..411e08c8d 100644 --- a/benchmarks/hash_table/static_reduction_map_bench.cu +++ b/benchmarks/hash_table/static_reduction_map_bench.cu @@ -24,7 +24,7 @@ /** * @brief Enum representation for reduction operators */ -enum class op_type { REDUCE_ADD, CUSTOM_OP }; +enum class op_type { REDUCE_ADD, CUSTOM_OP, CUSTOM_OP_NO_BACKOFF }; NVBENCH_DECLARE_ENUM_TYPE_STRINGS( // Enum type: @@ -36,6 +36,7 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( switch (o) { case op_type::REDUCE_ADD: return "REDUCE_ADD"; case op_type::CUSTOM_OP: return "CUSTOM_OP"; + case op_type::CUSTOM_OP_NO_BACKOFF: return "CUSTOM_OP_NO_BACKOFF"; default: return "ERROR"; } }, @@ -53,16 +54,27 @@ template struct op_type_map { }; +// Sum reduction with atomic fetch-and-add template <> struct op_type_map { template using type = cuco::reduce_add; }; +// Sum reduction with atomic compare-and-swap loop +// Note: default backoff strategy template <> struct op_type_map { template - using type = cuco::custom_op>; // sum reduction with CAS loop + using type = cuco::custom_op>; +}; + +// Sum reduction with atomic compare-and-swap loop +// Note: backoff strategy omitted +template <> +struct op_type_map { + template + using type = cuco::custom_op, 0>; }; /** @@ -79,91 +91,35 @@ void nvbench_cuco_static_reduction_map_insert( auto const dist = state.get_string("Distribution"); auto const multiplicity = state.get_int64_or_default("Multiplicity", 8); - std::vector h_keys(num_elems); - std::vector h_values(num_elems); + std::vector h_keys_in(num_elems); + std::vector h_values_in(num_elems); - generate_keys(state, dist, h_keys.begin(), h_keys.end(), multiplicity); + generate_keys(state, dist, h_keys_in.begin(), h_keys_in.end(), multiplicity); // generate uniform random values - 
generate_keys(state, "UNIFORM", h_values.begin(), h_values.end(), 1); + generate_keys(state, "UNIFORM", h_values_in.begin(), h_values_in.end(), 1); // the size of the hash table under a given target occupancy depends on the // number of unique keys in the input - std::size_t const unique = count_unique(h_keys.begin(), h_keys.end()); + std::size_t const unique = count_unique(h_keys_in.begin(), h_keys_in.end()); std::size_t const capacity = std::ceil(SDIV(unique, occupancy)); // alternative occupancy calculation based on the total number of inputs // std::size_t const capacity = num_elems / occupancy; - thrust::device_vector d_keys(h_keys); - thrust::device_vector d_values(h_values); + thrust::device_vector d_keys_in(h_keys_in); + thrust::device_vector d_values_in(h_values_in); - auto d_pairs_begin = - thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); - auto d_pairs_end = d_pairs_begin + num_elems; + auto d_pairs_in_begin = + thrust::make_zip_iterator(thrust::make_tuple(d_keys_in.begin(), d_values_in.begin())); + auto d_pairs_in_end = d_pairs_in_begin + num_elems; state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { map_type map{capacity, -1}; timer.start(); - map.insert(d_pairs_begin, d_pairs_end, launch.get_stream()); - timer.stop(); - }); -} - -/** - * @brief A benchmark evaluating insert performance. 
- */ -template -void nvbench_cuco_static_reduction_map_custom_op_insert( - nvbench::state& state, - nvbench::type_list, - nvbench::enum_type>) -{ - using custom_op_type = - cuco::custom_op, BackoffBaseDelay, BackoffMaxDelay>; - using map_type = cuco::static_reduction_map; - - auto const num_elems = state.get_int64("NumInputs"); - auto const occupancy = state.get_float64("Occupancy"); - auto const dist = state.get_string("Distribution"); - auto const multiplicity = state.get_int64_or_default("Multiplicity", 8); - - std::vector h_keys(num_elems); - std::vector h_values(num_elems); - - generate_keys(state, dist, h_keys.begin(), h_keys.end(), multiplicity); - - // generate uniform random values - generate_keys(state, "UNIFORM", h_values.begin(), h_values.end(), 1); - - // the size of the hash table under a given target occupancy depends on the - // number of unique keys in the input - std::size_t const unique = count_unique(h_keys.begin(), h_keys.end()); - std::size_t const capacity = std::ceil(SDIV(unique, occupancy)); - - // alternative occupancy calculation based on the total number of inputs - // std::size_t const capacity = num_elems / occupancy; - - thrust::device_vector d_keys(h_keys); - thrust::device_vector d_values(h_values); - - auto d_pairs_begin = - thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); - auto d_pairs_end = d_pairs_begin + num_elems; - - state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, - [&](nvbench::launch& launch, auto& timer) { - map_type map{capacity, -1}; - - timer.start(); - map.insert(d_pairs_begin, d_pairs_end, launch.get_stream()); + map.insert(d_pairs_in_begin, d_pairs_in_end, launch.get_stream()); timer.stop(); }); } @@ -171,11 +127,11 @@ void nvbench_cuco_static_reduction_map_custom_op_insert( // type parameter dimensions for benchmark using key_type_range = nvbench::type_list; using value_type_range = nvbench::type_list; -using op_type_range = nvbench::enum_type_list; -using 
base_delay_range = nvbench::enum_type_list<0, 8, 16, 32, 64, 128, 256>; -using max_delay_range = nvbench::enum_type_list<2048, 4096, 8192>; +using op_type_range = + nvbench::enum_type_list; // benchmark setups + NVBENCH_BENCH_TYPES(nvbench_cuco_static_reduction_map_insert, NVBENCH_TYPE_AXES(key_type_range, value_type_range, op_type_range)) .set_name("cuco_static_reduction_map_insert_occupancy") @@ -184,28 +140,28 @@ NVBENCH_BENCH_TYPES(nvbench_cuco_static_reduction_map_insert, .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs .add_float64_axis("Occupancy", nvbench::range(0.5, 0.9, 0.1)) // occupancy range .add_int64_axis("Multiplicity", {8}) // only applies to uniform distribution - .add_string_axis("Distribution", {"GAUSSIAN", "UNIFORM", "UNIQUE", "SAME"}); + .add_string_axis("Distribution", {"GAUSSIAN", "UNIFORM", "UNIQUE"}); +// Distribution "SAME" does not work with CUSTOM_OP NVBENCH_BENCH_TYPES(nvbench_cuco_static_reduction_map_insert, - NVBENCH_TYPE_AXES(key_type_range, value_type_range, op_type_range)) - .set_name("cuco_static_reduction_map_insert_multiplicity") + NVBENCH_TYPE_AXES(key_type_range, + value_type_range, + nvbench::enum_type_list)) + .set_name("cuco_static_reduction_map_insert_occupancy") .set_type_axes_names({"Key", "Value", "ReductionOp"}) .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. 
.add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs - .add_float64_axis("Occupancy", {0.8}) // fixed occupancy - .add_int64_axis("Multiplicity", {1, 10, 100, 1'000, 10'000, 100'000}) // key multiplicity range - .add_string_axis("Distribution", {"UNIFORM"}); - -NVBENCH_BENCH_TYPES(nvbench_cuco_static_reduction_map_custom_op_insert, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - base_delay_range, - max_delay_range)) - .set_name("cuco_static_reduction_map_custom_op_insert_contention") - .set_type_axes_names({"Key", "Value", "BackoffBaseDelay", "BackoffMaxDelay"}) + .add_float64_axis("Occupancy", nvbench::range(0.5, 0.9, 0.1)) // occupancy range + .add_int64_axis("Multiplicity", {8}) // only applies to uniform distribution + .add_string_axis("Distribution", {"SAME"}); + +NVBENCH_BENCH_TYPES(nvbench_cuco_static_reduction_map_insert, + NVBENCH_TYPE_AXES(key_type_range, value_type_range, op_type_range)) + .set_name("cuco_static_reduction_map_insert_multiplicity") + .set_type_axes_names({"Key", "Value", "ReductionOp"}) .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs - .add_float64_axis("Occupancy", {0.8}) // fixed occupancy + .add_float64_axis("Occupancy", nvbench::range(0.5, 0.9, 0.1)) .add_int64_axis("Multiplicity", - {1, 10, 100, 1'000, 10'000, 100'000, 200'000}) // key multiplicity range + {1, 10, 100, 1'000, 10'000, 100'000, 1'000'000}) // key multiplicity range .add_string_axis("Distribution", {"UNIFORM"}); \ No newline at end of file From 28069d0f0a791f23ecd8ebe77f82eda2e1113c95 Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Thu, 5 Aug 2021 22:29:09 +0000 Subject: [PATCH 41/69] Make key_generator.hpp usable from other benchmark suites. 
--- benchmarks/CMakeLists.txt | 5 +++-- .../hash_table/static_reduction_map_bench.cu | 7 +++++-- .../static_reduction_map_param_grid_search.cu | 7 +++++-- benchmarks/key_generator.hpp | 21 ++++++++++--------- .../reduce_by_key/cub_reduce_by_key_bench.cu | 7 +++++-- .../reduce_by_key/cuco_reduce_by_key_bench.cu | 7 +++++-- .../thrust_reduce_by_key_bench.cu | 7 +++++-- 7 files changed, 39 insertions(+), 22 deletions(-) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 5f16ca5bf..4db4d74d6 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -33,7 +33,7 @@ function(ConfigureBench BENCH_NAME BENCH_SRC) add_executable(${BENCH_NAME} "${BENCH_SRC}") set_target_properties(${BENCH_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/gbenchmarks") + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/benchmarks") target_include_directories(${BENCH_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") target_compile_options(${BENCH_NAME} PRIVATE --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage) @@ -49,7 +49,8 @@ function(ConfigureNVBench BENCH_NAME BENCH_SRC) add_executable(${BENCH_NAME} "${BENCH_SRC}") set_target_properties(${BENCH_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/nvbenchmarks") + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/benchmarks" + COMPILE_FLAGS -DNVBENCH_MODULE) target_include_directories(${BENCH_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") #"${NVBench_SOURCE_DIR}") diff --git a/benchmarks/hash_table/static_reduction_map_bench.cu b/benchmarks/hash_table/static_reduction_map_bench.cu index 411e08c8d..863477a7c 100644 --- a/benchmarks/hash_table/static_reduction_map_bench.cu +++ b/benchmarks/hash_table/static_reduction_map_bench.cu @@ -94,10 +94,13 @@ void nvbench_cuco_static_reduction_map_insert( std::vector h_keys_in(num_elems); std::vector h_values_in(num_elems); - generate_keys(state, dist, 
h_keys_in.begin(), h_keys_in.end(), multiplicity); + if (not generate_keys(dist, h_keys_in.begin(), h_keys_in.end(), multiplicity)) { + state.skip("Invalid distribution."); + return; + } // generate uniform random values - generate_keys(state, "UNIFORM", h_values_in.begin(), h_values_in.end(), 1); + generate_keys("UNIFORM", h_values_in.begin(), h_values_in.end(), 1); // the size of the hash table under a given target occupancy depends on the // number of unique keys in the input diff --git a/benchmarks/hash_table/static_reduction_map_param_grid_search.cu b/benchmarks/hash_table/static_reduction_map_param_grid_search.cu index 41baaa872..27bcbf38d 100644 --- a/benchmarks/hash_table/static_reduction_map_param_grid_search.cu +++ b/benchmarks/hash_table/static_reduction_map_param_grid_search.cu @@ -47,10 +47,13 @@ void nvbench_cuco_static_reduction_map_custom_op_backoff_delay( std::vector h_keys(num_elems); std::vector h_values(num_elems); - generate_keys(state, dist, h_keys.begin(), h_keys.end(), multiplicity); + if (not generate_keys(dist, h_keys.begin(), h_keys.end(), multiplicity)) { + state.skip("Invalid input distribution."); + return; + } // generate uniform random values - generate_keys(state, "UNIFORM", h_values.begin(), h_values.end(), 1); + generate_keys("UNIFORM", h_values.begin(), h_values.end(), 1); // the size of the hash table under a given target occupancy depends on the // number of unique keys in the input diff --git a/benchmarks/key_generator.hpp b/benchmarks/key_generator.hpp index c16015866..6959e6823 100644 --- a/benchmarks/key_generator.hpp +++ b/benchmarks/key_generator.hpp @@ -18,12 +18,14 @@ #include #include -#include #include #include enum class dist_type { GAUSSIAN, GEOMETRIC, UNIFORM, UNIQUE, SAME }; +#if defined(NVBENCH_MODULE) +#include + NVBENCH_DECLARE_ENUM_TYPE_STRINGS( // Enum type: dist_type, @@ -46,10 +48,10 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( // input string. 
// Just use `[](auto) { return std::string{}; }` if you don't want these. [](auto) { return std::string{}; }) +#endif template -static void generate_keys(nvbench::state& state, - dist_type dist, +static bool generate_keys(dist_type dist, OutputIt output_begin, OutputIt output_end, std::size_t multiplicity = 8) @@ -107,15 +109,15 @@ static void generate_keys(nvbench::state& state, break; } default: { - state.skip("unknown distribution type"); - break; + return false; } } // switch + + return true; } template -static void generate_keys(nvbench::state& state, - std::string const& dist, +static bool generate_keys(std::string const& dist, OutputIt output_begin, OutputIt output_end, std::size_t multiplicity = 8) @@ -133,11 +135,10 @@ static void generate_keys(nvbench::state& state, } else if (dist == "SAME") { enum_value = dist_type::SAME; } else { - state.skip("unknown distribution type"); - return; + return false; } - generate_keys(state, enum_value, output_begin, output_end, multiplicity); + return generate_keys(enum_value, output_begin, output_end, multiplicity); } template diff --git a/benchmarks/reduce_by_key/cub_reduce_by_key_bench.cu b/benchmarks/reduce_by_key/cub_reduce_by_key_bench.cu index 20553238b..efbe7799d 100644 --- a/benchmarks/reduce_by_key/cub_reduce_by_key_bench.cu +++ b/benchmarks/reduce_by_key/cub_reduce_by_key_bench.cu @@ -32,10 +32,13 @@ void nvbench_cub_reduce_by_key(nvbench::state& state, nvbench::type_list h_keys(num_elems_in); std::vector h_values(num_elems_in); - generate_keys(state, dist, h_keys.begin(), h_keys.end(), multiplicity); + if (not generate_keys(dist, h_keys.begin(), h_keys.end(), multiplicity)) { + state.skip("Invalid input distribution."); + return; + } // generate uniform random values - generate_keys(state, "UNIFORM", h_values.begin(), h_values.end(), 1); + generate_keys("UNIFORM", h_values.begin(), h_values.end(), 1); // double buffer (ying/yang) thrust::device_vector d_keys_ying(h_keys); diff --git 
a/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu b/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu index ecbdd06e4..96d5ee2b3 100644 --- a/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu +++ b/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu @@ -82,10 +82,13 @@ void nvbench_cuco_static_reduction_map_reduce_by_key( std::vector h_keys(num_elems); std::vector h_values(num_elems); - generate_keys(state, dist, h_keys.begin(), h_keys.end(), multiplicity); + if (not generate_keys(dist, h_keys.begin(), h_keys.end(), multiplicity)) { + state.skip("Invalid input distribution."); + return; + } // generate uniform random values - generate_keys(state, "UNIFORM", h_values.begin(), h_values.end(), 1); + generate_keys("UNIFORM", h_values.begin(), h_values.end(), 1); // the size of the hash table under a given target occupancy depends on the // number of unique keys in the input diff --git a/benchmarks/reduce_by_key/thrust_reduce_by_key_bench.cu b/benchmarks/reduce_by_key/thrust_reduce_by_key_bench.cu index acd8d9f8d..8cc5ef3cc 100644 --- a/benchmarks/reduce_by_key/thrust_reduce_by_key_bench.cu +++ b/benchmarks/reduce_by_key/thrust_reduce_by_key_bench.cu @@ -58,10 +58,13 @@ void nvbench_thrust_reduce_by_key(nvbench::state& state, nvbench::type_list h_keys(num_elems); std::vector h_values(num_elems); - generate_keys(state, dist, h_keys.begin(), h_keys.end(), multiplicity); + if (not generate_keys(dist, h_keys.begin(), h_keys.end(), multiplicity)) { + state.skip("Invalid input distribution."); + return; + } // generate uniform random values - generate_keys(state, "UNIFORM", h_values.begin(), h_values.end(), 1); + generate_keys("UNIFORM", h_values.begin(), h_values.end(), 1); thrust::device_vector d_keys(h_keys); thrust::device_vector d_values(h_values); From 26787adedfbc88a0f74d7ba0abd9d534eab99993 Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Thu, 5 Aug 2021 23:39:19 +0000 Subject: [PATCH 42/69] Fix for make_from_uninitialized_slots. 
--- include/cuco/static_reduction_map.cuh | 34 +++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index 06b9ce454..7e403e5bd 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -596,6 +596,33 @@ class static_reduction_map { return &slots_[(index + g.size()) % capacity_]; } + /** + * @brief Initializes the given array of slots to the specified values given by `k` and `v` + * using the threads in the group `g`. + * + * @note This function synchronizes the group `g`. + * + * @tparam CG The type of the cooperative thread group + * @param g The cooperative thread group used to initialize the slots + * @param slots Pointer to the array of slots to initialize + * @param num_slots Number of slots to initialize + * @param k The desired key value for each slot + * @param v The desired mapped value for each slot + */ + + template + __device__ static void initialize_slots( + CG g, pair_atomic_type* slots, std::size_t num_slots, Key k, Value v) + { + auto tid = g.thread_rank(); + while (tid < num_slots) { + new (&slots[tid].first) atomic_key_type{k}; + new (&slots[tid].second) atomic_mapped_type{v}; + tid += g.size(); + } + g.sync(); + } + public: /** * @brief Gets the maximum number of elements the hash map can hold. 
@@ -732,17 +759,20 @@ class static_reduction_map { : device_view_base{slots, capacity, empty_key_sentinel, reduction_op} { } + template __device__ static device_mutable_view make_from_uninitialized_slots( CG const& g, pair_atomic_type* slots, std::size_t capacity, Key empty_key_sentinel, - ReductionOp reduction_op) noexcept + ReductionOp reduction_op = {}) noexcept { - device_view_base::initialize_slots(g, slots, capacity, empty_key_sentinel, reduction_op); + device_view_base::initialize_slots( + g, slots, capacity, empty_key_sentinel, ReductionOp::identity); return device_mutable_view{slots, capacity, empty_key_sentinel, reduction_op}; } + /** * @brief Inserts the specified key/value pair into the map. * From e2a81b37de9248c8459245837bca98cb5f180b65 Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Thu, 5 Aug 2021 23:46:29 +0000 Subject: [PATCH 43/69] [WIP] Added benchmark for static_reduction_map in shared memory. --- benchmarks/CMakeLists.txt | 28 ++++++ .../static_reduction_map_smem_bench.cu | 98 +++++++++++++++++++ 2 files changed, 126 insertions(+) create mode 100644 benchmarks/hash_table/static_reduction_map_smem_bench.cu diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 4db4d74d6..dcaa3ce22 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -24,6 +24,14 @@ CPMAddPackage( GIT_SHALLOW TRUE ) +# device-side benchmark tool +CPMAddPackage( + NAME cuda_benchmark + GITHUB_REPOSITORY sleeepyjack/cuda_benchmark + GIT_TAG master + GIT_SHALLOW TRUE +) + set_target_properties(benchmark PROPERTIES CXX_STANDARD 17) ################################################################################################### @@ -61,6 +69,22 @@ function(ConfigureNVBench BENCH_NAME BENCH_SRC) cuco) endfunction(ConfigureNVBench) +################################################################################################### +function(ConfigureCUDABench BENCH_NAME BENCH_SRC) + add_executable(${BENCH_NAME} "${BENCH_SRC}") + 
set_target_properties(${BENCH_NAME} PROPERTIES + POSITION_INDEPENDENT_CODE ON + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/benchmarks") + target_include_directories(${BENCH_NAME} PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}") + #"${NVBench_SOURCE_DIR}") + target_compile_options(${BENCH_NAME} PRIVATE --expt-extended-lambda --expt-relaxed-constexpr) + target_link_libraries(${BENCH_NAME} PRIVATE + cuda_benchmark + pthread + cuco) +endfunction(ConfigureCUDABench) + ################################################################################################### ### test sources ################################################################################## ################################################################################################### @@ -89,6 +113,10 @@ ConfigureNVBench(THRUST_RBK_BENCH "${THRUST_RBK_BENCH_SRC}") set(CUB_RBK_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reduce_by_key/cub_reduce_by_key_bench.cu") ConfigureNVBench(CUB_RBK_BENCH "${CUB_RBK_BENCH_SRC}") +################################################################################################### +set(STATIC_REDUCTION_MAP_SMEM_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_reduction_map_smem_bench.cu") +ConfigureCUDABench(STATIC_REDUCTION_MAP_SMEM_BENCH "${STATIC_REDUCTION_MAP_SMEM_BENCH_SRC}") + ################################################################################################### set(STATIC_REDUCTION_MAP_PARAM_GRID_SEARCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_reduction_map_param_grid_search.cu") ConfigureNVBench(STATIC_REDUCTION_MAP_PARAM_GRID_SEARCH "${STATIC_REDUCTION_MAP_PARAM_GRID_SEARCH_SRC}") \ No newline at end of file diff --git a/benchmarks/hash_table/static_reduction_map_smem_bench.cu b/benchmarks/hash_table/static_reduction_map_smem_bench.cu new file mode 100644 index 000000000..4cdbe80e7 --- /dev/null +++ b/benchmarks/hash_table/static_reduction_map_smem_bench.cu @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +template +void static_reduction_map_smem_insert_bench(cuda_benchmark::controller& controller, + std::size_t num_elems, + float occupancy, + dist_type dist, + std::size_t multiplicity = 8) +{ + using map_type = cuco::static_reduction_map, Key, Value>; + using pair_type = typename map_type::value_type; + + int dev_id; + cudaGetDevice(&dev_id); + struct cudaDeviceProp dev_props; + cudaGetDeviceProperties(&dev_props, dev_id); + std::size_t const max_smem = dev_props.sharedMemPerBlock; + std::size_t const max_capacity = max_smem / sizeof(pair_type); + + std::vector h_keys_in(num_elems); + std::vector h_values_in(num_elems); + + if (not generate_keys(dist, h_keys_in.begin(), h_keys_in.end(), multiplicity)) { + std::cerr << "[ERROR] Invalid input distribution.\n"; + return; + } + + // generate uniform random values + generate_keys("UNIFORM", h_values_in.begin(), h_values_in.end(), 1); + + // the size of the hash table under a given target occupancy depends on the + // number of unique keys in the input + std::size_t const unique = count_unique(h_keys_in.begin(), h_keys_in.end()); + std::size_t const capacity = std::ceil(SDIV(unique, occupancy)); + + if (capacity > max_capacity) { + std::cerr << "[ERROR] Not enough shared memory available. 
(" << capacity * sizeof(pair_type) + << ">" << max_capacity * sizeof(pair_type) << " bytes)\n"; + return; + } + + thrust::device_vector d_keys_in(h_keys_in); + thrust::device_vector d_values_in(h_values_in); + + controller.benchmark( + "static_reduction_map shared memory insert", + [=, keys_ptr = d_keys_in.data().get(), values_ptr = d_values_in.data().get()] __device__( + cuda_benchmark::state & state) { + using map_type = typename cuco::static_reduction_map, + Key, + Value, + cuda::thread_scope_block>; + using map_view_type = typename map_type::device_mutable_view; + + __shared__ typename map_type::pair_atomic_type* slots; + + auto g = cooperative_groups::this_thread_block(); + auto map = map_view_type::make_from_uninitialized_slots(g, slots, capacity, -1); + auto pair = pair_type(keys_ptr[g.thread_rank()], values_ptr[g.thread_rank()]); + + g.sync(); + + for (auto _ : state) { + map.insert(pair); + g.sync(); + } + }, + max_smem); +} + +int main() +{ + cuda_benchmark::controller controller(1024, 1); + + static_reduction_map_smem_insert_bench( + controller, 10'000, 0.8, dist_type::UNIFORM); +} \ No newline at end of file From 9807d8f20b7101c0e690423abcb4a14c778ab39f Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Fri, 6 Aug 2021 16:39:06 -0700 Subject: [PATCH 44/69] Added definition for slot_type. 
--- include/cuco/static_reduction_map.cuh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index 7e403e5bd..247b1df18 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -270,6 +270,7 @@ class static_reduction_map { using atomic_key_type = cuda::atomic; using atomic_mapped_type = cuda::atomic; using pair_atomic_type = cuco::pair_type; + using slot_type = pair_atomic_type; using atomic_ctr_type = cuda::atomic; using allocator_type = Allocator; using slot_allocator_type = @@ -432,6 +433,7 @@ class static_reduction_map { using mapped_type = Value; using iterator = pair_atomic_type*; using const_iterator = pair_atomic_type const*; + using slot_type = pair_atomic_type; private: pair_atomic_type* slots_{}; ///< Pointer to flat slots storage @@ -741,6 +743,8 @@ class static_reduction_map { using mapped_type = typename device_view_base::mapped_type; using iterator = typename device_view_base::iterator; using const_iterator = typename device_view_base::const_iterator; + using slot_type = typename device_view_base::slot_type; + /** * @brief Construct a mutable view of the first `capacity` slots of the * slots array pointed to by `slots`. @@ -838,6 +842,8 @@ class static_reduction_map { using mapped_type = typename device_view_base::mapped_type; using iterator = typename device_view_base::iterator; using const_iterator = typename device_view_base::const_iterator; + using slot_type = typename device_view_base::slot_type; + /** * @brief Construct a view of the first `capacity` slots of the * slots array pointed to by `slots`. From 0c1bd4d98f2afeefb8ba00e2b8477a1bbe7f9aaa Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Fri, 6 Aug 2021 16:41:05 -0700 Subject: [PATCH 45/69] Change visibility of get_slots() from protected to public.
(Fix for device_view ctor) --- include/cuco/static_reduction_map.cuh | 40 +++++++++++++-------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index 247b1df18..a09f2a25a 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -455,26 +455,6 @@ class static_reduction_map { { } - /** - * @brief Gets the binary op - * - */ - __device__ ReductionOp get_op() const noexcept { return op_; } - - /** - * @brief Gets slots array. - * - * @return Slots array - */ - __device__ pair_atomic_type* get_slots() noexcept { return slots_; } - - /** - * @brief Gets slots array. - * - * @return Slots array - */ - __device__ pair_atomic_type const* get_slots() const noexcept { return slots_; } - /** * @brief Returns the initial slot for a given key `k` * @@ -626,6 +606,26 @@ class static_reduction_map { } public: + /** + * @brief Gets the binary op + * + */ + __device__ ReductionOp get_op() const noexcept { return op_; } + + /** + * @brief Gets slots array. + * + * @return Slots array + */ + __device__ pair_atomic_type* get_slots() noexcept { return slots_; } + + /** + * @brief Gets slots array. + * + * @return Slots array + */ + __device__ pair_atomic_type const* get_slots() const noexcept { return slots_; } + /** * @brief Gets the maximum number of elements the hash map can hold. * From 58a2ead931b7b039928f9734f53e28a9683b661f Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Fri, 6 Aug 2021 17:21:25 -0700 Subject: [PATCH 46/69] Move test helpers to util.hpp. 
--- tests/CMakeLists.txt | 2 + tests/dynamic_map/dynamic_map_test.cu | 95 ++++++++----------- tests/static_map/static_map_test.cu | 27 +----- .../static_reduction_map_test.cu | 26 +---- tests/util.hpp | 42 ++++++++ 5 files changed, 84 insertions(+), 108 deletions(-) create mode 100644 tests/util.hpp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 45435b14e..71d3942e2 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -28,6 +28,8 @@ function(ConfigureTest TEST_NAME TEST_SRC) target_link_libraries(${TEST_NAME} Catch2::Catch2 cuco CUDA::cudart) set_target_properties(${TEST_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests") + target_include_directories(${TEST_NAME} PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}") target_compile_options(${TEST_NAME} PRIVATE --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage) catch_discover_tests(${TEST_NAME}) endfunction(ConfigureTest) diff --git a/tests/dynamic_map/dynamic_map_test.cu b/tests/dynamic_map/dynamic_map_test.cu index 3e4b94f02..7b9fd19b7 100644 --- a/tests/dynamic_map/dynamic_map_test.cu +++ b/tests/dynamic_map/dynamic_map_test.cu @@ -14,76 +14,53 @@ * limitations under the License. 
*/ -#include #include #include #include #include #include #include +#include -enum class dist_type { - UNIQUE, - UNIFORM, - GAUSSIAN -}; +enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; -template -static void generate_keys(OutputIt output_begin, OutputIt output_end) { +template +static void generate_keys(OutputIt output_begin, OutputIt output_end) +{ auto num_keys = std::distance(output_begin, output_end); std::random_device rd; std::mt19937 gen{rd()}; - switch(Dist) { + switch (Dist) { case dist_type::UNIQUE: - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = i; } break; case dist_type::UNIFORM: - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = std::abs(static_cast(gen())); } break; case dist_type::GAUSSIAN: std::normal_distribution<> dg{1e9, 1e7}; - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = std::abs(static_cast(dg(gen))); } break; } } -namespace { -// Thrust logical algorithms (any_of/all_of/none_of) don't work with device -// lambdas: See https://github.com/thrust/thrust/issues/1062 -template -bool all_of(Iterator begin, Iterator end, Predicate p) -{ - auto size = thrust::distance(begin, end); - return size == thrust::count_if(begin, end, p); -} - -template -bool any_of(Iterator begin, Iterator end, Predicate p) -{ - return thrust::count_if(begin, end, p) > 0; -} - -template -bool none_of(Iterator begin, Iterator end, Predicate p) -{ - return not all_of(begin, end, p); -} -} // namespace - - -TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", +TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", + "", ((typename T, dist_type Dist), T, Dist), - (int32_t, dist_type::UNIQUE), (int64_t, dist_type::UNIQUE), - (int32_t, dist_type::UNIFORM), (int64_t, dist_type::UNIFORM), - (int32_t, dist_type::GAUSSIAN), (int64_t, dist_type::GAUSSIAN)) + (int32_t, dist_type::UNIQUE), + (int64_t, dist_type::UNIQUE), + (int32_t, 
dist_type::UNIFORM), + (int64_t, dist_type::UNIFORM), + (int32_t, dist_type::GAUSSIAN), + (int64_t, dist_type::GAUSSIAN)) { using Key = T; using Value = T; @@ -91,25 +68,25 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", constexpr std::size_t num_keys{50'000'000}; cuco::dynamic_map map{30'000'000, -1, -1}; - std::vector h_keys( num_keys ); - std::vector h_values( num_keys ); - std::vector> h_pairs ( num_keys ); + std::vector h_keys(num_keys); + std::vector h_values(num_keys); + std::vector> h_pairs(num_keys); generate_keys(h_keys.begin(), h_keys.end()); - for(auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_values[i] = val; - h_pairs[i].first = key; + for (auto i = 0; i < num_keys; ++i) { + Key key = h_keys[i]; + Value val = h_keys[i]; + h_values[i] = val; + h_pairs[i].first = key; h_pairs[i].second = val; } - thrust::device_vector d_keys( h_keys ); - thrust::device_vector d_values( h_values ); - thrust::device_vector> d_pairs( h_pairs ); - thrust::device_vector d_results( num_keys ); - thrust::device_vector d_contained( num_keys ); + thrust::device_vector d_keys(h_keys); + thrust::device_vector d_values(h_values); + thrust::device_vector> d_pairs(h_pairs); + thrust::device_vector d_results(num_keys); + thrust::device_vector d_contained(num_keys); // bulk function test cases SECTION("All inserted keys-value pairs should be correctly recovered during find") @@ -118,8 +95,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", map.find(d_keys.begin(), d_keys.end(), d_results.begin()); auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), d_values.begin())); - REQUIRE(all_of(zip, zip + num_keys, - [] __device__(auto const& p) { + REQUIRE(all_of(zip, zip + num_keys, [] __device__(auto const& p) { return thrust::get<0>(p) == thrust::get<1>(p); })); } @@ -128,7 +104,8 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", { map.find(d_keys.begin(), d_keys.end(), d_results.begin()); - 
REQUIRE(all_of(d_results.begin(), d_results.end(), [] __device__(auto const& p) { return p == -1; })); + REQUIRE( + all_of(d_results.begin(), d_results.end(), [] __device__(auto const& p) { return p == -1; })); } SECTION("All inserted keys-value pairs should be contained") @@ -136,13 +113,15 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", map.insert(d_pairs.begin(), d_pairs.end()); map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); - REQUIRE(all_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); + REQUIRE( + all_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); } SECTION("Non-inserted keys-value pairs should not be contained") { map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); - REQUIRE(none_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); + REQUIRE( + none_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); } } \ No newline at end of file diff --git a/tests/static_map/static_map_test.cu b/tests/static_map/static_map_test.cu index b52f3f367..ea68981ba 100644 --- a/tests/static_map/static_map_test.cu +++ b/tests/static_map/static_map_test.cu @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include #include #include @@ -22,31 +21,7 @@ #include #include #include - -namespace { -namespace cg = cooperative_groups; - -// Thrust logical algorithms (any_of/all_of/none_of) don't work with device -// lambdas: See https://github.com/thrust/thrust/issues/1062 -template -bool all_of(Iterator begin, Iterator end, Predicate p) -{ - auto size = thrust::distance(begin, end); - return size == thrust::count_if(begin, end, p); -} - -template -bool any_of(Iterator begin, Iterator end, Predicate p) -{ - return thrust::count_if(begin, end, p) > 0; -} - -template -bool none_of(Iterator begin, Iterator end, Predicate p) -{ - return not all_of(begin, end, p); -} -} // namespace +#include enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; diff --git a/tests/static_reduction_map/static_reduction_map_test.cu b/tests/static_reduction_map/static_reduction_map_test.cu index bb57f0847..023cb7a7b 100644 --- a/tests/static_reduction_map/static_reduction_map_test.cu +++ b/tests/static_reduction_map/static_reduction_map_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,36 +14,14 @@ * limitations under the License. 
*/ -#include #include #include #include #include #include #include +#include -namespace { -// Thrust logical algorithms (any_of/all_of/none_of) don't work with device -// lambdas: See https://github.com/thrust/thrust/issues/1062 -template -bool all_of(Iterator begin, Iterator end, Predicate p) -{ - auto size = thrust::distance(begin, end); - return size == thrust::count_if(begin, end, p); -} - -template -bool any_of(Iterator begin, Iterator end, Predicate p) -{ - return thrust::count_if(begin, end, p) > 0; -} - -template -bool none_of(Iterator begin, Iterator end, Predicate p) -{ - return not all_of(begin, end, p); -} -} // namespace TEMPLATE_TEST_CASE_SIG("Insert all identical keys", "", diff --git a/tests/util.hpp b/tests/util.hpp new file mode 100644 index 000000000..bb10a7a1f --- /dev/null +++ b/tests/util.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace cg = cooperative_groups; + +// Thrust logical algorithms (any_of/all_of/none_of) don't work with device +// lambdas: See https://github.com/thrust/thrust/issues/1062 +template +bool all_of(Iterator begin, Iterator end, Predicate p) +{ + auto size = thrust::distance(begin, end); + return size == thrust::count_if(begin, end, p); +} + +template +bool any_of(Iterator begin, Iterator end, Predicate p) +{ + return thrust::count_if(begin, end, p) > 0; +} + +template +bool none_of(Iterator begin, Iterator end, Predicate p) +{ + return not any_of(begin, end, p); +} \ No newline at end of file From f4979b1bd8a10489b0fd35a43690828e375f1e15 Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Fri, 6 Aug 2021 17:22:31 -0700 Subject: [PATCH 47/69] Added tests for custom_op and shared memory hash table. --- .../static_reduction_map_test.cu | 62 +++++++++++++++++-- 1 file changed, 56 insertions(+), 6 deletions(-) diff --git a/tests/static_reduction_map/static_reduction_map_test.cu b/tests/static_reduction_map/static_reduction_map_test.cu index 023cb7a7b..51fee5740 100644 --- a/tests/static_reduction_map/static_reduction_map_test.cu +++ b/tests/static_reduction_map/static_reduction_map_test.cu @@ -16,23 +16,28 @@ #include #include +#include #include #include #include #include #include +// cuco::custom op functor that should give the same result as cuco::reduce_add +template +using custom_reduce_add = cuco::custom_op, 0>; TEMPLATE_TEST_CASE_SIG("Insert all identical keys", "", - ((typename Key, typename Value), Key, Value), - (int32_t, int32_t)) + ((typename Key, typename Value, typename Op), Key, Value, Op), + (int32_t, int32_t, cuco::reduce_add), + (int32_t, int32_t, custom_reduce_add)) { thrust::device_vector keys(100, 42); thrust::device_vector values(keys.size(), 1); auto const num_slots{keys.size() * 2}; - cuco::static_reduction_map, Key, Value> map{num_slots, -1}; + cuco::static_reduction_map map{num_slots, -1}; auto zip =
thrust::make_zip_iterator(thrust::make_tuple(keys.begin(), values.begin())); auto zip_end = zip + keys.size(); @@ -62,12 +67,13 @@ TEMPLATE_TEST_CASE_SIG("Insert all identical keys", TEMPLATE_TEST_CASE_SIG("Insert all unique keys", "", - ((typename Key, typename Value), Key, Value), - (int32_t, int32_t)) + ((typename Key, typename Value, typename Op), Key, Value, Op), + (int32_t, int32_t, cuco::reduce_add), + (int32_t, int32_t, custom_reduce_add)) { constexpr std::size_t num_keys = 10000; constexpr std::size_t num_slots{num_keys * 2}; - cuco::static_reduction_map, Key, Value> map{num_slots, -1}; + cuco::static_reduction_map map{num_slots, -1}; auto keys_begin = thrust::make_counting_iterator(0); auto values_begin = thrust::make_counting_iterator(0); @@ -94,3 +100,47 @@ TEMPLATE_TEST_CASE_SIG("Insert all unique keys", REQUIRE(thrust::equal(thrust::device, values_begin, values_begin + num_keys, found.begin())); } } + +template +__global__ void static_reduction_map_shared_memory_kernel(bool* key_found) +{ + using Key = typename MapType::key_type; + using Value = typename MapType::mapped_type; + + namespace cg = cooperative_groups; + using mutable_view_type = typename MapType::device_mutable_view; + using view_type = typename MapType::device_view; + __shared__ typename mutable_view_type::slot_type slots[N]; + auto map = + mutable_view_type::make_from_uninitialized_slots(cg::this_thread_block(), &slots[0], N, -1); + + auto g = cg::this_thread_block(); + std::size_t index = threadIdx.x + blockIdx.x * blockDim.x; + int rank = g.thread_rank(); + + // insert {thread_rank, thread_rank} for each thread in thread-block + map.insert(cuco::pair(rank, rank)); + g.sync(); + + auto find_map = view_type(map); + auto retrieved_pair = find_map.find(rank); + if (retrieved_pair != find_map.end() && retrieved_pair->second == rank) { + key_found[index] = true; + } +} + +TEMPLATE_TEST_CASE_SIG("Shared memory hast table.", + "", + ((typename Key, typename Value, typename Op), Key, Value, 
Op), + (int32_t, int32_t, cuco::reduce_add), + (int32_t, int32_t, custom_reduce_add)) +{ + constexpr std::size_t N = 256; + thrust::device_vector key_found(N, false); + + static_reduction_map_shared_memory_kernel< + cuco::static_reduction_map, + N><<<8, 32>>>(key_found.data().get()); + + REQUIRE(all_of(key_found.begin(), key_found.end(), thrust::identity{})); +} From 01e75bda11af7b59b7a8eef0971f2eec880768a1 Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Fri, 6 Aug 2021 16:39:06 -0700 Subject: [PATCH 48/69] Added definition for slot_type. --- include/cuco/static_reduction_map.cuh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index 7e403e5bd..247b1df18 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -270,6 +270,7 @@ class static_reduction_map { using atomic_key_type = cuda::atomic; using atomic_mapped_type = cuda::atomic; using pair_atomic_type = cuco::pair_type; + using slot_type = pair_atomic_type; using atomic_ctr_type = cuda::atomic; using allocator_type = Allocator; using slot_allocator_type = @@ -432,6 +433,7 @@ class static_reduction_map { using mapped_type = Value; using iterator = pair_atomic_type*; using const_iterator = pair_atomic_type const*; + using slot_type = pair_atomic_type; private: pair_atomic_type* slots_{}; ///< Pointer to flat slots storage @@ -741,6 +743,8 @@ class static_reduction_map { using mapped_type = typename device_view_base::mapped_type; using iterator = typename device_view_base::iterator; using const_iterator = typename device_view_base::const_iterator; + using slot_type = typename device_view_base::slot_type; + /** * @brief Construct a mutable view of the first `capacity` slots of the * slots array pointed to by `slots`.
@@ -838,6 +842,8 @@ class static_reduction_map { using mapped_type = typename device_view_base::mapped_type; using iterator = typename device_view_base::iterator; using const_iterator = typename device_view_base::const_iterator; + using slot_type = typename device_view_base::slot_type; + /** * @brief Construct a view of the first `capacity` slots of the * slots array pointed to by `slots`. From 1e866591f320edfaa436967a856abb4c4b96505c Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Fri, 6 Aug 2021 16:41:05 -0700 Subject: [PATCH 49/69] Change visibility of get_slots() from protected to public. (Fix for device_view ctor) --- include/cuco/static_reduction_map.cuh | 40 +++++++++++++-------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index 247b1df18..a09f2a25a 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -455,26 +455,6 @@ class static_reduction_map { { } - /** - * @brief Gets the binary op - * - */ - __device__ ReductionOp get_op() const noexcept { return op_; } - - /** - * @brief Gets slots array. - * - * @return Slots array - */ - __device__ pair_atomic_type* get_slots() noexcept { return slots_; } - - /** - * @brief Gets slots array. - * - * @return Slots array - */ - __device__ pair_atomic_type const* get_slots() const noexcept { return slots_; } - /** * @brief Returns the initial slot for a given key `k` * @@ -626,6 +606,26 @@ class static_reduction_map { } public: + /** + * @brief Gets the binary op + * + */ + __device__ ReductionOp get_op() const noexcept { return op_; } + + /** + * @brief Gets slots array. + * + * @return Slots array + */ + __device__ pair_atomic_type* get_slots() noexcept { return slots_; } + + /** + * @brief Gets slots array. 
+ * + * @return Slots array + */ + __device__ pair_atomic_type const* get_slots() const noexcept { return slots_; } + /** * @brief Gets the maximum number of elements the hash map can hold. * From 21be2e1353bc2b5a5c77429ae10655ed313b20be Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Fri, 6 Aug 2021 17:21:25 -0700 Subject: [PATCH 50/69] Move test helpers to util.hpp. --- tests/CMakeLists.txt | 2 + tests/dynamic_map/dynamic_map_test.cu | 95 ++++++++----------- tests/static_map/static_map_test.cu | 27 +----- .../static_reduction_map_test.cu | 26 +---- tests/util.hpp | 42 ++++++++ 5 files changed, 84 insertions(+), 108 deletions(-) create mode 100644 tests/util.hpp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 45435b14e..71d3942e2 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -28,6 +28,8 @@ function(ConfigureTest TEST_NAME TEST_SRC) target_link_libraries(${TEST_NAME} Catch2::Catch2 cuco CUDA::cudart) set_target_properties(${TEST_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests") + target_include_directories(${TEST_NAME} PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}") target_compile_options(${TEST_NAME} PRIVATE --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage) catch_discover_tests(${TEST_NAME}) endfunction(ConfigureTest) diff --git a/tests/dynamic_map/dynamic_map_test.cu b/tests/dynamic_map/dynamic_map_test.cu index 3e4b94f02..7b9fd19b7 100644 --- a/tests/dynamic_map/dynamic_map_test.cu +++ b/tests/dynamic_map/dynamic_map_test.cu @@ -14,76 +14,53 @@ * limitations under the License. 
*/ -#include #include #include #include #include #include #include +#include -enum class dist_type { - UNIQUE, - UNIFORM, - GAUSSIAN -}; +enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; -template -static void generate_keys(OutputIt output_begin, OutputIt output_end) { +template +static void generate_keys(OutputIt output_begin, OutputIt output_end) +{ auto num_keys = std::distance(output_begin, output_end); std::random_device rd; std::mt19937 gen{rd()}; - switch(Dist) { + switch (Dist) { case dist_type::UNIQUE: - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = i; } break; case dist_type::UNIFORM: - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = std::abs(static_cast(gen())); } break; case dist_type::GAUSSIAN: std::normal_distribution<> dg{1e9, 1e7}; - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = std::abs(static_cast(dg(gen))); } break; } } -namespace { -// Thrust logical algorithms (any_of/all_of/none_of) don't work with device -// lambdas: See https://github.com/thrust/thrust/issues/1062 -template -bool all_of(Iterator begin, Iterator end, Predicate p) -{ - auto size = thrust::distance(begin, end); - return size == thrust::count_if(begin, end, p); -} - -template -bool any_of(Iterator begin, Iterator end, Predicate p) -{ - return thrust::count_if(begin, end, p) > 0; -} - -template -bool none_of(Iterator begin, Iterator end, Predicate p) -{ - return not all_of(begin, end, p); -} -} // namespace - - -TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", +TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", + "", ((typename T, dist_type Dist), T, Dist), - (int32_t, dist_type::UNIQUE), (int64_t, dist_type::UNIQUE), - (int32_t, dist_type::UNIFORM), (int64_t, dist_type::UNIFORM), - (int32_t, dist_type::GAUSSIAN), (int64_t, dist_type::GAUSSIAN)) + (int32_t, dist_type::UNIQUE), + (int64_t, dist_type::UNIQUE), + (int32_t, 
dist_type::UNIFORM), + (int64_t, dist_type::UNIFORM), + (int32_t, dist_type::GAUSSIAN), + (int64_t, dist_type::GAUSSIAN)) { using Key = T; using Value = T; @@ -91,25 +68,25 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", constexpr std::size_t num_keys{50'000'000}; cuco::dynamic_map map{30'000'000, -1, -1}; - std::vector h_keys( num_keys ); - std::vector h_values( num_keys ); - std::vector> h_pairs ( num_keys ); + std::vector h_keys(num_keys); + std::vector h_values(num_keys); + std::vector> h_pairs(num_keys); generate_keys(h_keys.begin(), h_keys.end()); - for(auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_values[i] = val; - h_pairs[i].first = key; + for (auto i = 0; i < num_keys; ++i) { + Key key = h_keys[i]; + Value val = h_keys[i]; + h_values[i] = val; + h_pairs[i].first = key; h_pairs[i].second = val; } - thrust::device_vector d_keys( h_keys ); - thrust::device_vector d_values( h_values ); - thrust::device_vector> d_pairs( h_pairs ); - thrust::device_vector d_results( num_keys ); - thrust::device_vector d_contained( num_keys ); + thrust::device_vector d_keys(h_keys); + thrust::device_vector d_values(h_values); + thrust::device_vector> d_pairs(h_pairs); + thrust::device_vector d_results(num_keys); + thrust::device_vector d_contained(num_keys); // bulk function test cases SECTION("All inserted keys-value pairs should be correctly recovered during find") @@ -118,8 +95,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", map.find(d_keys.begin(), d_keys.end(), d_results.begin()); auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), d_values.begin())); - REQUIRE(all_of(zip, zip + num_keys, - [] __device__(auto const& p) { + REQUIRE(all_of(zip, zip + num_keys, [] __device__(auto const& p) { return thrust::get<0>(p) == thrust::get<1>(p); })); } @@ -128,7 +104,8 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", { map.find(d_keys.begin(), d_keys.end(), d_results.begin()); - 
REQUIRE(all_of(d_results.begin(), d_results.end(), [] __device__(auto const& p) { return p == -1; })); + REQUIRE( + all_of(d_results.begin(), d_results.end(), [] __device__(auto const& p) { return p == -1; })); } SECTION("All inserted keys-value pairs should be contained") @@ -136,13 +113,15 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", map.insert(d_pairs.begin(), d_pairs.end()); map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); - REQUIRE(all_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); + REQUIRE( + all_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); } SECTION("Non-inserted keys-value pairs should not be contained") { map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); - REQUIRE(none_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); + REQUIRE( + none_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); } } \ No newline at end of file diff --git a/tests/static_map/static_map_test.cu b/tests/static_map/static_map_test.cu index b52f3f367..ea68981ba 100644 --- a/tests/static_map/static_map_test.cu +++ b/tests/static_map/static_map_test.cu @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include #include #include @@ -22,31 +21,7 @@ #include #include #include - -namespace { -namespace cg = cooperative_groups; - -// Thrust logical algorithms (any_of/all_of/none_of) don't work with device -// lambdas: See https://github.com/thrust/thrust/issues/1062 -template -bool all_of(Iterator begin, Iterator end, Predicate p) -{ - auto size = thrust::distance(begin, end); - return size == thrust::count_if(begin, end, p); -} - -template -bool any_of(Iterator begin, Iterator end, Predicate p) -{ - return thrust::count_if(begin, end, p) > 0; -} - -template -bool none_of(Iterator begin, Iterator end, Predicate p) -{ - return not all_of(begin, end, p); -} -} // namespace +#include enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; diff --git a/tests/static_reduction_map/static_reduction_map_test.cu b/tests/static_reduction_map/static_reduction_map_test.cu index bb57f0847..023cb7a7b 100644 --- a/tests/static_reduction_map/static_reduction_map_test.cu +++ b/tests/static_reduction_map/static_reduction_map_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,36 +14,14 @@ * limitations under the License. 
*/ -#include #include #include #include #include #include #include +#include -namespace { -// Thrust logical algorithms (any_of/all_of/none_of) don't work with device -// lambdas: See https://github.com/thrust/thrust/issues/1062 -template -bool all_of(Iterator begin, Iterator end, Predicate p) -{ - auto size = thrust::distance(begin, end); - return size == thrust::count_if(begin, end, p); -} - -template -bool any_of(Iterator begin, Iterator end, Predicate p) -{ - return thrust::count_if(begin, end, p) > 0; -} - -template -bool none_of(Iterator begin, Iterator end, Predicate p) -{ - return not all_of(begin, end, p); -} -} // namespace TEMPLATE_TEST_CASE_SIG("Insert all identical keys", "", diff --git a/tests/util.hpp b/tests/util.hpp new file mode 100644 index 000000000..bb10a7a1f --- /dev/null +++ b/tests/util.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace cg = cooperative_groups; + +// Thrust logical algorithms (any_of/all_of/none_of) don't work with device +// lambdas: See https://github.com/thrust/thrust/issues/1062 +template +bool all_of(Iterator begin, Iterator end, Predicate p) +{ + auto size = thrust::distance(begin, end); + return size == thrust::count_if(begin, end, p); +} + +template +bool any_of(Iterator begin, Iterator end, Predicate p) +{ + return thrust::count_if(begin, end, p) > 0; +} + +template +bool none_of(Iterator begin, Iterator end, Predicate p) +{ + return not all_of(begin, end, p); +} \ No newline at end of file From 53bfe278387c8088920cecd32a2d7950154642e0 Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Fri, 6 Aug 2021 17:22:31 -0700 Subject: [PATCH 51/69] Added tests for custom_op and shared memory hash table. --- .../static_reduction_map_test.cu | 62 +++++++++++++++++-- 1 file changed, 56 insertions(+), 6 deletions(-) diff --git a/tests/static_reduction_map/static_reduction_map_test.cu b/tests/static_reduction_map/static_reduction_map_test.cu index 023cb7a7b..51fee5740 100644 --- a/tests/static_reduction_map/static_reduction_map_test.cu +++ b/tests/static_reduction_map/static_reduction_map_test.cu @@ -16,23 +16,28 @@ #include #include +#include #include #include #include #include #include +// cuco::custom op functor that should give the same result as cuco::reduce_add +template +using custom_reduce_add = cuco::custom_op, 0>; TEMPLATE_TEST_CASE_SIG("Insert all identical keys", "", - ((typename Key, typename Value), Key, Value), - (int32_t, int32_t)) + ((typename Key, typename Value, typename Op), Key, Value, Op), + (int32_t, int32_t, cuco::reduce_add), + (int32_t, int32_t, custom_reduce_add)) { thrust::device_vector keys(100, 42); thrust::device_vector values(keys.size(), 1); auto const num_slots{keys.size() * 2}; - cuco::static_reduction_map, Key, Value> map{num_slots, -1}; + cuco::static_reduction_map map{num_slots, -1}; auto zip = 
thrust::make_zip_iterator(thrust::make_tuple(keys.begin(), values.begin())); auto zip_end = zip + keys.size(); @@ -62,12 +67,13 @@ TEMPLATE_TEST_CASE_SIG("Insert all identical keys", TEMPLATE_TEST_CASE_SIG("Insert all unique keys", "", - ((typename Key, typename Value), Key, Value), - (int32_t, int32_t)) + ((typename Key, typename Value, typename Op), Key, Value, Op), + (int32_t, int32_t, cuco::reduce_add), + (int32_t, int32_t, custom_reduce_add)) { constexpr std::size_t num_keys = 10000; constexpr std::size_t num_slots{num_keys * 2}; - cuco::static_reduction_map, Key, Value> map{num_slots, -1}; + cuco::static_reduction_map map{num_slots, -1}; auto keys_begin = thrust::make_counting_iterator(0); auto values_begin = thrust::make_counting_iterator(0); @@ -94,3 +100,47 @@ TEMPLATE_TEST_CASE_SIG("Insert all unique keys", REQUIRE(thrust::equal(thrust::device, values_begin, values_begin + num_keys, found.begin())); } } + +template +__global__ void static_reduction_map_shared_memory_kernel(bool* key_found) +{ + using Key = typename MapType::key_type; + using Value = typename MapType::mapped_type; + + namespace cg = cooperative_groups; + using mutable_view_type = typename MapType::device_mutable_view; + using view_type = typename MapType::device_view; + __shared__ typename mutable_view_type::slot_type slots[N]; + auto map = + mutable_view_type::make_from_uninitialized_slots(cg::this_thread_block(), &slots[0], N, -1); + + auto g = cg::this_thread_block(); + std::size_t index = threadIdx.x + blockIdx.x * blockDim.x; + int rank = g.thread_rank(); + + // insert {thread_rank, thread_rank} for each thread in thread-block + map.insert(cuco::pair(rank, rank)); + g.sync(); + + auto find_map = view_type(map); + auto retrieved_pair = find_map.find(rank); + if (retrieved_pair != find_map.end() && retrieved_pair->second == rank) { + key_found[index] = true; + } +} + +TEMPLATE_TEST_CASE_SIG("Shared memory hast table.", + "", + ((typename Key, typename Value, typename Op), Key, Value, 
Op), + (int32_t, int32_t, cuco::reduce_add), + (int32_t, int32_t, custom_reduce_add)) +{ + constexpr std::size_t N = 256; + thrust::device_vector key_found(N, false); + + static_reduction_map_shared_memory_kernel< + cuco::static_reduction_map, + N><<<8, 32>>>(key_found.data().get()); + + REQUIRE(all_of(key_found.begin(), key_found.end(), thrust::identity{})); +} From f4c703a502702d42540b882380385421e09a3af6 Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Sat, 7 Aug 2021 18:33:38 -0700 Subject: [PATCH 52/69] Added benchmark for static_reduction_map in shared memory. --- benchmarks/CMakeLists.txt | 1 + .../static_reduction_map_smem_bench.cu | 164 ++++++++++++------ 2 files changed, 113 insertions(+), 52 deletions(-) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index dcaa3ce22..d938959a2 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -81,6 +81,7 @@ function(ConfigureCUDABench BENCH_NAME BENCH_SRC) target_compile_options(${BENCH_NAME} PRIVATE --expt-extended-lambda --expt-relaxed-constexpr) target_link_libraries(${BENCH_NAME} PRIVATE cuda_benchmark + fmt pthread cuco) endfunction(ConfigureCUDABench) diff --git a/benchmarks/hash_table/static_reduction_map_smem_bench.cu b/benchmarks/hash_table/static_reduction_map_smem_bench.cu index 4cdbe80e7..147c89c92 100644 --- a/benchmarks/hash_table/static_reduction_map_smem_bench.cu +++ b/benchmarks/hash_table/static_reduction_map_smem_bench.cu @@ -15,84 +15,144 @@ */ #include -#include #include #include -#include +#include #include +#include "fmt/core.h" +#include "fmt/format.h" +template +std::string get_type_str(); + +template <> +std::string get_type_str() +{ + return "U32"; +} +template <> +std::string get_type_str() +{ + return "U64"; +} + +/** + * @brief Device-side benchmark for shared memory reduction hash table insert. 
+ * + * @tparam Key The hash table's key type + * @tparam Value The hash table's value/reduction type + * @param controller Benchmark controller/state handler + * @param bench_name Benchmark identifier + * @param num_elems_log2 Total number of key/value pairs to be inserted (log2) + * @param multiplicity_log2 Number of times each key occures in the input (log2) + * @param occupancy Target occupancy of the hash table after inserting all elements + */ template -void static_reduction_map_smem_insert_bench(cuda_benchmark::controller& controller, - std::size_t num_elems, - float occupancy, - dist_type dist, - std::size_t multiplicity = 8) +void static_reduction_map_smem_insert_bench(cuda_benchmark::controller &controller, + std::string const &bench_name, + std::uint32_t num_elems_log2, + std::uint32_t multiplicity_log2, + float occupancy) { using map_type = cuco::static_reduction_map, Key, Value>; using pair_type = typename map_type::value_type; - int dev_id; - cudaGetDevice(&dev_id); - struct cudaDeviceProp dev_props; - cudaGetDeviceProperties(&dev_props, dev_id); - std::size_t const max_smem = dev_props.sharedMemPerBlock; - std::size_t const max_capacity = max_smem / sizeof(pair_type); + auto const num_elems = 1UL << num_elems_log2; + auto const multiplicity = 1UL << multiplicity_log2; - std::vector h_keys_in(num_elems); - std::vector h_values_in(num_elems); + std::string full_bench_name = "INSERT " + bench_name + " key_type=" + get_type_str() + + " value_type=" + get_type_str() + + " num_elems=" + std::to_string(num_elems) + + " occupancy=" + fmt::format("{:.2f}", occupancy) + + " multiplicity=" + std::to_string(multiplicity); - if (not generate_keys(dist, h_keys_in.begin(), h_keys_in.end(), multiplicity)) { - std::cerr << "[ERROR] Invalid input distribution.\n"; - return; - } + static constexpr std::size_t max_smem_bytes = 49152; // 48 KB + static constexpr std::size_t max_capacity = max_smem_bytes / sizeof(pair_type); - // generate uniform random values - 
generate_keys("UNIFORM", h_values_in.begin(), h_values_in.end(), 1); - - // the size of the hash table under a given target occupancy depends on the - // number of unique keys in the input - std::size_t const unique = count_unique(h_keys_in.begin(), h_keys_in.end()); - std::size_t const capacity = std::ceil(SDIV(unique, occupancy)); + auto const elems_per_thread = num_elems / controller.get_block_size(); + auto const num_unique_keys = num_elems / multiplicity; + auto const capacity = std::ceil(num_unique_keys / occupancy); if (capacity > max_capacity) { - std::cerr << "[ERROR] Not enough shared memory available. (" << capacity * sizeof(pair_type) - << ">" << max_capacity * sizeof(pair_type) << " bytes)\n"; + std::cerr << "[ERROR] (" + full_bench_name + ") Not enough shared memory available. (" + << capacity * sizeof(pair_type) << ">" << max_capacity * sizeof(pair_type) + << " bytes)\n"; return; } - thrust::device_vector d_keys_in(h_keys_in); - thrust::device_vector d_values_in(h_values_in); + controller.benchmark(std::string{full_bench_name}, [=] __device__(cuda_benchmark::state & state) { + using map_type = typename cuco:: + static_reduction_map, Key, Value, cuda::thread_scope_block>; + using map_view_type = typename map_type::device_mutable_view; - controller.benchmark( - "static_reduction_map shared memory insert", - [=, keys_ptr = d_keys_in.data().get(), values_ptr = d_values_in.data().get()] __device__( - cuda_benchmark::state & state) { - using map_type = typename cuco::static_reduction_map, - Key, - Value, - cuda::thread_scope_block>; - using map_view_type = typename map_type::device_mutable_view; + __shared__ char sm_buffer[max_smem_bytes]; - __shared__ typename map_type::pair_atomic_type* slots; + auto g = cooperative_groups::this_thread_block(); + auto map = map_view_type::make_from_uninitialized_slots( + g, reinterpret_cast(&sm_buffer[0]), capacity, ~Key(0)); - auto g = cooperative_groups::this_thread_block(); - auto map = 
map_view_type::make_from_uninitialized_slots(g, slots, capacity, -1); - auto pair = pair_type(keys_ptr[g.thread_rank()], values_ptr[g.thread_rank()]); + g.sync(); - g.sync(); - - for (auto _ : state) { - map.insert(pair); - g.sync(); + for (auto _ : state) { + for (Key i = g.thread_rank(); i < num_elems; i += g.size()) { + map.insert(cuco::pair((i & (multiplicity - 1)), g.thread_rank())); } - }, - max_smem); + g.sync(); + } + state.set_operations_processed(state.max_iterations() * elems_per_thread); + }); } int main() { + int device_id{}; + cudaGetDevice(&device_id); + + cudaDeviceProp prop{}; + cudaGetDeviceProperties(&prop, device_id); + + int peak_clk{}; + cudaDeviceGetAttribute(&peak_clk, cudaDevAttrClockRate, device_id); + + // can be used to calculate throughput (ops/second) + std::cout << "GPU Clock Rate: " << std::to_string(peak_clk) << " KHz\n"; + + // start one CUDA block with 1024 threads cuda_benchmark::controller controller(1024, 1); - static_reduction_map_smem_insert_bench( - controller, 10'000, 0.8, dist_type::UNIFORM); + // unique keys; total number of keys fix; varying table occupancy + for (float occupancy = 0.5; occupancy < 1.0; occupancy += 0.1) { + static_reduction_map_smem_insert_bench( + controller, "OCCUPANCY", 10, 0, occupancy); + } + + // unique keys; total number of keys fix; varying table occupancy + for (float occupancy = 0.5; occupancy < 1.0; occupancy += 0.1) { + static_reduction_map_smem_insert_bench( + controller, "OCCUPANCY", 10, 0, occupancy); + } + + // total number of keys fix; occuoancy fix; varying key multiplicity + for (float multiplicity_log2 = 1; multiplicity_log2 < 7; ++multiplicity_log2) { + static_reduction_map_smem_insert_bench( + controller, "MULTIPLICITY", 12, multiplicity_log2, 0.8); + } + + // total number of keys fix; occuoancy fix; varying key multiplicity + for (float multiplicity_log2 = 1; multiplicity_log2 < 7; ++multiplicity_log2) { + static_reduction_map_smem_insert_bench( + controller, "MULTIPLICITY", 12, 
multiplicity_log2, 0.8); + } + + // occupancy fix; capacity fix; varying number of keys; varying key multiplicity + for (float i = 0; i < 7; ++i) { + static_reduction_map_smem_insert_bench( + controller, "EQUAL CAPACITY", 10 + i, 0 + i, 0.8); + } + + // occupancy fix; capacity fix; varying number of keys; varying key multiplicity + for (float i = 0; i < 7; ++i) { + static_reduction_map_smem_insert_bench( + controller, "EQUAL CAPACITY", 10 + i, 0 + i, 0.8); + } } \ No newline at end of file From ec76e8a5fb11d8d403c5747e3c6ad71005d36a59 Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Mon, 9 Aug 2021 21:02:30 +0000 Subject: [PATCH 53/69] Add example for shared memory hash table. --- examples/CMakeLists.txt | 5 +- .../shared_memory_example.cu | 87 +++++++++++++++++++ .../static_reduction_map_example.cu} | 0 3 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 examples/static_reduction_map/shared_memory_example.cu rename examples/{static_reduction_map.cu => static_reduction_map/static_reduction_map_example.cu} (100%) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 2e5967724..411de1a42 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -17,6 +17,9 @@ endfunction(ConfigureExample) ### Example sources ################################################################################## ################################################################################################### +# static_map ConfigureExample(STATIC_MAP_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/static_map_example.cu") -ConfigureExample(STATIC_REDUCTION_MAP_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_reduction_map.cu") +# static_reduction_map +ConfigureExample(STATIC_REDUCTION_MAP_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_reduction_map/static_reduction_map_example.cu") +ConfigureExample(STATIC_REDUCTION_MAP_SMEM_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_reduction_map/shared_memory_example.cu") diff --git 
a/examples/static_reduction_map/shared_memory_example.cu b/examples/static_reduction_map/shared_memory_example.cu new file mode 100644 index 000000000..b8c302acb --- /dev/null +++ b/examples/static_reduction_map/shared_memory_example.cu @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include + +#include + +template +__global__ void static_reduction_map_shared_memory_kernel(OutputIt key_found) +{ + using Key = typename MapType::key_type; + using Value = typename MapType::mapped_type; + + namespace cg = cooperative_groups; + // define a mutable view for insert operations + using mutable_view_type = typename MapType::device_mutable_view; + // define a immutable view for find/contains operations + using view_type = typename MapType::device_view; + + // hash table storage in shared memory + __shared__ typename mutable_view_type::slot_type slots[Capacity]; + + // construct the table from the provided array in shared memory + auto map = mutable_view_type::make_from_uninitialized_slots( + cg::this_thread_block(), &slots[0], Capacity, -1); + + auto g = cg::this_thread_block(); + std::size_t index = threadIdx.x + blockIdx.x * blockDim.x; + int rank = g.thread_rank(); + + // insert {thread_rank, thread_rank} for each thread in thread-block + map.insert(cuco::pair(rank, rank)); + g.sync(); + + auto find_map = view_type(map); + // check if all 
previously inserted keys are present in the table + key_found[index] = find_map.contains(rank); +} + +/** + * @brief Demonstrates usage of the static_reduction_map in shared memory. + * + * We make use of the device-side API to construct and query a + * static reduction map in SM-local shared memory. + * + */ +int main(void) +{ + using Key = int; + using Value = int; + + // define the capacity of the map + static constexpr int capacity = 2048; + + // define the hash table typewith block-local thread scope + using map_type = + cuco::static_reduction_map, Key, Value, cuda::thread_scope_block>; + + // allocate storage for the result + thrust::device_vector result(1024, false); + + static_reduction_map_shared_memory_kernel<<<1, 1024>>>(result.begin()); + + auto success = + thrust::all_of(thrust::device, result.begin(), result.end(), thrust::identity()); + + std::cout << "Success: " << std::boolalpha << success << std::endl; +} \ No newline at end of file diff --git a/examples/static_reduction_map.cu b/examples/static_reduction_map/static_reduction_map_example.cu similarity index 100% rename from examples/static_reduction_map.cu rename to examples/static_reduction_map/static_reduction_map_example.cu From 961e88bdb1f3cf1fbd5366e79be88a5cea435ceb Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Mon, 9 Aug 2021 21:04:30 +0000 Subject: [PATCH 54/69] Fix for static_reduction_map::contains. --- include/cuco/detail/bitwise_compare.cuh | 4 +++- include/cuco/detail/static_reduction_map.inl | 2 +- include/cuco/static_reduction_map.cuh | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/cuco/detail/bitwise_compare.cuh b/include/cuco/detail/bitwise_compare.cuh index d554ddba3..65561e631 100644 --- a/include/cuco/detail/bitwise_compare.cuh +++ b/include/cuco/detail/bitwise_compare.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + #include #include diff --git a/include/cuco/detail/static_reduction_map.inl b/include/cuco/detail/static_reduction_map.inl index 7ec7a676e..0c6ff8683 100644 --- a/include/cuco/detail/static_reduction_map.inl +++ b/include/cuco/detail/static_reduction_map.inl @@ -454,7 +454,7 @@ static_reduction_map::device_view::co while (true) { auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); - if (detail::bitwise_compare(existing_key, empty_key_sentinel_)) { return false; } + if (detail::bitwise_compare(existing_key, this->get_empty_key_sentinel())) { return false; } if (key_equal(existing_key, k)) { return true; } diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index a09f2a25a..07a16116c 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 5931c9f56d93df5502a00f81cefef89e7b4ac28e Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Mon, 9 Aug 2021 21:30:22 +0000 Subject: [PATCH 55/69] Extend parameter range for shared memory hash table benchmark. 
--- benchmarks/hash_table/static_reduction_map_smem_bench.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarks/hash_table/static_reduction_map_smem_bench.cu b/benchmarks/hash_table/static_reduction_map_smem_bench.cu index 147c89c92..4c18554f4 100644 --- a/benchmarks/hash_table/static_reduction_map_smem_bench.cu +++ b/benchmarks/hash_table/static_reduction_map_smem_bench.cu @@ -54,7 +54,8 @@ void static_reduction_map_smem_insert_bench(cuda_benchmark::controller &controll std::uint32_t multiplicity_log2, float occupancy) { - using map_type = cuco::static_reduction_map, Key, Value>; + using map_type = cuco:: + static_reduction_map, Key, Value, cuda::thread_scope_block>; using pair_type = typename map_type::value_type; auto const num_elems = 1UL << num_elems_log2; @@ -145,13 +146,13 @@ int main() } // occupancy fix; capacity fix; varying number of keys; varying key multiplicity - for (float i = 0; i < 7; ++i) { + for (float i = 0; i < 11; ++i) { static_reduction_map_smem_insert_bench( controller, "EQUAL CAPACITY", 10 + i, 0 + i, 0.8); } // occupancy fix; capacity fix; varying number of keys; varying key multiplicity - for (float i = 0; i < 7; ++i) { + for (float i = 0; i < 11; ++i) { static_reduction_map_smem_insert_bench( controller, "EQUAL CAPACITY", 10 + i, 0 + i, 0.8); } From 902b93a64c3c07fbc0108bdba56b2bbc6e1f9b55 Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Mon, 9 Aug 2021 23:43:22 +0000 Subject: [PATCH 56/69] Size computation using thrust::count_if. Asynchronous bulk operations. 
--- include/cuco/detail/static_reduction_map.inl | 47 ++++++++----------- .../detail/static_reduction_map_kernels.cuh | 32 ++----------- include/cuco/static_reduction_map.cuh | 26 +++++----- 3 files changed, 37 insertions(+), 68 deletions(-) diff --git a/include/cuco/detail/static_reduction_map.inl b/include/cuco/detail/static_reduction_map.inl index 0c6ff8683..a44544755 100644 --- a/include/cuco/detail/static_reduction_map.inl +++ b/include/cuco/detail/static_reduction_map.inl @@ -39,8 +39,7 @@ static_reduction_map::static_reductio empty_key_sentinel_{empty_key_sentinel}, empty_value_sentinel_{ReductionOp::identity}, op_{reduction_op}, - slot_allocator_{alloc}, - counter_allocator_{alloc} + slot_allocator_{alloc} { slots_ = std::allocator_traits::allocate(slot_allocator_, capacity_); @@ -79,29 +78,8 @@ void static_reduction_map::insert( auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); auto view = get_device_mutable_view(); - atomic_ctr_type *h_num_successes, *d_num_successes; - CUCO_CUDA_TRY(cudaMallocHost(&h_num_successes, sizeof(atomic_ctr_type))); - - auto tmp_counter_allocator = counter_allocator_; - d_num_successes = - std::allocator_traits::allocate(tmp_counter_allocator, 1); - - h_num_successes->store(static_cast(0), cuda::std::memory_order_relaxed); - CUCO_CUDA_TRY(cudaMemcpyAsync( - d_num_successes, h_num_successes, sizeof(atomic_ctr_type), cudaMemcpyHostToDevice, stream)); - - detail::insert<<>>( - first, first + num_keys, d_num_successes, view, hash, key_equal); - - CUCO_CUDA_TRY(cudaMemcpyAsync( - h_num_successes, d_num_successes, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); - CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); - - size_ += h_num_successes->load(cuda::std::memory_order_relaxed); - - CUCO_CUDA_TRY(cudaFreeHost(h_num_successes)); - std::allocator_traits::deallocate( - tmp_counter_allocator, d_num_successes, 1); + detail::insert + <<>>(first, first + num_keys, view, hash, key_equal); 
} template ::find(Input detail::find <<>>(first, last, output_begin, view, hash, key_equal); - CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); } namespace detail { @@ -196,7 +173,23 @@ void static_reduction_map::contains( detail::contains <<>>(first, last, output_begin, view, hash, key_equal); - CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); +} + +template +std::size_t static_reduction_map::get_size( + cudaStream_t stream) const noexcept +{ + // Convert pair_type to thrust::tuple to allow assigning to a zip iterator + auto begin = + thrust::make_transform_iterator(raw_slots_begin(), detail::slot_to_tuple{}); + auto end = begin + get_capacity(); + auto filled = detail::slot_is_filled{get_empty_key_sentinel()}; + + return thrust::count_if(thrust::cuda::par.on(stream), begin, end, filled); } template -__global__ void insert( - InputIt first, InputIt last, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) +__global__ void insert(InputIt first, InputIt last, viewT view, Hash hash, KeyEqual key_equal) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - std::size_t thread_num_successes = 0; - auto tid = block_size * blockIdx.x + threadIdx.x; auto it = first + tid; while (it < last) { typename viewT::value_type const insert_pair{*it}; - if (view.insert(insert_pair, hash, key_equal)) { thread_num_successes++; } + view.insert(insert_pair, hash, key_equal); it += gridDim.x * block_size; } - - // compute number of successfully inserted elements for each block - // and atomically add to the grand total - std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { *num_successes += block_num_successes; } } /** @@ -126,17 +115,11 @@ __global__ void insert( template -__global__ void insert( - InputIt first, InputIt last, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) +__global__ void insert(InputIt first, InputIt last, viewT view, Hash hash, 
KeyEqual key_equal) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - std::size_t thread_num_successes = 0; - auto tile = cg::tiled_partition(cg::this_thread_block()); auto tid = block_size * blockIdx.x + threadIdx.x; auto it = first + tid / tile_size; @@ -147,16 +130,9 @@ __global__ void insert( static_cast(thrust::get<0>(*it)), static_cast(thrust::get<1>(*it))}; - if (view.insert(tile, insert_pair, hash, key_equal) && tile.thread_rank() == 0) { - thread_num_successes++; - } + view.insert(tile, insert_pair, hash, key_equal); it += (gridDim.x * block_size) / tile_size; } - - // compute number of successfully inserted elements for each block - // and atomically add to the grand total - std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { *num_successes += block_num_successes; } } /** diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index 07a16116c..dfb44a9a7 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -271,12 +271,9 @@ class static_reduction_map { using atomic_mapped_type = cuda::atomic; using pair_atomic_type = cuco::pair_type; using slot_type = pair_atomic_type; - using atomic_ctr_type = cuda::atomic; using allocator_type = Allocator; using slot_allocator_type = typename std::allocator_traits::rebind_alloc; - using counter_allocator_type = - typename std::allocator_traits::rebind_alloc; #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700) static_assert(atomic_key_type::is_always_lock_free, @@ -1101,16 +1098,21 @@ class static_reduction_map { /** * @brief Gets the number of elements in the hash map. 
* + * @param stream CUDA stream this operation is issued in (synchronizes with host) * @return The number of elements in the map */ - std::size_t get_size() const noexcept { return size_; } + std::size_t get_size(cudaStream_t stream = 0) const noexcept; /** * @brief Gets the load factor of the hash map. * + * @param stream CUDA stream this operation is issued in (synchronizes with host) * @return The load factor of the hash map */ - float get_load_factor() const noexcept { return static_cast(size_) / capacity_; } + float get_load_factor(cudaStream_t stream = 0) const noexcept + { + return static_cast(get_size(stream)) / capacity_; + } /** * @brief Gets the sentinel value used to represent an empty key slot. @@ -1162,14 +1164,12 @@ class static_reduction_map { value_type const* raw_slots_end() const noexcept { return raw_slots_begin() + get_capacity(); } - pair_atomic_type* slots_{nullptr}; ///< Pointer to flat slots storage - std::size_t capacity_{}; ///< Total number of slots - std::size_t size_{}; ///< Number of keys in map - Key empty_key_sentinel_{}; ///< Key value that represents an empty slot - Value empty_value_sentinel_{}; ///< Initial value of empty slot - ReductionOp op_{}; ///< Binary operation reduction function object - slot_allocator_type slot_allocator_{}; ///< Allocator used to allocate slots - counter_allocator_type counter_allocator_{}; ///< Allocator used to allocate counters + pair_atomic_type* slots_{nullptr}; ///< Pointer to flat slots storage + std::size_t capacity_{}; ///< Total number of slots + Key empty_key_sentinel_{}; ///< Key value that represents an empty slot + Value empty_value_sentinel_{}; ///< Initial value of empty slot + ReductionOp op_{}; ///< Binary operation reduction function object + slot_allocator_type slot_allocator_{}; ///< Allocator used to allocate slots }; } // namespace cuco From d440243842e0858f7d8f4ea0a094144e4f28a7ca Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Mon, 9 Aug 2021 18:21:15 -0700 Subject: [PATCH 
57/69] Add throughput column to nvbench benchmarks. --- benchmarks/hash_table/static_reduction_map_bench.cu | 2 ++ benchmarks/hash_table/static_reduction_map_param_grid_search.cu | 2 ++ benchmarks/reduce_by_key/cub_reduce_by_key_bench.cu | 2 ++ benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu | 2 ++ benchmarks/reduce_by_key/thrust_reduce_by_key_bench.cu | 2 ++ 5 files changed, 10 insertions(+) diff --git a/benchmarks/hash_table/static_reduction_map_bench.cu b/benchmarks/hash_table/static_reduction_map_bench.cu index 863477a7c..2de5fa585 100644 --- a/benchmarks/hash_table/static_reduction_map_bench.cu +++ b/benchmarks/hash_table/static_reduction_map_bench.cu @@ -117,6 +117,8 @@ void nvbench_cuco_static_reduction_map_insert( thrust::make_zip_iterator(thrust::make_tuple(d_keys_in.begin(), d_values_in.begin())); auto d_pairs_in_end = d_pairs_in_begin + num_elems; + state.add_element_count(num_elems); + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { map_type map{capacity, -1}; diff --git a/benchmarks/hash_table/static_reduction_map_param_grid_search.cu b/benchmarks/hash_table/static_reduction_map_param_grid_search.cu index 27bcbf38d..4063d7c73 100644 --- a/benchmarks/hash_table/static_reduction_map_param_grid_search.cu +++ b/benchmarks/hash_table/static_reduction_map_param_grid_search.cu @@ -70,6 +70,8 @@ void nvbench_cuco_static_reduction_map_custom_op_backoff_delay( thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); auto d_pairs_end = d_pairs_begin + num_elems; + state.add_element_count(num_elems); + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { map_type map{capacity, -1}; diff --git a/benchmarks/reduce_by_key/cub_reduce_by_key_bench.cu b/benchmarks/reduce_by_key/cub_reduce_by_key_bench.cu index efbe7799d..5bf48e4af 100644 --- a/benchmarks/reduce_by_key/cub_reduce_by_key_bench.cu +++ 
b/benchmarks/reduce_by_key/cub_reduce_by_key_bench.cu @@ -72,6 +72,8 @@ void nvbench_cub_reduce_by_key(nvbench::state& state, nvbench::type_list d_temp(std::max(temp_bytes_sort, temp_bytes_reduce)); + state.add_element_count(num_elems_in); + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { timer.start(); diff --git a/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu b/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu index 96d5ee2b3..3c23330b9 100644 --- a/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu +++ b/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu @@ -105,6 +105,8 @@ void nvbench_cuco_static_reduction_map_reduce_by_key( thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); auto d_pairs_end = d_pairs_begin + num_elems; + state.add_element_count(num_elems); + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { map_type map{capacity, -1}; diff --git a/benchmarks/reduce_by_key/thrust_reduce_by_key_bench.cu b/benchmarks/reduce_by_key/thrust_reduce_by_key_bench.cu index 8cc5ef3cc..4069c3b1d 100644 --- a/benchmarks/reduce_by_key/thrust_reduce_by_key_bench.cu +++ b/benchmarks/reduce_by_key/thrust_reduce_by_key_bench.cu @@ -69,6 +69,8 @@ void nvbench_thrust_reduce_by_key(nvbench::state& state, nvbench::type_list d_keys(h_keys); thrust::device_vector d_values(h_values); + state.add_element_count(num_elems); + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { timer.start(); From 4339b2bd84c851b33845174bd174957377cb9d8e Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Tue, 24 Aug 2021 23:28:04 +0000 Subject: [PATCH 58/69] Fix for reductions over FP types. 
--- include/cuco/detail/reduction_ops.cuh | 229 ++++++++++++++++++ include/cuco/static_reduction_map.cuh | 111 +-------- .../static_reduction_map_test.cu | 8 +- 3 files changed, 236 insertions(+), 112 deletions(-) create mode 100644 include/cuco/detail/reduction_ops.cuh diff --git a/include/cuco/detail/reduction_ops.cuh b/include/cuco/detail/reduction_ops.cuh new file mode 100644 index 000000000..c8496470e --- /dev/null +++ b/include/cuco/detail/reduction_ops.cuh @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace cuco { + +/** + * @brief `+` reduction functor that internally uses an atomic fetch-and-add + * operation. 
+ * + * @tparam T The data type used for reduction + */ +template +struct reduce_add { + using value_type = T; + static constexpr T identity = 0; + + template + __device__ T apply(cuda::atomic& slot, T2 const& value) const + { + return slot.fetch_add(value, cuda::memory_order_relaxed); + } +}; + +// remove this workaround once libcu++ extends FP atomics support +// https://github.com/NVIDIA/libcudacxx/issues/104 +template <> +struct reduce_add { + using value_type = float; + static constexpr float identity = 0; + + template + __device__ float apply(cuda::atomic& slot, T2 const& value) const + { + return atomicAdd(reinterpret_cast(&slot), value); + } +}; + +template <> +struct reduce_add { + using value_type = double; + static constexpr double identity = 0; + + template + __device__ double apply(cuda::atomic& slot, T2 const& value) const + { + return atomicAdd(reinterpret_cast(&slot), value); + } +}; + +/** + * @brief `-` reduction functor that internally uses an atomic fetch-and-add + * operation. + * + * @tparam T The data type used for reduction + */ +template +struct reduce_sub { + using value_type = T; + static constexpr T identity = 0; + + template + __device__ T apply(cuda::atomic& slot, T2 const& value) const + { + return slot.fetch_sub(value, cuda::memory_order_relaxed); + } +}; + +template <> +struct reduce_sub { + using value_type = float; + static constexpr float identity = 0; + + template + __device__ float apply(cuda::atomic& slot, T2 const& value) const + { + return atomicSub(reinterpret_cast(&slot), value); + } +}; + +template <> +struct reduce_sub { + using value_type = double; + static constexpr double identity = 0; + + template + __device__ double apply(cuda::atomic& slot, T2 const& value) const + { + return atomicSub(reinterpret_cast(&slot), value); + } +}; + +/** + * @brief `min` reduction functor that internally uses an atomic fetch-and-add + * operation. 
+ * + * @tparam T The data type used for reduction + */ +template +struct reduce_min { + using value_type = T; + static constexpr T identity = std::numeric_limits::max(); + + template + __device__ T apply(cuda::atomic& slot, T2 const& value) const + { + return slot.fetch_min(value, cuda::memory_order_relaxed); + } +}; + +template <> +struct reduce_min { + using value_type = float; + static constexpr float identity = std::numeric_limits::max(); + + template + __device__ float apply(cuda::atomic& slot, T2 const& value) const + { + return atomicMin(reinterpret_cast(&slot), value); + } +}; + +template <> +struct reduce_min { + using value_type = double; + static constexpr double identity = std::numeric_limits::max(); + + template + __device__ double apply(cuda::atomic& slot, T2 const& value) const + { + return atomicMin(reinterpret_cast(&slot), value); + } +}; + +/** + * @brief `max` reduction functor that internally uses an atomic fetch-and-add + * operation. + * + * @tparam T The data type used for reduction + */ +template +struct reduce_max { + using value_type = T; + static constexpr T identity = std::numeric_limits::lowest(); + + template + __device__ T apply(cuda::atomic& slot, T2 const& value) const + { + return slot.fetch_max(value, cuda::memory_order_relaxed); + } +}; + +template <> +struct reduce_max { + using value_type = float; + static constexpr float identity = std::numeric_limits::lowest(); + + template + __device__ float apply(cuda::atomic& slot, T2 const& value) const + { + return atomicMax(reinterpret_cast(&slot), value); + } +}; + +template <> +struct reduce_max { + using value_type = double; + static constexpr double identity = std::numeric_limits::lowest(); + + template + __device__ double apply(cuda::atomic& slot, T2 const& value) const + { + return atomicMax(reinterpret_cast(&slot), value); + } +}; + +/** + * @brief Wrapper for a user-defined custom reduction operator. + * @brief Internally uses an atomic compare-and-swap loop. 
+ * + * @tparam T The data type used for reduction + * @tparam Identity Neutral element under the given reduction group + * @tparam Op Commutative and associative binary operator + */ +template +struct custom_op { + using value_type = T; + static constexpr T identity = Identity; + + Op op; + + template + __device__ T apply(cuda::atomic& slot, T2 const& value) const + { + [[maybe_unused]] unsigned ns = BackoffBaseDelay; + + auto old = slot.load(cuda::memory_order_relaxed); + while (not slot.compare_exchange_strong(old, op(old, value), cuda::memory_order_relaxed)) { +#if __CUDA_ARCH__ >= 700 + // exponential backoff strategy to reduce atomic contention + if (true) { + asm volatile("nanosleep.u32 %0;" ::"r"((unsigned)ns) :); + if (ns < BackoffMaxDelay) { ns *= 2; } + } +#endif + } + return old; + } +}; + +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index dfb44a9a7..3cc679aff 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -42,121 +42,12 @@ #include #include #include +#include #include #include namespace cuco { -/** - * @brief `+` reduction functor that internally uses an atomic fetch-and-add - * operation. - * - * @tparam T The data type used for reduction - */ -template -struct reduce_add { - using value_type = T; - static constexpr T identity = 0; - - template - __device__ T apply(cuda::atomic& slot, T2 const& value) const - { - return slot.fetch_add(value, cuda::memory_order_relaxed); - } -}; - -/** - * @brief `-` reduction functor that internally uses an atomic fetch-and-add - * operation. 
- * - * @tparam T The data type used for reduction - */ -template -struct reduce_sub { - using value_type = T; - static constexpr T identity = 0; - - template - __device__ T apply(cuda::atomic& slot, T2 const& value) const - { - return slot.fetch_sub(value, cuda::memory_order_relaxed); - } -}; - -/** - * @brief `min` reduction functor that internally uses an atomic fetch-and-add - * operation. - * - * @tparam T The data type used for reduction - */ -template -struct reduce_min { - using value_type = T; - static constexpr T identity = std::numeric_limits::max(); - - template - __device__ T apply(cuda::atomic& slot, T2 const& value) const - { - return slot.fetch_min(value, cuda::memory_order_relaxed); - } -}; - -/** - * @brief `max` reduction functor that internally uses an atomic fetch-and-add - * operation. - * - * @tparam T The data type used for reduction - */ -template -struct reduce_max { - using value_type = T; - static constexpr T identity = std::numeric_limits::lowest(); - - template - __device__ T apply(cuda::atomic& slot, T2 const& value) const - { - return slot.fetch_max(value, cuda::memory_order_relaxed); - } -}; - -/** - * @brief Wrapper for a user-defined custom reduction operator. - * @brief Internally uses an atomic compare-and-swap loop. 
- * - * @tparam T The data type used for reduction - * @tparam Identity Neutral element under the given reduction group - * @tparam Op Commutative and associative binary operator - */ -template -struct custom_op { - using value_type = T; - static constexpr T identity = Identity; - - Op op; - - template - __device__ T apply(cuda::atomic& slot, T2 const& value) const - { - [[maybe_unused]] unsigned ns = BackoffBaseDelay; - - auto old = slot.load(cuda::memory_order_relaxed); - while (not slot.compare_exchange_strong(old, op(old, value), cuda::memory_order_relaxed)) { -#if __CUDA_ARCH__ >= 700 - // exponential backoff strategy to reduce atomic contention - if (true) { - asm volatile("nanosleep.u32 %0;" ::"r"((unsigned)ns) :); - if (ns < BackoffMaxDelay) { ns *= 2; } - } -#endif - } - return old; - } -}; - /** * @brief A GPU-accelerated, unordered, associative container of key-value * pairs that reduces the values associated to the same key according to a diff --git a/tests/static_reduction_map/static_reduction_map_test.cu b/tests/static_reduction_map/static_reduction_map_test.cu index 51fee5740..f1540e041 100644 --- a/tests/static_reduction_map/static_reduction_map_test.cu +++ b/tests/static_reduction_map/static_reduction_map_test.cu @@ -31,7 +31,9 @@ TEMPLATE_TEST_CASE_SIG("Insert all identical keys", "", ((typename Key, typename Value, typename Op), Key, Value, Op), (int32_t, int32_t, cuco::reduce_add), - (int32_t, int32_t, custom_reduce_add)) + (int32_t, int32_t, custom_reduce_add), + (int32_t, float, cuco::reduce_add), + (int64_t, double, cuco::reduce_add)) { thrust::device_vector keys(100, 42); thrust::device_vector values(keys.size(), 1); @@ -133,7 +135,9 @@ TEMPLATE_TEST_CASE_SIG("Shared memory hast table.", "", ((typename Key, typename Value, typename Op), Key, Value, Op), (int32_t, int32_t, cuco::reduce_add), - (int32_t, int32_t, custom_reduce_add)) + (int32_t, int32_t, custom_reduce_add), + (int32_t, float, cuco::reduce_add), + (int64_t, double, 
cuco::reduce_add)) { constexpr std::size_t N = 256; thrust::device_vector key_found(N, false); From 6293117a6ea406319cbb325fabe690473425507c Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Wed, 13 Oct 2021 16:43:19 -0500 Subject: [PATCH 59/69] Add support for both static/dynamic extent of device_view. --- .../hash_table/static_reduction_map_bench.cu | 144 +++++++++++++-- .../static_reduction_map_smem_bench.cu | 8 +- .../shared_memory_example.cu | 5 +- include/cuco/detail/static_reduction_map.inl | 32 ++-- include/cuco/static_reduction_map.cuh | 164 ++++++++++++------ .../static_reduction_map_test.cu | 8 +- 6 files changed, 275 insertions(+), 86 deletions(-) diff --git a/benchmarks/hash_table/static_reduction_map_bench.cu b/benchmarks/hash_table/static_reduction_map_bench.cu index 2de5fa585..917200290 100644 --- a/benchmarks/hash_table/static_reduction_map_bench.cu +++ b/benchmarks/hash_table/static_reduction_map_bench.cu @@ -20,6 +20,9 @@ #include #include #include +#include + +namespace cg = cooperative_groups; /** * @brief Enum representation for reduction operators @@ -77,9 +80,134 @@ struct op_type_map { using type = cuco::custom_op, 0>; }; -/** - * @brief A benchmark evaluating insert performance. - */ +enum class Extent { + DYNAMIC, STATIC +}; + +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + // Enum type: + Extent, + // Callable to generate input strings: + // Short identifier used for tables, command-line args, etc. + // Used when context is available to figure out the enum type. + [](Extent e) { + switch (e) { + case Extent::DYNAMIC: return "DYNAMIC"; + case Extent::STATIC: return "STATIC"; + default: return "ERROR"; + } + }, + // Callable to generate descriptions: + // If non-empty, these are used in `--list` to describe values. + // Used when context may not be available to figure out the type from the + // input string. + // Just use `[](auto) { return std::string{}; }` if you don't want these. 
+ [](auto) { return std::string{}; }) + +struct always_false{ + always_false() = default; + + __host__ __device__ + operator bool(){ + return b; + } + +private: + bool b{false}; +}; + +template +__global__ +void dynamic_shmem_insert_kernel(std::size_t num_keys, std::size_t multiplicity, std::size_t capacity, always_false pred, bool* do_not_use){ + + using Map = typename cuco::static_reduction_map::device_mutable_view; + + #pragma diag_suppress static_var_with_dynamic_init + extern __shared__ typename Map::slot_type slots[]; + + auto map = Map::make_from_uninitialized_slots(cg::this_thread_block(), slots, capacity, -1); + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + + bool result; + while(tid < num_keys){ + for(int i = 0; i < multiplicity; ++i){ + result = map.insert(cuco::pair{tid, i}); + } + tid += blockDim.x; + } + + // Placeholder predicated store to inject artificial side-effects and keep compiler from discarding + // the code above + if(pred){ + *do_not_use = result; + } +} + +template +__global__ +void static_shmem_insert_kernel(std::size_t num_keys, std::size_t multiplicity, always_false pred, bool* do_not_use){ + + using Map = typename cuco::static_reduction_map::device_mutable_view; + + #pragma diag_suppress static_var_with_dynamic_init + __shared__ typename Map::slot_type slots[Capacity]; + + auto map = Map::make_from_uninitialized_slots(cg::this_thread_block(), slots, -1); + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + + bool result; + while(tid < num_keys){ + for(int i = 0; i < multiplicity; ++i){ + result = map.insert(cuco::pair{tid, i}); + } + tid += blockDim.x; + } + + // Placeholder predicated store to inject artificial side-effects and keep compiler from discarding + // the code above +} + +template +void static_shmem(nvbench::state& state, + nvbench::type_list, nvbench::enum_type, nvbench::enum_type>) +{ + using OpType = typename op_type_map::type; + + auto const occupancy = state.get_float64("Occupancy"); + auto const num_keys = 
static_cast(std::floor(Capacity * occupancy)); + auto const multiplicity = 1; + + if(num_keys > Capacity){ + throw; + } + + state.exec([&](nvbench::launch& launch){ + if constexpr(E == Extent::STATIC){ + static_shmem_insert_kernel<<<512, 1024, 0, launch.get_stream()>>>(num_keys, multiplicity, always_false{}, (bool*)nullptr); + } else { + using slot_type = typename cuco::static_reduction_map::device_mutable_view<>::slot_type; + dynamic_shmem_insert_kernel<<<512, 1024, Capacity * sizeof(slot_type), launch.get_stream()>>>(num_keys, multiplicity, Capacity, always_false{}, (bool*)nullptr); + } + }); +} + + + + +// type parameter dimensions for benchmark +using key_type_range = nvbench::type_list; +using value_type_range = nvbench::type_list; +using op_type_range = nvbench::enum_type_list; +using capacity_range = nvbench::enum_type_list<6000>; +using extent_options = nvbench::enum_type_list; + +NVBENCH_BENCH_TYPES(static_shmem, + NVBENCH_TYPE_AXES(key_type_range, value_type_range, op_type_range, capacity_range, extent_options)) + .set_name("Insert Static vs Dynamic Extent") + .set_type_axes_names({"Key", "Value", "ReductionOp", "Capacity", "Extent"}) + .add_float64_axis("Occupancy", nvbench::range(0.5, 0.9, 0.1)); + + template void nvbench_cuco_static_reduction_map_insert( nvbench::state& state, nvbench::type_list>) @@ -113,8 +241,7 @@ void nvbench_cuco_static_reduction_map_insert( thrust::device_vector d_keys_in(h_keys_in); thrust::device_vector d_values_in(h_values_in); - auto d_pairs_in_begin = - thrust::make_zip_iterator(thrust::make_tuple(d_keys_in.begin(), d_values_in.begin())); + auto d_pairs_in_begin = thrust::make_zip_iterator(thrust::make_tuple(d_keys_in.begin(), d_values_in.begin())); auto d_pairs_in_end = d_pairs_in_begin + num_elems; state.add_element_count(num_elems); @@ -129,11 +256,6 @@ void nvbench_cuco_static_reduction_map_insert( }); } -// type parameter dimensions for benchmark -using key_type_range = nvbench::type_list; -using value_type_range = 
nvbench::type_list; -using op_type_range = - nvbench::enum_type_list; // benchmark setups @@ -169,4 +291,4 @@ NVBENCH_BENCH_TYPES(nvbench_cuco_static_reduction_map_insert, .add_float64_axis("Occupancy", nvbench::range(0.5, 0.9, 0.1)) .add_int64_axis("Multiplicity", {1, 10, 100, 1'000, 10'000, 100'000, 1'000'000}) // key multiplicity range - .add_string_axis("Distribution", {"UNIFORM"}); \ No newline at end of file + .add_string_axis("Distribution", {"UNIFORM"}); diff --git a/benchmarks/hash_table/static_reduction_map_smem_bench.cu b/benchmarks/hash_table/static_reduction_map_smem_bench.cu index 4c18554f4..3876b016d 100644 --- a/benchmarks/hash_table/static_reduction_map_smem_bench.cu +++ b/benchmarks/hash_table/static_reduction_map_smem_bench.cu @@ -82,15 +82,13 @@ void static_reduction_map_smem_insert_bench(cuda_benchmark::controller &controll } controller.benchmark(std::string{full_bench_name}, [=] __device__(cuda_benchmark::state & state) { - using map_type = typename cuco:: - static_reduction_map, Key, Value, cuda::thread_scope_block>; - using map_view_type = typename map_type::device_mutable_view; + using map_type = typename cuco::static_reduction_map, Key, Value, cuda::thread_scope_block>; + using map_view_type = typename map_type::device_mutable_view<>; __shared__ char sm_buffer[max_smem_bytes]; auto g = cooperative_groups::this_thread_block(); - auto map = map_view_type::make_from_uninitialized_slots( - g, reinterpret_cast(&sm_buffer[0]), capacity, ~Key(0)); + auto map = map_view_type::make_from_uninitialized_slots(g, reinterpret_cast(&sm_buffer[0]), capacity, ~Key(0)); g.sync(); diff --git a/examples/static_reduction_map/shared_memory_example.cu b/examples/static_reduction_map/shared_memory_example.cu index b8c302acb..e57cd75e4 100644 --- a/examples/static_reduction_map/shared_memory_example.cu +++ b/examples/static_reduction_map/shared_memory_example.cu @@ -32,11 +32,12 @@ __global__ void static_reduction_map_shared_memory_kernel(OutputIt key_found) 
namespace cg = cooperative_groups; // define a mutable view for insert operations - using mutable_view_type = typename MapType::device_mutable_view; + using mutable_view_type = typename MapType::device_mutable_view<>; // define a immutable view for find/contains operations - using view_type = typename MapType::device_view; + using view_type = typename MapType::device_view<>; // hash table storage in shared memory + #pragma diag_suppress static_var_with_dynamic_init __shared__ typename mutable_view_type::slot_type slots[Capacity]; // construct the table from the provided array in shared memory diff --git a/include/cuco/detail/static_reduction_map.inl b/include/cuco/detail/static_reduction_map.inl index a44544755..eb0a211b4 100644 --- a/include/cuco/detail/static_reduction_map.inl +++ b/include/cuco/detail/static_reduction_map.inl @@ -197,9 +197,10 @@ template +template template __device__ bool -static_reduction_map::device_mutable_view::insert( +static_reduction_map::device_mutable_view::insert( value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept { auto current_slot{initial_slot(insert_pair.first, hash)}; @@ -232,9 +233,10 @@ template +template template __device__ bool -static_reduction_map::device_mutable_view::insert( +static_reduction_map::device_mutable_view::insert( CG const& g, value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept { auto current_slot = initial_slot(g, insert_pair.first, hash); @@ -299,10 +301,11 @@ template +template template __device__ - typename static_reduction_map::device_view::iterator - static_reduction_map::device_view::find( + typename static_reduction_map::device_view::iterator + static_reduction_map::device_view::find( Key const& k, Hash hash, KeyEqual key_equal) noexcept { auto current_slot = initial_slot(k, hash); @@ -326,10 +329,11 @@ template +template template -__device__ typename static_reduction_map::device_view:: +__device__ typename static_reduction_map::device_view:: const_iterator - 
static_reduction_map::device_view::find( + static_reduction_map::device_view::find( Key const& k, Hash hash, KeyEqual key_equal) const noexcept { auto current_slot = initial_slot(k, hash); @@ -353,10 +357,11 @@ template +template template __device__ - typename static_reduction_map::device_view::iterator - static_reduction_map::device_view::find( + typename static_reduction_map::device_view::iterator + static_reduction_map::device_view::find( CG const& g, Key const& k, Hash hash, KeyEqual key_equal) noexcept { auto current_slot = initial_slot(g, k, hash); @@ -394,10 +399,11 @@ template +template template -__device__ typename static_reduction_map::device_view:: +__device__ typename static_reduction_map::device_view:: const_iterator - static_reduction_map::device_view::find( + static_reduction_map::device_view::find( CG const& g, Key const& k, Hash hash, KeyEqual key_equal) const noexcept { auto current_slot = initial_slot(g, k, hash); @@ -437,9 +443,10 @@ template +template template __device__ bool -static_reduction_map::device_view::contains( +static_reduction_map::device_view::contains( Key const& k, Hash hash, KeyEqual key_equal) noexcept { auto current_slot = initial_slot(k, hash); @@ -460,9 +467,10 @@ template +template template __device__ bool -static_reduction_map::device_view::contains( +static_reduction_map::device_view::contains( CG const& g, Key const& k, Hash hash, KeyEqual key_equal) noexcept { auto current_slot = initial_slot(g, k, hash); diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index 3cc679aff..e12477036 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -48,6 +48,8 @@ namespace cuco { + static constexpr std::size_t dynamic_extent = std::numeric_limits::max(); + /** * @brief A GPU-accelerated, unordered, associative container of key-value * pairs that reduces the values associated to the same key according to a @@ -313,6 +315,26 @@ class 
static_reduction_map { KeyEqual key_equal = KeyEqual{}); private: + + + + template + struct slot_storage{ + slot_storage() = delete; + constexpr explicit slot_storage(Slot* p, std::size_t) noexcept : ptr{p} {} + Slot* ptr; + static constexpr std::size_t size = Extent; + }; + + template + struct slot_storage{ + slot_storage() = delete; + constexpr slot_storage(Slot* p, std::size_t n) noexcept : ptr{p}, size{n} {} + Slot* ptr; + std::size_t size; + }; + + template class device_view_base { protected: // Import member type definitions from `static_reduction_map` @@ -323,9 +345,10 @@ class static_reduction_map { using const_iterator = pair_atomic_type const*; using slot_type = slot_type; + static constexpr std::size_t extent = Extent; + private: - pair_atomic_type* slots_{}; ///< Pointer to flat slots storage - std::size_t capacity_{}; ///< Total number of slots + slot_storage storage_; Key empty_key_sentinel_{}; ///< Key value that represents an empty slot Value empty_value_sentinel_{}; ///< Initial Value of empty slot ReductionOp op_{}; ///< Binary operation reduction function object @@ -335,14 +358,23 @@ class static_reduction_map { std::size_t capacity, Key empty_key_sentinel, ReductionOp reduction_op) noexcept - : slots_{slots}, - capacity_{capacity}, + : storage_{slots, capacity}, empty_key_sentinel_{empty_key_sentinel}, empty_value_sentinel_{ReductionOp::identity}, op_{reduction_op} { + assert(extent == dynamic_extent or capacity == extent); } + + template* = nullptr> + __host__ __device__ + constexpr device_view_base(slot_type (&arr)[N], Key empty_key_sentinel, ReductionOp op) + : storage_{arr, N}, empty_key_sentinel_{empty_key_sentinel}, + empty_value_sentinel_{ReductionOp::identity}, op_{op} {} + + /** * @brief Returns the initial slot for a given key `k` * @@ -352,9 +384,10 @@ class static_reduction_map { * @return Pointer to the initial slot for `k` */ template - __device__ iterator initial_slot(Key const& k, Hash hash) noexcept + __device__ + constexpr 
iterator initial_slot(Key const& k, Hash hash) noexcept { - return &slots_[hash(k) % capacity_]; + return begin_slot() + (hash(k) % get_capacity()); } /** @@ -366,9 +399,10 @@ class static_reduction_map { * @return Pointer to the initial slot for `k` */ template - __device__ const_iterator initial_slot(Key const& k, Hash hash) const noexcept + __device__ + constexpr const_iterator initial_slot(Key const& k, Hash hash) const noexcept { - return &slots_[hash(k) % capacity_]; + return begin_slot() + (hash(k) % get_capacity()); } /** @@ -384,9 +418,10 @@ class static_reduction_map { * @return Pointer to the initial slot for `k` */ template - __device__ iterator initial_slot(CG const& g, Key const& k, Hash hash) noexcept + __device__ + constexpr iterator initial_slot(CG const& g, Key const& k, Hash hash) noexcept { - return &slots_[(hash(k) + g.thread_rank()) % capacity_]; + return begin_slot() + (hash(k) + g.thread_rank()) % get_capacity(); } /** @@ -404,7 +439,7 @@ class static_reduction_map { template __device__ const_iterator initial_slot(CG const& g, Key const& k, Hash hash) const noexcept { - return &slots_[(hash(k) + g.thread_rank()) % capacity_]; + return begin_slot() + (hash(k) + g.thread_rank()) % get_capacity(); } /** @@ -415,7 +450,9 @@ class static_reduction_map { * @param s The slot to advance * @return The next slot after `s` */ - __device__ iterator next_slot(iterator s) noexcept { return (++s < end()) ? s : begin_slot(); } + __device__ iterator next_slot(iterator s) noexcept { + return (++s < end()) ? s : begin_slot(); + } /** * @brief Given a slot `s`, returns the next slot. 
@@ -444,8 +481,8 @@ class static_reduction_map { template __device__ iterator next_slot(CG const& g, iterator s) noexcept { - uint32_t index = s - slots_; - return &slots_[(index + g.size()) % capacity_]; + auto const index = thrust::distance(begin_slot(), s); + return begin_slot() + (index + g.size()) % get_capacity(); } /** @@ -462,8 +499,8 @@ class static_reduction_map { template __device__ const_iterator next_slot(CG const& g, const_iterator s) const noexcept { - uint32_t index = s - slots_; - return &slots_[(index + g.size()) % capacity_]; + auto const index = thrust::distance(begin_slot(), s); + return begin_slot() + (index + g.size()) % get_capacity(); } /** @@ -481,8 +518,7 @@ class static_reduction_map { */ template - __device__ static void initialize_slots( - CG g, pair_atomic_type* slots, std::size_t num_slots, Key k, Value v) + __device__ static void initialize_slots(CG g, slot_type* slots, std::size_t num_slots, Key k, Value v) { auto tid = g.thread_rank(); while (tid < num_slots) { @@ -505,21 +541,24 @@ class static_reduction_map { * * @return Slots array */ - __device__ pair_atomic_type* get_slots() noexcept { return slots_; } + __device__ + constexpr slot_type* get_slots() noexcept { return storage_.ptr; } /** * @brief Gets slots array. * * @return Slots array */ - __device__ pair_atomic_type const* get_slots() const noexcept { return slots_; } + __device__ + constexpr slot_type const* get_slots() const noexcept { return storage_.ptr; } /** * @brief Gets the maximum number of elements the hash map can hold. * * @return The maximum number of elements the hash map can hold */ - __host__ __device__ std::size_t get_capacity() const noexcept { return capacity_; } + __host__ __device__ + constexpr std::size_t get_capacity() const noexcept { return storage_.size; } /** * @brief Gets the sentinel value used to represent an empty key slot. 
@@ -551,7 +590,8 @@ class static_reduction_map { * * @return Iterator to the first slot */ - __device__ iterator begin_slot() noexcept { return slots_; } + __device__ + constexpr iterator begin_slot() noexcept { return get_slots(); } /** * @brief Returns iterator to the first slot. @@ -566,21 +606,24 @@ class static_reduction_map { * * @return Iterator to the first slot */ - __device__ const_iterator begin_slot() const noexcept { return slots_; } + __device__ + constexpr const_iterator begin_slot() const noexcept { return get_slots(); } /** * @brief Returns a const_iterator to one past the last slot. * * @return A const_iterator to one past the last slot */ - __host__ __device__ const_iterator end_slot() const noexcept { return slots_ + capacity_; } + __host__ __device__ + constexpr const_iterator end_slot() const noexcept { return begin_slot() + get_capacity(); } /** * @brief Returns an iterator to one past the last slot. * * @return An iterator to one past the last slot */ - __host__ __device__ iterator end_slot() noexcept { return slots_ + capacity_; } + __host__ __device__ + constexpr iterator end_slot() noexcept { return begin_slot() + get_capacity(); } /** * @brief Returns a const_iterator to one past the last slot. @@ -590,7 +633,8 @@ class static_reduction_map { * * @return A const_iterator to one past the last slot */ - __host__ __device__ const_iterator end() const noexcept { return end_slot(); } + __host__ __device__ + constexpr const_iterator end() const noexcept { return end_slot(); } /** * @brief Returns an iterator to one past the last slot. 
@@ -600,7 +644,8 @@ class static_reduction_map { * * @return An iterator to one past the last slot */ - __host__ __device__ iterator end() noexcept { return end_slot(); } + __host__ __device__ + constexpr iterator end() noexcept { return end_slot(); } }; public: @@ -624,14 +669,15 @@ class static_reduction_map { * }); * \endcode */ - class device_mutable_view : public device_view_base { + template + class device_mutable_view : public device_view_base { public: - using value_type = typename device_view_base::value_type; - using key_type = typename device_view_base::key_type; - using mapped_type = typename device_view_base::mapped_type; - using iterator = typename device_view_base::iterator; - using const_iterator = typename device_view_base::const_iterator; - using slot_type = typename device_view_base::slot_type; + using value_type = typename device_view_base::value_type; + using key_type = typename device_view_base::key_type; + using mapped_type = typename device_view_base::mapped_type; + using iterator = typename device_view_base::iterator; + using const_iterator = typename device_view_base::const_iterator; + using slot_type = typename device_view_base::slot_type; /** * @brief Construct a mutable view of the first `capacity` slots of the @@ -648,10 +694,12 @@ class static_reduction_map { std::size_t capacity, Key empty_key_sentinel, ReductionOp reduction_op = {}) noexcept - : device_view_base{slots, capacity, empty_key_sentinel, reduction_op} + : device_view_base{slots, capacity, empty_key_sentinel, reduction_op} { } + using device_view_base::device_view_base; + template __device__ static device_mutable_view make_from_uninitialized_slots( CG const& g, @@ -660,9 +708,20 @@ class static_reduction_map { Key empty_key_sentinel, ReductionOp reduction_op = {}) noexcept { - device_view_base::initialize_slots( - g, slots, capacity, empty_key_sentinel, ReductionOp::identity); - return device_mutable_view{slots, capacity, empty_key_sentinel, reduction_op}; + assert(extent 
== dynamic_extent or capacity == extent); + device_view_base::initialize_slots(g, slots, capacity, empty_key_sentinel, ReductionOp::identity); + return device_mutable_view{slots, capacity, empty_key_sentinel, reduction_op}; + } + + template + __device__ static device_mutable_view make_from_uninitialized_slots( + CG const& g, + slot_type (&slots)[N], + Key empty_key_sentinel, + ReductionOp reduction_op = {}) noexcept + { + device_view_base::initialize_slots(g, slots, N, empty_key_sentinel, ReductionOp::identity); + return device_mutable_view{slots, empty_key_sentinel, reduction_op}; } /** @@ -723,14 +782,15 @@ class static_reduction_map { * value. * */ - class device_view : public device_view_base { + template + class device_view : public device_view_base { public: - using value_type = typename device_view_base::value_type; - using key_type = typename device_view_base::key_type; - using mapped_type = typename device_view_base::mapped_type; - using iterator = typename device_view_base::iterator; - using const_iterator = typename device_view_base::const_iterator; - using slot_type = typename device_view_base::slot_type; + using value_type = typename device_view_base::value_type; + using key_type = typename device_view_base::key_type; + using mapped_type = typename device_view_base::mapped_type; + using iterator = typename device_view_base::iterator; + using const_iterator = typename device_view_base::const_iterator; + using slot_type = typename device_view_base::slot_type; /** * @brief Construct a view of the first `capacity` slots of the @@ -746,7 +806,7 @@ class static_reduction_map { std::size_t capacity, Key empty_key_sentinel, ReductionOp reduction_op = {}) noexcept - : device_view_base{slots, capacity, empty_key_sentinel, reduction_op} + : device_view_base{slots, capacity, empty_key_sentinel, reduction_op} { } @@ -755,8 +815,8 @@ class static_reduction_map { * * @param mutable_map object of type `device_mutable_view` */ - __host__ __device__ explicit 
device_view(device_mutable_view mutable_map) - : device_view_base{mutable_map.get_slots(), + __host__ __device__ explicit device_view(device_mutable_view mutable_map) + : device_view_base{mutable_map.get_slots(), mutable_map.get_capacity(), mutable_map.get_empty_key_sentinel(), mutable_map.get_op()} @@ -1025,9 +1085,9 @@ class static_reduction_map { * * @return A device_view object based on the members of the `static_reduction_map` object */ - device_view get_device_view() const noexcept + device_view<> get_device_view() const noexcept { - return device_view(slots_, capacity_, empty_key_sentinel_, op_); + return device_view<>(slots_, capacity_, empty_key_sentinel_, op_); } /** @@ -1036,9 +1096,9 @@ class static_reduction_map { * * @return A device_mutable_view object based on the members of the `static_reduction_map` object */ - device_mutable_view get_device_mutable_view() const noexcept + device_mutable_view<> get_device_mutable_view() const noexcept { - return device_mutable_view(slots_, capacity_, empty_key_sentinel_, op_); + return device_mutable_view<>(slots_, capacity_, empty_key_sentinel_, op_); } private: diff --git a/tests/static_reduction_map/static_reduction_map_test.cu b/tests/static_reduction_map/static_reduction_map_test.cu index f1540e041..158093012 100644 --- a/tests/static_reduction_map/static_reduction_map_test.cu +++ b/tests/static_reduction_map/static_reduction_map_test.cu @@ -110,11 +110,11 @@ __global__ void static_reduction_map_shared_memory_kernel(bool* key_found) using Value = typename MapType::mapped_type; namespace cg = cooperative_groups; - using mutable_view_type = typename MapType::device_mutable_view; - using view_type = typename MapType::device_view; + using mutable_view_type = typename MapType::device_mutable_view; + using view_type = typename MapType::device_view; + #pragma diag_suppress static_var_with_dynamic_init __shared__ typename mutable_view_type::slot_type slots[N]; - auto map = - 
mutable_view_type::make_from_uninitialized_slots(cg::this_thread_block(), &slots[0], N, -1); + auto map = mutable_view_type::make_from_uninitialized_slots(cg::this_thread_block(), slots, -1); auto g = cg::this_thread_block(); std::size_t index = threadIdx.x + blockIdx.x * blockDim.x; From cddd73391204f8137f08037cb633eb0124681c83 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Wed, 13 Oct 2021 16:44:59 -0500 Subject: [PATCH 60/69] Add predicated store to prevent optimization. --- benchmarks/hash_table/static_reduction_map_bench.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/hash_table/static_reduction_map_bench.cu b/benchmarks/hash_table/static_reduction_map_bench.cu index 917200290..cf9159fa0 100644 --- a/benchmarks/hash_table/static_reduction_map_bench.cu +++ b/benchmarks/hash_table/static_reduction_map_bench.cu @@ -165,6 +165,9 @@ void static_shmem_insert_kernel(std::size_t num_keys, std::size_t multiplicity, // Placeholder predicated store to inject artificial side-effects and keep compiler from discarding // the code above + if(pred){ + *do_not_use = result; + } } template From c43f7fbfa7c3c9a50b91f3d6c786ff564d57e308 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Wed, 13 Oct 2021 19:24:19 -0500 Subject: [PATCH 61/69] Add multiplicity to benchmark. 
--- benchmarks/hash_table/static_reduction_map_bench.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/hash_table/static_reduction_map_bench.cu b/benchmarks/hash_table/static_reduction_map_bench.cu index cf9159fa0..3bb9e87aa 100644 --- a/benchmarks/hash_table/static_reduction_map_bench.cu +++ b/benchmarks/hash_table/static_reduction_map_bench.cu @@ -178,7 +178,7 @@ void static_shmem(nvbench::state& state, auto const occupancy = state.get_float64("Occupancy"); auto const num_keys = static_cast(std::floor(Capacity * occupancy)); - auto const multiplicity = 1; + auto const multiplicity = state.get_int64("Multiplicity"); if(num_keys > Capacity){ throw; @@ -208,7 +208,8 @@ NVBENCH_BENCH_TYPES(static_shmem, NVBENCH_TYPE_AXES(key_type_range, value_type_range, op_type_range, capacity_range, extent_options)) .set_name("Insert Static vs Dynamic Extent") .set_type_axes_names({"Key", "Value", "ReductionOp", "Capacity", "Extent"}) - .add_float64_axis("Occupancy", nvbench::range(0.5, 0.9, 0.1)); + .add_int64_axis("Multiplicity", {1}) + .add_float64_axis("Occupancy", nvbench::range(0.1, 0.9, 0.1)); template From bef9aa66b2ef20ed3e07ad7bfe2e3823d772807d Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 14 Oct 2021 07:59:39 -0500 Subject: [PATCH 62/69] Add appropriate host qualifier to device_view_base functions. --- include/cuco/static_reduction_map.cuh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index e12477036..3c816a564 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -534,14 +534,14 @@ class static_reduction_map { * @brief Gets the binary op * */ - __device__ ReductionOp get_op() const noexcept { return op_; } + __host__ __device__ ReductionOp get_op() const noexcept { return op_; } /** * @brief Gets slots array. 
* * @return Slots array */ - __device__ + __host__ __device__ constexpr slot_type* get_slots() noexcept { return storage_.ptr; } /** @@ -549,7 +549,7 @@ class static_reduction_map { * * @return Slots array */ - __device__ + __host__ __device__ constexpr slot_type const* get_slots() const noexcept { return storage_.ptr; } /** @@ -590,7 +590,7 @@ class static_reduction_map { * * @return Iterator to the first slot */ - __device__ + __host__ __device__ constexpr iterator begin_slot() noexcept { return get_slots(); } /** @@ -606,7 +606,7 @@ class static_reduction_map { * * @return Iterator to the first slot */ - __device__ + __host__ __device__ constexpr const_iterator begin_slot() const noexcept { return get_slots(); } /** From a99b56f758944eb13cbbfe884d6a9e697ff7971c Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 14 Oct 2021 08:00:22 -0500 Subject: [PATCH 63/69] Import extent static member into device_mutable_view. --- include/cuco/static_reduction_map.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index 3c816a564..9f6922c7d 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -679,6 +679,8 @@ class static_reduction_map { using const_iterator = typename device_view_base::const_iterator; using slot_type = typename device_view_base::slot_type; + static constexpr std::size_t extent = device_view_base::extent; + /** * @brief Construct a mutable view of the first `capacity` slots of the * slots array pointed to by `slots`. From e87fc9da0bee901c7d27985b6d5cf38113ffa8a6 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 14 Oct 2021 08:01:03 -0500 Subject: [PATCH 64/69] Add static_assert for factory from static array. Ensure the size of the array is the same as the extent or the extent is dynamic. 
--- include/cuco/static_reduction_map.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index 9f6922c7d..3dd928878 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -722,6 +722,7 @@ class static_reduction_map { Key empty_key_sentinel, ReductionOp reduction_op = {}) noexcept { + static_assert(extent == dynamic_extent or N == extent); device_view_base::initialize_slots(g, slots, N, empty_key_sentinel, ReductionOp::identity); return device_mutable_view{slots, empty_key_sentinel, reduction_op}; } From 5f244292990dbde9d5311d28ede72e74803250ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= Date: Mon, 21 Mar 2022 11:15:30 +0100 Subject: [PATCH 65/69] Minor fixes addressing reviewer comments --- benchmarks/CMakeLists.txt | 2 +- .../static_reduction_map/insert_bench.cu | 1 + .../static_reduction_map/param_sweep.cu | 3 +- .../reduce_by_key/cuco_reduce_by_key_bench.cu | 3 +- benchmarks/utils.hpp | 9 +-- include/cuco/detail/reduction_ops.cuh | 6 +- include/cuco/detail/static_reduction_map.inl | 24 +++---- .../detail/static_reduction_map_kernels.cuh | 62 ++++++++----------- include/cuco/detail/traits.hpp | 54 ---------------- include/cuco/static_reduction_map.cuh | 2 +- tests/CMakeLists.txt | 4 +- 11 files changed, 54 insertions(+), 116 deletions(-) delete mode 100644 include/cuco/detail/traits.hpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 927eb790b..c6ab3c868 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -71,7 +71,7 @@ function(ConfigureNVBench BENCH_NAME) endfunction(ConfigureNVBench) ################################################################################################### -### test sources ################################################################################## +### benchmark sources 
############################################################################# ################################################################################################### ################################################################################################### diff --git a/benchmarks/hash_table/static_reduction_map/insert_bench.cu b/benchmarks/hash_table/static_reduction_map/insert_bench.cu index 94d3055aa..c71973e59 100644 --- a/benchmarks/hash_table/static_reduction_map/insert_bench.cu +++ b/benchmarks/hash_table/static_reduction_map/insert_bench.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include #include diff --git a/benchmarks/hash_table/static_reduction_map/param_sweep.cu b/benchmarks/hash_table/static_reduction_map/param_sweep.cu index 8f2cd0c22..e3643432a 100644 --- a/benchmarks/hash_table/static_reduction_map/param_sweep.cu +++ b/benchmarks/hash_table/static_reduction_map/param_sweep.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include diff --git a/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu b/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu index 32dac93b2..59b14dd12 100644 --- a/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu +++ b/benchmarks/reduce_by_key/cuco_reduce_by_key_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include #include #include #include diff --git a/benchmarks/utils.hpp b/benchmarks/utils.hpp index a2597e550..a09094378 100644 --- a/benchmarks/utils.hpp +++ b/benchmarks/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,9 +33,4 @@ std::size_t count_unique(Iter begin, Iter end) std::sort(v.begin(), v.end()); return std::distance(v.begin(), std::unique(v.begin(), v.end())); -} - -// safe division -#ifndef SDIV -#define SDIV(x, y) (((x) + (y)-1) / (y)) -#endif \ No newline at end of file +} \ No newline at end of file diff --git a/include/cuco/detail/reduction_ops.cuh b/include/cuco/detail/reduction_ops.cuh index c8496470e..32ddebdbb 100644 --- a/include/cuco/detail/reduction_ops.cuh +++ b/include/cuco/detail/reduction_ops.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,6 +16,10 @@ #pragma once +#include +#include +#include + namespace cuco { /** diff --git a/include/cuco/detail/static_reduction_map.inl b/include/cuco/detail/static_reduction_map.inl index e40db569d..af6268d3a 100644 --- a/include/cuco/detail/static_reduction_map.inl +++ b/include/cuco/detail/static_reduction_map.inl @@ -69,14 +69,14 @@ template void static_reduction_map::insert( InputIt first, InputIt last, cudaStream_t stream, Hash hash, KeyEqual key_equal) { - auto num_keys = std::distance(first, last); + auto const num_keys = std::distance(first, last); if (num_keys == 0) { return; } - auto const block_size = 128; - auto const stride = 1; - auto const tile_size = 4; - auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); - auto view = get_device_mutable_view(); + auto constexpr block_size = 128; + auto constexpr stride = 1; + auto constexpr tile_size = 4; + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + auto view = get_device_mutable_view(); detail::insert <<>>(first, first + num_keys, view, hash, key_equal); @@ -98,13 +98,13 @@ void static_reduction_map::find(Input auto num_keys = std::distance(first, last); if (num_keys == 0) { return; } - auto const block_size = 128; - auto const stride = 1; - auto const tile_size = 4; - auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); - auto view = get_device_view(); + auto constexpr block_size = 128; + auto constexpr stride = 1; + auto constexpr tile_size = 4; + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + auto const view = get_device_view(); - detail::find + detail::find <<>>(first, last, output_begin, view, hash, key_equal); } diff --git a/include/cuco/detail/static_reduction_map_kernels.cuh b/include/cuco/detail/static_reduction_map_kernels.cuh index 7dbad0c4a..8dce498e6 100644 --- 
a/include/cuco/detail/static_reduction_map_kernels.cuh +++ b/include/cuco/detail/static_reduction_map_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,10 @@ * limitations under the License. */ +#include +#include +#include + namespace cuco { namespace detail { namespace cg = cooperative_groups; @@ -65,7 +69,6 @@ __global__ void initialize(pair_atomic_type* const slots, Key k, Value v, std::s * @tparam KeyEqual Binary callable type * @param first Beginning of the sequence of key/value pairs * @param last End of the sequence of key/value pairs - * @param num_successes The number of successfully inserted key/value pairs * @param view Mutable device view used to access the hash map's slot storage * @param hash The unary function to apply to hash each key * @param key_equal The binary function used to compare two keys for equality @@ -77,8 +80,8 @@ template __global__ void insert(InputIt first, InputIt last, viewT view, Hash hash, KeyEqual key_equal) { - auto tid = block_size * blockIdx.x + threadIdx.x; - auto it = first + tid; + auto const tid = block_size * blockIdx.x + threadIdx.x; + auto it = first + tid; while (it < last) { typename viewT::value_type const insert_pair{*it}; @@ -101,36 +104,29 @@ __global__ void insert(InputIt first, InputIt last, viewT view, Hash hash, KeyEq * inserts * @tparam InputIt Device accessible input iterator whose `value_type` is * convertible to the map's `value_type` - * @tparam atomicT Type of atomic storage * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type * @param first Beginning of the sequence of key/value pairs * @param last End of the sequence of key/value pairs - * @param num_successes The number of successfully 
inserted key/value pairs * @param view Mutable device view used to access the hash map's slot storage * @param hash The unary function to apply to hash each key * @param key_equal The binary function used to compare two keys for equality */ template __global__ void insert(InputIt first, InputIt last, viewT view, Hash hash, KeyEqual key_equal) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto it = first + tid / tile_size; + auto const tile = cg::tiled_partition(cg::this_thread_block()); + auto const tid = block_size * blockIdx.x + threadIdx.x; + auto it = first + tid / tile_size; while (it < last) { - // force conversion to value_type - typename viewT::value_type const insert_pair{ - static_cast(thrust::get<0>(*it)), - static_cast(thrust::get<1>(*it))}; - - view.insert(tile, insert_pair, hash, key_equal); + view.insert(tile, *it, hash, key_equal); it += (gridDim.x * block_size) / tile_size; } } @@ -141,7 +137,6 @@ __global__ void insert(InputIt first, InputIt last, viewT view, Hash hash, KeyEq * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`. * Else, copies the empty value sentinel. 
* @tparam block_size The size of the thread block - * @tparam Value The type of the mapped value for the map * @tparam InputIt Device accessible input iterator whose `value_type` is * convertible to the map's `key_type` * @tparam OutputIt Device accessible output iterator whose `value_type` is @@ -157,7 +152,6 @@ __global__ void insert(InputIt first, InputIt last, viewT view, Hash hash, KeyEq * @param key_equal The binary function to compare two keys for equality */ template (cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto key_idx = tid / tile_size; - __shared__ Value writeBuffer[block_size]; + auto const tile = cg::tiled_partition(cg::this_thread_block()); + auto const tid = block_size * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; + __shared__ typename viewT::mapped_type writeBuffer[block_size]; while (first + key_idx < last) { auto key = *(first + key_idx); @@ -261,7 +253,7 @@ __global__ void find( * @tparam InputIt Device accessible input iterator whose `value_type` is * convertible to the map's `key_type` * @tparam OutputIt Device accessible output iterator whose `value_type` is - * convertible to the map's `mapped_type` + * convertible to `bool` * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type @@ -281,8 +273,8 @@ template (cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto key_idx = tid / tile_size; + auto const tile = cg::tiled_partition(cg::this_thread_block()); + auto const tid = block_size * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; __shared__ bool writeBuffer[block_size]; while (first + key_idx < last) { diff --git a/include/cuco/detail/traits.hpp b/include/cuco/detail/traits.hpp deleted file mode 100644 index 53ef38433..000000000 --- a/include/cuco/detail/traits.hpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - */ - -#pragma once - -namespace cuco { -/** - * @brief Customization point that can be specialized to indicate that it is safe to perform bitwise - * equality comparisons on objects of type `T`. - * - * By default, only types where `std::has_unique_object_representations_v` is true are safe for - * bitwise equality. However, this can be too restrictive for some types, e.g., floating point - * types. - * - * User-defined specializations of `is_bitwise_comparable` are allowed, but it is the users - * responsibility to ensure values do not occur that would lead to unexpected behavior. For example, - * if a `NaN` bit pattern were used as the empty sentinel value, it may not compare bitwise equal to - * other `NaN` bit patterns. - * - */ -template -struct is_bitwise_comparable : std::false_type { -}; - -/// By default, only types with unique object representations are allowed -template -struct is_bitwise_comparable>> - : std::true_type { -}; - -/** - * @brief Declares that a type `Type` is bitwise comparable. 
- * - */ -#define CUCO_DECLARE_BITWISE_COMPARABLE(Type) \ - namespace cuco { \ - template <> \ - struct is_bitwise_comparable : std::true_type { \ - }; \ - } - -} // namespace cuco \ No newline at end of file diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index 52c373e61..27bda61e9 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -42,7 +42,7 @@ #include #include #include -#include +#include #include namespace cuco { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e22b2f49e..fba49c0b1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -43,8 +43,6 @@ function(ConfigureTest TEST_NAME) target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) set_target_properties(${TEST_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests") - target_include_directories(${TEST_NAME} PRIVATE - "${CMAKE_CURRENT_SOURCE_DIR}") target_compile_options(${TEST_NAME} PRIVATE --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage) catch_discover_tests(${TEST_NAME}) endfunction(ConfigureTest) From c2e4e6212ee612a05ef873b9d2b988efa5d4e560 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= Date: Wed, 23 Mar 2022 15:38:25 +0100 Subject: [PATCH 66/69] Move reduction operators to include/cuco/ --- include/cuco/{detail => }/reduction_ops.cuh | 0 include/cuco/static_reduction_map.cuh | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename include/cuco/{detail => }/reduction_ops.cuh (100%) diff --git a/include/cuco/detail/reduction_ops.cuh b/include/cuco/reduction_ops.cuh similarity 
index 100% rename from include/cuco/detail/reduction_ops.cuh rename to include/cuco/reduction_ops.cuh diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index 27bda61e9..1fdba3cb1 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -40,8 +40,8 @@ #include #include #include -#include #include +#include #include #include From e1361a3cad8105402315b47c6441237a192c0de0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= Date: Wed, 23 Mar 2022 16:15:52 +0100 Subject: [PATCH 67/69] Added a tag to ensure that only valid reduction functors can be used --- include/cuco/detail/tags.hpp | 30 +++++++++++++++ include/cuco/reduction_ops.cuh | 53 ++++++++++++++++++++------- include/cuco/static_reduction_map.cuh | 4 +- 3 files changed, 73 insertions(+), 14 deletions(-) create mode 100644 include/cuco/detail/tags.hpp diff --git a/include/cuco/detail/tags.hpp b/include/cuco/detail/tags.hpp new file mode 100644 index 000000000..7520e6e4d --- /dev/null +++ b/include/cuco/detail/tags.hpp @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +namespace cuco { + +namespace detail { + +namespace tags { + +struct reduction_op { +}; + +} // namespace tags +} // namespace detail +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/reduction_ops.cuh b/include/cuco/reduction_ops.cuh index 32ddebdbb..ec3d5a936 100644 --- a/include/cuco/reduction_ops.cuh +++ b/include/cuco/reduction_ops.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -30,7 +31,9 @@ namespace cuco { */ template struct reduce_add { - using value_type = T; + using value_type = T; + using tag = detail::tags::reduction_op; + static constexpr T identity = 0; template @@ -44,7 +47,9 @@ struct reduce_add { // https://github.com/NVIDIA/libcudacxx/issues/104 template <> struct reduce_add { - using value_type = float; + using value_type = float; + using tag = detail::tags::reduction_op; + static constexpr float identity = 0; template @@ -56,7 +61,9 @@ struct reduce_add { template <> struct reduce_add { - using value_type = double; + using value_type = double; + using tag = detail::tags::reduction_op; + static constexpr double identity = 0; template @@ -74,7 +81,9 @@ struct reduce_add { */ template struct reduce_sub { - using value_type = T; + using value_type = T; + using tag = detail::tags::reduction_op; + static constexpr T identity = 0; template @@ -86,7 +95,9 @@ struct reduce_sub { template <> struct reduce_sub { - using value_type = float; + using value_type = float; + using tag = detail::tags::reduction_op; + static constexpr float identity = 0; template @@ -98,7 +109,9 @@ struct reduce_sub { template <> struct reduce_sub { - using value_type = double; + using value_type = double; + using tag = detail::tags::reduction_op; + static constexpr double identity = 0; template @@ -116,7 +129,9 @@ struct reduce_sub { */ template struct reduce_min { - using value_type = T; + using value_type = T; + using tag = detail::tags::reduction_op; + static constexpr T identity = 
std::numeric_limits::max(); template @@ -128,7 +143,9 @@ struct reduce_min { template <> struct reduce_min { - using value_type = float; + using value_type = float; + using tag = detail::tags::reduction_op; + static constexpr float identity = std::numeric_limits::max(); template @@ -140,7 +157,9 @@ struct reduce_min { template <> struct reduce_min { - using value_type = double; + using value_type = double; + using tag = detail::tags::reduction_op; + static constexpr double identity = std::numeric_limits::max(); template @@ -158,7 +177,9 @@ struct reduce_min { */ template struct reduce_max { - using value_type = T; + using value_type = T; + using tag = detail::tags::reduction_op; + static constexpr T identity = std::numeric_limits::lowest(); template @@ -170,7 +191,9 @@ struct reduce_max { template <> struct reduce_max { - using value_type = float; + using value_type = float; + using tag = detail::tags::reduction_op; + static constexpr float identity = std::numeric_limits::lowest(); template @@ -182,7 +205,9 @@ struct reduce_max { template <> struct reduce_max { - using value_type = double; + using value_type = double; + using tag = detail::tags::reduction_op; + static constexpr double identity = std::numeric_limits::lowest(); template @@ -206,7 +231,9 @@ template struct custom_op { - using value_type = T; + using value_type = T; + using tag = detail::tags::reduction_op; + static constexpr T identity = Identity; Op op; diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index 1fdba3cb1..62d6be8e2 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -151,7 +152,8 @@ class static_reduction_map { is_bitwise_comparable::value, "Key type must have unique object representations or have been explicitly declared as safe for " "bitwise comparison via specialization of cuco::is_bitwise_comparable."); - + 
static_assert(std::is_same::value, + "Invalid reduction functor"); static_assert(std::is_same::value, "Type mismatch between ReductionOp::value_type and Value"); From d196de5a71373c8df317b1e96a5cf10d61a0515a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 26 May 2022 14:53:12 -0400 Subject: [PATCH 68/69] Move common kernels to a new file --- include/cuco/detail/common_kernels.cuh | 172 +++++++++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 include/cuco/detail/common_kernels.cuh diff --git a/include/cuco/detail/common_kernels.cuh b/include/cuco/detail/common_kernels.cuh new file mode 100644 index 000000000..e97caa564 --- /dev/null +++ b/include/cuco/detail/common_kernels.cuh @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +#include +#include + +namespace cuco { +namespace detail { +namespace cg = cooperative_groups; + +/** + * @brief Initializes each slot in the flat `slots` storage to contain `k` and `v`. + * + * Each space in `slots` that can hold a key value pair is initialized to a + * `pair_atomic_type` containing the key `k` and the value `v`. 
+ * + * @tparam atomic_key_type Type of the `Key` atomic container + * @tparam atomic_mapped_type Type of the `Value` atomic container + * @tparam Key key type + * @tparam Value value type + * @tparam pair_atomic_type key/value pair type + * @param slots Pointer to flat storage for the map's key/value pairs + * @param k Key to which all keys in `slots` are initialized + * @param v Value to which all values in `slots` are initialized + * @param size Size of the storage pointed to by `slots` + */ +template +__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, std::size_t size) +{ + auto tid = block_size * blockIdx.x + threadIdx.x; + while (tid < size) { + new (&slots[tid].first) atomic_key_type{k}; + new (&slots[tid].second) atomic_mapped_type{v}; + tid += gridDim.x * block_size; + } +} + +/** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. + * + * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. 
+ * + * @tparam block_size The size of the thread block + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam OutputIt Device accessible output iterator whose `value_type` is + * convertible to `bool` + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param view Device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ +template +__global__ void contains( + InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) +{ + auto const tid = block_size * blockIdx.x + threadIdx.x; + auto key_idx = tid; + __shared__ bool writeBuffer[block_size]; + + while (first + key_idx < last) { + auto key = *(first + key_idx); + + /* + * The ld.relaxed.gpu instruction used in view.find causes L1 to + * flush more frequently, causing increased sector stores from L2 to global memory. + * By writing results to shared memory and then synchronizing before writing back + * to global, we no longer rely on L1, preventing the increase in sector stores from + * L2 to global and improving performance. + */ + writeBuffer[threadIdx.x] = view.contains(key, hash, key_equal); + __syncthreads(); + *(output_begin + key_idx) = writeBuffer[threadIdx.x]; + key_idx += gridDim.x * block_size; + } +} + +/** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. + * + * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. 
+ * Uses the CUDA Cooperative Groups API to leverage groups of multiple threads to perform the + * contains operation for each key. This provides a significant boost in throughput compared + * to the non Cooperative Group `contains` at moderate to high load factors. + * + * @tparam block_size The size of the thread block + * @tparam tile_size The number of threads in the Cooperative Groups used to perform + * inserts + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam OutputIt Device accessible output iterator whose `value_type` is + * convertible to `bool` + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param view Device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ +template +__global__ void contains( + InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) +{ + auto const tile = cg::tiled_partition(cg::this_thread_block()); + auto const tid = block_size * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; + __shared__ bool writeBuffer[block_size]; + + while (first + key_idx < last) { + auto key = *(first + key_idx); + auto found = view.contains(tile, key, hash, key_equal); + + /* + * The ld.relaxed.gpu instruction used in view.find causes L1 to + * flush more frequently, causing increased sector stores from L2 to global memory. 
+ * By writing results to shared memory and then synchronizing before writing back + * to global, we no longer rely on L1, preventing the increase in sector stores from + * L2 to global and improving performance. + */ + if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found; } + __syncthreads(); + if (tile.thread_rank() == 0) { + *(output_begin + key_idx) = writeBuffer[threadIdx.x / tile_size]; + } + key_idx += (gridDim.x * block_size) / tile_size; + } +} + +} // namespace detail +} // namespace cuco From e904dca1dc349c7f83b6bf07dfd03048381be869 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 26 May 2022 14:55:03 -0400 Subject: [PATCH 69/69] Updates: incorporate new kernel header --- .../static_reduction_map_example.cu | 2 +- include/cuco/detail/static_map_kernels.cuh | 147 +----------------- .../detail/static_reduction_map_kernels.cuh | 145 +---------------- include/cuco/static_map.cuh | 24 +-- include/cuco/static_reduction_map.cuh | 26 ++-- 5 files changed, 31 insertions(+), 313 deletions(-) diff --git a/examples/static_reduction_map/static_reduction_map_example.cu b/examples/static_reduction_map/static_reduction_map_example.cu index 8d3839658..3173b47ce 100644 --- a/examples/static_reduction_map/static_reduction_map_example.cu +++ b/examples/static_reduction_map/static_reduction_map_example.cu @@ -90,4 +90,4 @@ int main(void) for (int i = 0; i < unique_keys.size(); ++i) { std::cout << "Key: " << unique_keys[i] << " Count: " << count_per_key[i] << std::endl; } -} \ No newline at end of file +} diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh index 642373135..ce886ec8a 100644 --- a/include/cuco/detail/static_map_kernels.cuh +++ b/include/cuco/detail/static_map_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,43 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#pragma once + +#include namespace cuco { namespace detail { namespace cg = cooperative_groups; -/** - * @brief Initializes each slot in the flat `slots` storage to contain `k` and `v`. - * - * Each space in `slots` that can hold a key value pair is initialized to a - * `pair_atomic_type` containing the key `k` and the value `v`. - * - * @tparam atomic_key_type Type of the `Key` atomic container - * @tparam atomic_mapped_type Type of the `Value` atomic container - * @tparam Key key type - * @tparam Value value type - * @tparam pair_atomic_type key/value pair type - * @param slots Pointer to flat storage for the map's key/value pairs - * @param k Key to which all keys in `slots` are initialized - * @param v Value to which all values in `slots` are initialized - * @param size Size of the storage pointed to by `slots` - */ -template -__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, std::size_t size) -{ - auto tid = block_size * blockIdx.x + threadIdx.x; - while (tid < size) { - new (&slots[tid].first) atomic_key_type{k}; - new (&slots[tid].second) atomic_mapped_type{v}; - tid += gridDim.x * block_size; - } -} - /** * @brief Inserts all key/value pairs in the range `[first, last)`. * @@ -349,115 +320,5 @@ __global__ void find( } } -/** - * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. - * - * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. 
- * - * @tparam block_size The size of the thread block - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `key_type` - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * convertible to the map's `mapped_type` - * @tparam viewT Type of device view allowing access of hash map storage - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys - * @param output_begin Beginning of the sequence of booleans for the presence of each key - * @param view Device view used to access the hash map's slot storage - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality - */ -template -__global__ void contains( - InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) -{ - auto tid = block_size * blockIdx.x + threadIdx.x; - auto key_idx = tid; - __shared__ bool writeBuffer[block_size]; - - while (first + key_idx < last) { - auto key = *(first + key_idx); - - /* - * The ld.relaxed.gpu instruction used in view.find causes L1 to - * flush more frequently, causing increased sector stores from L2 to global memory. - * By writing results to shared memory and then synchronizing before writing back - * to global, we no longer rely on L1, preventing the increase in sector stores from - * L2 to global and improving performance. - */ - writeBuffer[threadIdx.x] = view.contains(key, hash, key_equal); - __syncthreads(); - *(output_begin + key_idx) = writeBuffer[threadIdx.x]; - key_idx += gridDim.x * block_size; - } -} - -/** - * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. - * - * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. 
- * Uses the CUDA Cooperative Groups API to leverage groups of multiple threads to perform the - * contains operation for each key. This provides a significant boost in throughput compared - * to the non Cooperative Group `contains` at moderate to high load factors. - * - * @tparam block_size The size of the thread block - * @tparam tile_size The number of threads in the Cooperative Groups used to perform - * inserts - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `key_type` - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * convertible to the map's `mapped_type` - * @tparam viewT Type of device view allowing access of hash map storage - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys - * @param output_begin Beginning of the sequence of booleans for the presence of each key - * @param view Device view used to access the hash map's slot storage - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality - */ -template -__global__ void contains( - InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) -{ - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto key_idx = tid / tile_size; - __shared__ bool writeBuffer[block_size]; - - while (first + key_idx < last) { - auto key = *(first + key_idx); - auto found = view.contains(tile, key, hash, key_equal); - - /* - * The ld.relaxed.gpu instruction used in view.find causes L1 to - * flush more frequently, causing increased sector stores from L2 to global memory. 
- * By writing results to shared memory and then synchronizing before writing back - * to global, we no longer rely on L1, preventing the increase in sector stores from - * L2 to global and improving performance. - */ - if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found; } - __syncthreads(); - if (tile.thread_rank() == 0) { - *(output_begin + key_idx) = writeBuffer[threadIdx.x / tile_size]; - } - key_idx += (gridDim.x * block_size) / tile_size; - } -} - } // namespace detail } // namespace cuco diff --git a/include/cuco/detail/static_reduction_map_kernels.cuh b/include/cuco/detail/static_reduction_map_kernels.cuh index 8dce498e6..c8588fdec 100644 --- a/include/cuco/detail/static_reduction_map_kernels.cuh +++ b/include/cuco/detail/static_reduction_map_kernels.cuh @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#pragma once #include #include @@ -22,38 +23,6 @@ namespace cuco { namespace detail { namespace cg = cooperative_groups; -/** - * @brief Initializes each slot in the flat `slots` storage to contain `k` and `v`. - * - * Each space in `slots` that can hold a key value pair is initialized to a - * `pair_atomic_type` containing the key `k` and the value `v`. 
- * - * @tparam atomic_key_type Type of the `Key` atomic container - * @tparam atomic_mapped_type Type of the `Value` atomic container - * @tparam Key key type - * @tparam Value value type - * @tparam pair_atomic_type key/value pair type - * @param slots Pointer to flat storage for the map's key/value pairs - * @param k Key to which all keys in `slots` are initialized - * @param v Value to which all values in `slots` are initialized - * @param size Size of the storage pointed to by `slots` - */ -template -__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, std::size_t size) -{ - auto tid = block_size * blockIdx.x + threadIdx.x; - while (tid < size) { - new (&slots[tid].first) atomic_key_type{k}; - new (&slots[tid].second) atomic_mapped_type{v}; - tid += gridDim.x * block_size; - } -} - /** * @brief Inserts all key/value pairs in the range `[first, last)`. * @@ -244,115 +213,5 @@ __global__ void find( } } -/** - * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. - * - * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. 
- * - * @tparam block_size The size of the thread block - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `key_type` - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * convertible to `bool` - * @tparam viewT Type of device view allowing access of hash map storage - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys - * @param output_begin Beginning of the sequence of booleans for the presence of each key - * @param view Device view used to access the hash map's slot storage - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality - */ -template -__global__ void contains( - InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) -{ - auto const tid = block_size * blockIdx.x + threadIdx.x; - auto key_idx = tid; - __shared__ bool writeBuffer[block_size]; - - while (first + key_idx < last) { - auto key = *(first + key_idx); - - /* - * The ld.relaxed.gpu instruction used in view.find causes L1 to - * flush more frequently, causing increased sector stores from L2 to global memory. - * By writing results to shared memory and then synchronizing before writing back - * to global, we no longer rely on L1, preventing the increase in sector stores from - * L2 to global and improving performance. - */ - writeBuffer[threadIdx.x] = view.contains(key, hash, key_equal); - __syncthreads(); - *(output_begin + key_idx) = writeBuffer[threadIdx.x]; - key_idx += gridDim.x * block_size; - } -} - -/** - * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. - * - * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. 
- * Uses the CUDA Cooperative Groups API to leverage groups of multiple threads to perform the - * contains operation for each key. This provides a significant boost in throughput compared - * to the non Cooperative Group `contains` at moderate to high load factors. - * - * @tparam block_size The size of the thread block - * @tparam tile_size The number of threads in the Cooperative Groups used to perform - * inserts - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `key_type` - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * convertible to `bool` - * @tparam viewT Type of device view allowing access of hash map storage - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys - * @param output_begin Beginning of the sequence of booleans for the presence of each key - * @param view Device view used to access the hash map's slot storage - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality - */ -template -__global__ void contains( - InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) -{ - auto const tile = cg::tiled_partition(cg::this_thread_block()); - auto const tid = block_size * blockIdx.x + threadIdx.x; - auto key_idx = tid / tile_size; - __shared__ bool writeBuffer[block_size]; - - while (first + key_idx < last) { - auto key = *(first + key_idx); - auto found = view.contains(tile, key, hash, key_equal); - - /* - * The ld.relaxed.gpu instruction used in view.find causes L1 to - * flush more frequently, causing increased sector stores from L2 to global memory. 
- * By writing results to shared memory and then synchronizing before writing back - * to global, we no longer rely on L1, preventing the increase in sector stores from - * L2 to global and improving performance. - */ - if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found; } - __syncthreads(); - if (tile.thread_rank() == 0) { - *(output_begin + key_idx) = writeBuffer[threadIdx.x / tile_size]; - } - key_idx += (gridDim.x * block_size) / tile_size; - } -} - } // namespace detail -} // namespace cuco \ No newline at end of file +} // namespace cuco diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 199dcd838..9aedb3e5a 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,16 +16,17 @@ #pragma once -#include -#include -#include -#include -#include -#include - #include +#include +#include +#include +#include +#include #include +#include + +#include #if defined(CUDART_VERSION) && (CUDART_VERSION >= 11000) && defined(__CUDA_ARCH__) && \ (__CUDA_ARCH__ >= 700) #define CUCO_HAS_CUDA_BARRIER @@ -35,10 +36,9 @@ #include #endif -#include -#include -#include -#include +#include +#include +#include namespace cuco { diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh index 62d6be8e2..be87f8b47 100644 --- a/include/cuco/static_reduction_map.cuh +++ b/include/cuco/static_reduction_map.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,8 +15,17 @@ */ #pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include -#include #include #include #include @@ -26,8 +35,6 @@ #include #include -#include - #if defined(CUDART_VERSION) && (CUDART_VERSION >= 11000) && defined(__CUDA_ARCH__) && \ (__CUDA_ARCH__ >= 700) #define CUCO_HAS_CUDA_BARRIER @@ -37,15 +44,6 @@ #include #endif -#include -#include -#include -#include -#include -#include -#include -#include - namespace cuco { static constexpr std::size_t dynamic_extent = std::numeric_limits::max(); @@ -1130,4 +1128,4 @@ class static_reduction_map { }; } // namespace cuco -#include \ No newline at end of file +#include