From 2998d9fcd49b36ae8524a4d716a681142284e041 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 11 Jul 2022 12:38:27 +0000 Subject: [PATCH 01/12] Update Catch2 to v2.13.9. --- tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index dec5170c4..92dd5a34d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -23,7 +23,7 @@ include(CTest) CPMAddPackage( NAME Catch2 GITHUB_REPOSITORY catchorg/Catch2 - VERSION 2.11.1 + VERSION 2.13.9 ) if(Catch2_ADDED) From 2a8a50f2d0a403164310e04f76479dd2f6d33ca6 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 11 Jul 2022 12:49:04 +0000 Subject: [PATCH 02/12] Added reduction functors that can be used by static_reduction_map. --- .../cuco/detail/reduction_functor_impl.cuh | 128 +++++++++++ include/cuco/reduction_functors.cuh | 157 +++++++++++++ tests/CMakeLists.txt | 5 + .../reduction_functors_test.cu | 212 ++++++++++++++++++ 4 files changed, 502 insertions(+) create mode 100644 include/cuco/detail/reduction_functor_impl.cuh create mode 100644 include/cuco/reduction_functors.cuh create mode 100644 tests/static_reduction_map/reduction_functors_test.cu diff --git a/include/cuco/detail/reduction_functor_impl.cuh b/include/cuco/detail/reduction_functor_impl.cuh new file mode 100644 index 000000000..cc771dafb --- /dev/null +++ b/include/cuco/detail/reduction_functor_impl.cuh @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace cuco { +namespace detail { + +/** + * @brief Base class of all reduction functors. + * + * @warning This class should not be used directly. + * + */ +class reduction_functor_base {}; + +template +struct reduce_add_impl { + template + __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept { + return lhs.fetch_add(rhs) + rhs; + } +}; + +template +struct reduce_min_impl { + template + __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept { + return min(lhs.fetch_min(rhs), rhs); + } +}; + +template +struct reduce_max_impl { + template + __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept { + return max(lhs.fetch_max(rhs), rhs); + } +}; + +template +struct reduce_count_impl { + template + __device__ T operator()(cuda::atomic& lhs, T const& /* rhs */) const noexcept { + return ++lhs; + } +}; + +// remove the following WAR once libcu++ extends FP atomics support and fixes signed integer atomics +// https://github.com/NVIDIA/libcudacxx/pull/286 +template +struct reduce_add_impl::value>::type> { + template + __device__ T operator()(cuda::atomic& lhs, T rhs) const noexcept { + if constexpr (Scope == cuda::thread_scope_system) + return atomicAdd_system(reinterpret_cast(&lhs), rhs) + rhs; + else if constexpr (Scope == cuda::thread_scope_device) + return atomicAdd(reinterpret_cast(&lhs), rhs) + rhs; + else + return atomicAdd_block(reinterpret_cast(&lhs), rhs) + rhs; + } +}; + +template +struct reduce_min_impl::value && cuda::std::is_signed::value>::type> { + template + __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept { + using InternalT = typename cuda::std::conditional::type; + InternalT * ptr = reinterpret_cast(&lhs); + InternalT value = rhs; + if constexpr (Scope == cuda::thread_scope_system) + return min(atomicMin_system(ptr, value), value); + else if constexpr (Scope == cuda::thread_scope_device) + return min(atomicMin(ptr, value), value); + else + return min(atomicMin_block(ptr, value), value); + } +}; + +template +struct reduce_max_impl::value && cuda::std::is_signed::value>::type> { + template + __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept { + using InternalT = typename cuda::std::conditional::type; + InternalT * ptr = reinterpret_cast(&lhs); + InternalT value = rhs; + if constexpr (Scope == cuda::thread_scope_system) + return max(atomicMax_system(ptr, value), value); + else if constexpr (Scope == cuda::thread_scope_device) + return max(atomicMax(ptr, value), value); + else + return max(atomicMax_block(ptr, value), value); + } +}; + +template +struct reduce_min_impl::value>::type> { + __device__ T operator()(T lhs, T rhs) const noexcept { + return min(lhs, rhs); + } +}; + +template +struct reduce_max_impl::value>::type> { + __device__ T operator()(T lhs, T rhs) const noexcept { + return max(lhs, rhs); + } +}; + +} // namespace detail +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/reduction_functors.cuh b/include/cuco/reduction_functors.cuh new file mode 100644 index 000000000..94888f6b2 --- /dev/null +++ b/include/cuco/reduction_functors.cuh @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include +#include + +namespace cuco { + +/** + * @brief Wrapper for reduction identity value. + * + * @tparam T The underlying value type used for reduction + */ +template +class identity_value { + public: + using type = T; + constexpr identity_value(T const& identity) noexcept : identity_(identity) {} + constexpr T value() const noexcept { return identity_; } + private: + T identity_; +}; + +/** + * @brief Wrapper for a user-defined custom reduction operator. + * + * External synchronization, if required, + * is established via an atomic compare-and-swap loop. + * + * Example: + * \code{.cpp} + * template + * struct custom_plus { + * __device__ T operator()(T const& lhs, T const& rhs) const noexcept { + * return lhs + rhs; + * } + * }; + * + * template + * struct custom_plus_sync { + * template + * __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept { + * return lhs.fetch_add(rhs) + rhs; + * } + * }; + * + * int main() { + * cuco::identity_value identity{0}; // define the identity value for the given reduction operation, i.e., op(identity, x) == x + * + * auto f1 = cuco::reduction_functor, int>(identity); // synchronized via CAS-loop + * auto f2 = cuco::reduction_functor, int>(identity); // implicitly synchronized + * + * auto custom_plus_lambda = [] __device__ (int lhs, int rhs) noexcept { return lhs + rhs; }; + * auto f3 = cuco::reduction_functor(identity, custom_plus_lambda); + * } + * \endcode + * + * @tparam Func The user-defined reduction functor + * @tparam Value The value type used for reduction + */ +template +class reduction_functor : detail::reduction_functor_base { + public: + using value_type = Value; + + reduction_functor(cuco::identity_value identity, Func functor = Func{}) noexcept : identity_(identity), functor_(functor) {} + + template + __device__ value_type operator()(cuda::atomic& lhs, value_type const& rhs) const noexcept + { + if constexpr (uses_external_sync()) { + value_type old = lhs.load(cuda::memory_order_relaxed); + value_type desired; + + do { + desired = functor_(old, rhs); + } while (!lhs.compare_exchange_weak(old, desired, cuda::memory_order_release, cuda::memory_order_relaxed)); + + return desired; + } else { + return functor_(lhs, rhs); + } + } + + __host__ __device__ value_type identity() const noexcept { + return identity_.value(); + } + + __host__ __device__ static constexpr bool uses_external_sync() noexcept { + return !atomic_invocable_ || naive_invocable_; + } + + private: + cuco::identity_value identity_; + Func functor_; + static constexpr bool naive_invocable_ = std::is_invocable_r::value; + static constexpr bool atomic_invocable_ = + std::is_invocable_r&, value_type>::value || + std::is_invocable_r&, value_type>::value || + std::is_invocable_r&, value_type>::value || + std::is_invocable_r&, value_type>::value; + + static_assert(atomic_invocable_ || naive_invocable_, "Invalid operator signature."); +}; + +/** + * @brief Synchronized `+` reduction functor. + * + * @tparam T The value type used for reduction + */ +template +auto reduce_add() { return reduction_functor(identity_value{0}, detail::reduce_add_impl{}); }; + +/** + * @brief Synchronized `min` reduction functor. + * + * @tparam T The value type used for reduction + */ +template +auto reduce_min() { return reduction_functor(identity_value{cuda::std::numeric_limits::max()}, detail::reduce_min_impl{}); }; + +/** + * @brief Synchronized `max` reduction functor. + * + * @tparam T The value type used for reduction + */ +template +auto reduce_max() { return reduction_functor(identity_value{cuda::std::numeric_limits::lowest()}, detail::reduce_max_impl{}); }; + +/** + * @brief Synchronized `count` reduction functor. + * + * @tparam T The value type used for reduction + */ +template +auto reduce_count() { return reduction_functor(identity_value{0}, detail::reduce_count_impl{}); }; + +} // namespace cuco diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 92dd5a34d..56776f1cf 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -86,3 +86,8 @@ ConfigureTest(STATIC_MULTIMAP_TEST static_multimap/multiplicity_test.cu static_multimap/non_match_test.cu static_multimap/pair_function_test.cu) + +################################################################################################### +# - static_reduction_map tests -------------------------------------------------------------------- +ConfigureTest(STATIC_REDUCTION_MAP_TEST + static_reduction_map/reduction_functors_test.cu) diff --git a/tests/static_reduction_map/reduction_functors_test.cu b/tests/static_reduction_map/reduction_functors_test.cu new file mode 100644 index 000000000..4e0fffc04 --- /dev/null +++ b/tests/static_reduction_map/reduction_functors_test.cu @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include + +template +__global__ void reduce_kernel(InputIt first, InputIt last, OutputIt out, Func func) { + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto it = first + tid; + + if constexpr (cuda::std::is_base_of_v) { + while (it < last) { + func(*reinterpret_cast*>(thrust::raw_pointer_cast(out)), *it); + it += gridDim.x * blockDim.x; + } + } else { + while (it < last) { + *out = func(*out, *it); + it += gridDim.x * blockDim.x; + } + } +} + +template +void reduce_seq(InputIt first, InputIt last, OutputIt out, Func func) { + reduce_kernel<<<1, 1>>>(first, last, out, func); + cudaDeviceSynchronize(); +} + +template +void reduce_par(InputIt first, InputIt last, OutputIt out, Func func) { + reduce_kernel<<<1, 1024>>>(first, last, out, func); + cudaDeviceSynchronize(); +} + +template +void test_case_impl(Func func, EquivFunc equiv, bool uses_external_sync) { + using Value = typename Func::value_type; + CHECK(cuda::std::is_base_of_v); + CHECK(cuda::std::is_same_v); + CHECK(func.uses_external_sync() == uses_external_sync); + + constexpr std::size_t num_items{100}; + + thrust::device_vector values(num_items); + thrust::sequence(values.begin(), values.end(), 1); + + thrust::device_vector results_d(3, func.identity()); + + reduce_seq(values.begin(), values.end(), results_d.data() + 0, func); + reduce_par(values.begin(), values.end(), results_d.data() + 1, func); + reduce_seq(values.begin(), values.end(), results_d.data() + 2, equiv); + + thrust::host_vector results_h = results_d; + auto sequential_result = results_h[0]; + auto parallel_result = results_h[1]; + auto correct_result = results_h[2]; + + CHECK(sequential_result == correct_result); + CHECK(parallel_result == correct_result); + CHECK(parallel_result == sequential_result); +} + +template +struct custom_plus { + __device__ T operator()(T lhs, T rhs) const noexcept { + return lhs + rhs; + } +}; + +template +struct custom_plus_constref { + __device__ T operator()(T const& lhs, T const& rhs) const noexcept { + return lhs + rhs; + } +}; + +template +struct custom_plus_sync { + template + __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept { + return lhs.fetch_add(rhs) + rhs; + } +}; + +template +struct equiv_count { + __device__ T operator()(T const& lhs, T const& /* rhs */) const noexcept { + return lhs + 1; + } +}; + +TEMPLATE_TEST_CASE_SIG( + "Test '+' reduction functors", + "", + ((typename Value, typename Func, bool UsesExternalSync), Value, Func, UsesExternalSync), + (int32_t, cuco::detail::reduce_add_impl, false), + (int64_t, cuco::detail::reduce_add_impl, false), + (uint32_t, cuco::detail::reduce_add_impl, false), + (uint64_t, cuco::detail::reduce_add_impl, false), + (float, cuco::detail::reduce_add_impl, false), + (double, cuco::detail::reduce_add_impl, false), + (int32_t, thrust::plus, true), + (int32_t, custom_plus_sync, false), + (int32_t, custom_plus, true), + (int32_t, custom_plus_constref, true)) +{ + test_case_impl( + cuco::reduction_functor(cuco::identity_value(0)), + thrust::plus(), + UsesExternalSync); +} + +TEMPLATE_TEST_CASE_SIG( + "Test 'min' reduction functors", + "", + ((typename Value, typename Func, bool UsesExternalSync), Value, Func, UsesExternalSync), + (int32_t, cuco::detail::reduce_min_impl, false), + (int64_t, cuco::detail::reduce_min_impl, false), + (uint32_t, cuco::detail::reduce_min_impl, false), + (uint64_t, cuco::detail::reduce_min_impl, false), + (float, cuco::detail::reduce_min_impl, true), + (double, cuco::detail::reduce_min_impl, true), + (int32_t, thrust::minimum, true)) +{ + test_case_impl( + cuco::reduction_functor(cuco::identity_value(std::numeric_limits::max())), + thrust::minimum(), + UsesExternalSync); +} + +TEMPLATE_TEST_CASE_SIG( + "Test 'max' reduction functors", + "", + ((typename Value, typename Func, bool UsesExternalSync), Value, Func, UsesExternalSync), + (int32_t, cuco::detail::reduce_max_impl, false), + (int64_t, cuco::detail::reduce_max_impl, false), + (uint32_t, cuco::detail::reduce_max_impl, false), + (uint64_t, cuco::detail::reduce_max_impl, false), + (float, cuco::detail::reduce_max_impl, true), + (double, cuco::detail::reduce_max_impl, true), + (int32_t, thrust::maximum, true)) +{ + test_case_impl( + cuco::reduction_functor(cuco::identity_value(std::numeric_limits::min())), + thrust::maximum(), + UsesExternalSync); +} + +TEMPLATE_TEST_CASE_SIG( + "Test 'count' reduction functors", + "", + ((typename Value, typename Func, bool UsesExternalSync), Value, Func, UsesExternalSync), + (int32_t, cuco::detail::reduce_count_impl, false), + (int64_t, cuco::detail::reduce_count_impl, false), + (uint32_t, cuco::detail::reduce_count_impl, false), + (uint64_t, cuco::detail::reduce_count_impl, false)) +{ + test_case_impl( + cuco::reduction_functor(cuco::identity_value(0)), + equiv_count(), + UsesExternalSync); +} + +TEST_CASE( + "Test device lambda reduction functor", + "") +{ + using Value = int; + auto identity = cuco::identity_value(0); + + auto lambda_add = [] __device__ (Value const& lhs, Value const& rhs) noexcept { + return lhs + rhs; + }; + test_case_impl( + cuco::reduction_functor(identity, lambda_add), + thrust::plus(), + true); + + using AtomicValue = cuda::atomic; + auto lambda_add_sync = [] __device__ (AtomicValue& lhs, Value const& rhs) noexcept { + return lhs.fetch_add(rhs) + rhs; + }; + test_case_impl( + cuco::reduction_functor(identity, lambda_add_sync), + thrust::plus(), + false); +} \ No newline at end of file From e4adb05dbb214c4bca49bb9fe6f070f3c4f2a9d3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 11 Jul 2022 13:43:48 +0000 Subject: [PATCH 03/12] [pre-commit.ci] auto code formatting --- .../cuco/detail/reduction_functor_impl.cuh | 60 +++++++----- include/cuco/reduction_functors.cuh | 75 +++++++++++---- .../reduction_functors_test.cu | 96 +++++++++---------- 3 files changed, 138 insertions(+), 93 deletions(-) diff --git a/include/cuco/detail/reduction_functor_impl.cuh b/include/cuco/detail/reduction_functor_impl.cuh index cc771dafb..3128bd01f 100644 --- a/include/cuco/detail/reduction_functor_impl.cuh +++ b/include/cuco/detail/reduction_functor_impl.cuh @@ -29,12 +29,14 @@ namespace detail { * @warning This class should not be used directly. * */ -class reduction_functor_base {}; +class reduction_functor_base { +}; template struct reduce_add_impl { template - __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept { + __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept + { return lhs.fetch_add(rhs) + rhs; } }; @@ -42,7 +44,8 @@ struct reduce_add_impl { template struct reduce_min_impl { template - __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept { + __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept + { return min(lhs.fetch_min(rhs), rhs); } }; @@ -50,7 +53,8 @@ struct reduce_min_impl { template struct reduce_max_impl { template - __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept { + __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept + { return max(lhs.fetch_max(rhs), rhs); } }; @@ -58,7 +62,8 @@ struct reduce_max_impl { template struct reduce_count_impl { template - __device__ T operator()(cuda::atomic& lhs, T const& /* rhs */) const noexcept { + __device__ T operator()(cuda::atomic& lhs, T const& /* rhs */) const noexcept + { return ++lhs; } }; @@ -66,9 +71,12 @@ struct reduce_count_impl { // remove the following WAR once libcu++ extends FP atomics support and fixes signed integer atomics // https://github.com/NVIDIA/libcudacxx/pull/286 template -struct reduce_add_impl::value>::type> { +struct reduce_add_impl< + T, + typename cuda::std::enable_if::value>::type> { template - __device__ T operator()(cuda::atomic& lhs, T rhs) const noexcept { + __device__ T operator()(cuda::atomic& lhs, T rhs) const noexcept + { if constexpr (Scope == cuda::thread_scope_system) return atomicAdd_system(reinterpret_cast(&lhs), rhs) + rhs; else if constexpr (Scope == cuda::thread_scope_device) @@ -79,11 +87,14 @@ struct reduce_add_impl -struct reduce_min_impl::value && cuda::std::is_signed::value>::type> { +struct reduce_min_impl::value && + cuda::std::is_signed::value>::type> { template - __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept { + __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept + { using InternalT = typename cuda::std::conditional::type; - InternalT * ptr = reinterpret_cast(&lhs); + InternalT* ptr = reinterpret_cast(&lhs); InternalT value = rhs; if constexpr (Scope == cuda::thread_scope_system) return min(atomicMin_system(ptr, value), value); @@ -95,11 +106,14 @@ struct reduce_min_impl -struct reduce_max_impl::value && cuda::std::is_signed::value>::type> { +struct reduce_max_impl::value && + cuda::std::is_signed::value>::type> { template - __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept { + __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept + { using InternalT = typename cuda::std::conditional::type; - InternalT * ptr = reinterpret_cast(&lhs); + InternalT* ptr = reinterpret_cast(&lhs); InternalT value = rhs; if constexpr (Scope == cuda::thread_scope_system) return max(atomicMax_system(ptr, value), value); @@ -111,18 +125,18 @@ struct reduce_max_impl -struct reduce_min_impl::value>::type> { - __device__ T operator()(T lhs, T rhs) const noexcept { - return min(lhs, rhs); - } +struct reduce_min_impl< + T, + typename cuda::std::enable_if::value>::type> { + __device__ T operator()(T lhs, T rhs) const noexcept { return min(lhs, rhs); } }; template -struct reduce_max_impl::value>::type> { - __device__ T operator()(T lhs, T rhs) const noexcept { - return max(lhs, rhs); - } +struct reduce_max_impl< + T, + typename cuda::std::enable_if::value>::type> { + __device__ T operator()(T lhs, T rhs) const noexcept { return max(lhs, rhs); } }; -} // namespace detail -} // namespace cuco \ No newline at end of file +} // namespace detail +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/reduction_functors.cuh b/include/cuco/reduction_functors.cuh index 94888f6b2..074adc1f3 100644 --- a/include/cuco/reduction_functors.cuh +++ b/include/cuco/reduction_functors.cuh @@ -36,6 +36,7 @@ class identity_value { using type = T; constexpr identity_value(T const& identity) noexcept : identity_(identity) {} constexpr T value() const noexcept { return identity_; } + private: T identity_; }; @@ -64,13 +65,16 @@ class identity_value { * }; * * int main() { - * cuco::identity_value identity{0}; // define the identity value for the given reduction operation, i.e., op(identity, x) == x + * cuco::identity_value identity{0}; // define the identity value for the given reduction + * operation, i.e., op(identity, x) == x * - * auto f1 = cuco::reduction_functor, int>(identity); // synchronized via CAS-loop - * auto f2 = cuco::reduction_functor, int>(identity); // implicitly synchronized + * auto f1 = cuco::reduction_functor, int>(identity); // synchronized via + * CAS-loop auto f2 = cuco::reduction_functor, int>(identity); // implicitly + * synchronized * * auto custom_plus_lambda = [] __device__ (int lhs, int rhs) noexcept { return lhs + rhs; }; - * auto f3 = cuco::reduction_functor(identity, custom_plus_lambda); + * auto f3 = cuco::reduction_functor(identity, + * custom_plus_lambda); * } * \endcode * @@ -82,10 +86,14 @@ class reduction_functor : detail::reduction_functor_base { public: using value_type = Value; - reduction_functor(cuco::identity_value identity, Func functor = Func{}) noexcept : identity_(identity), functor_(functor) {} + reduction_functor(cuco::identity_value identity, Func functor = Func{}) noexcept + : identity_(identity), functor_(functor) + { + } template - __device__ value_type operator()(cuda::atomic& lhs, value_type const& rhs) const noexcept + __device__ value_type operator()(cuda::atomic& lhs, + value_type const& rhs) const noexcept { if constexpr (uses_external_sync()) { value_type old = lhs.load(cuda::memory_order_relaxed); @@ -93,7 +101,8 @@ class reduction_functor : detail::reduction_functor_base { do { desired = functor_(old, rhs); - } while (!lhs.compare_exchange_weak(old, desired, cuda::memory_order_release, cuda::memory_order_relaxed)); + } while (!lhs.compare_exchange_weak( + old, desired, cuda::memory_order_release, cuda::memory_order_relaxed)); return desired; } else { @@ -101,23 +110,35 @@ class reduction_functor : detail::reduction_functor_base { } } - __host__ __device__ value_type identity() const noexcept { - return identity_.value(); - } + __host__ __device__ value_type identity() const noexcept { return identity_.value(); } - __host__ __device__ static constexpr bool uses_external_sync() noexcept { + __host__ __device__ static constexpr bool uses_external_sync() noexcept + { return !atomic_invocable_ || naive_invocable_; } private: cuco::identity_value identity_; Func functor_; - static constexpr bool naive_invocable_ = std::is_invocable_r::value; + static constexpr bool naive_invocable_ = + std::is_invocable_r::value; static constexpr bool atomic_invocable_ = - std::is_invocable_r&, value_type>::value || - std::is_invocable_r&, value_type>::value || - std::is_invocable_r&, value_type>::value || - std::is_invocable_r&, value_type>::value; + std::is_invocable_r&, + value_type>::value || + std::is_invocable_r&, + value_type>::value || + std::is_invocable_r&, + value_type>::value || + std::is_invocable_r&, + value_type>::value; static_assert(atomic_invocable_ || naive_invocable_, "Invalid operator signature."); }; @@ -128,7 +149,10 @@ class reduction_functor : detail::reduction_functor_base { * @tparam T The value type used for reduction */ template -auto reduce_add() { return reduction_functor(identity_value{0}, detail::reduce_add_impl{}); }; +auto reduce_add() +{ + return reduction_functor(identity_value{0}, detail::reduce_add_impl{}); +}; /** * @brief Synchronized `min` reduction functor. @@ -136,7 +160,11 @@ auto reduce_add() { return reduction_functor(identity_value{0}, detail::reduc * @tparam T The value type used for reduction */ template -auto reduce_min() { return reduction_functor(identity_value{cuda::std::numeric_limits::max()}, detail::reduce_min_impl{}); }; +auto reduce_min() +{ + return reduction_functor(identity_value{cuda::std::numeric_limits::max()}, + detail::reduce_min_impl{}); +}; /** * @brief Synchronized `max` reduction functor. @@ -144,7 +172,11 @@ auto reduce_min() { return reduction_functor(identity_value{cuda::std::numeric_l * @tparam T The value type used for reduction */ template -auto reduce_max() { return reduction_functor(identity_value{cuda::std::numeric_limits::lowest()}, detail::reduce_max_impl{}); }; +auto reduce_max() +{ + return reduction_functor(identity_value{cuda::std::numeric_limits::lowest()}, + detail::reduce_max_impl{}); +}; /** * @brief Synchronized `count` reduction functor. @@ -152,6 +184,9 @@ auto reduce_max() { return reduction_functor(identity_value{cuda::std::numeric_l * @tparam T The value type used for reduction */ template -auto reduce_count() { return reduction_functor(identity_value{0}, detail::reduce_count_impl{}); }; +auto reduce_count() +{ + return reduction_functor(identity_value{0}, detail::reduce_count_impl{}); +}; } // namespace cuco diff --git a/tests/static_reduction_map/reduction_functors_test.cu b/tests/static_reduction_map/reduction_functors_test.cu index 4e0fffc04..20d247e84 100644 --- a/tests/static_reduction_map/reduction_functors_test.cu +++ b/tests/static_reduction_map/reduction_functors_test.cu @@ -21,20 +21,23 @@ #include #include -#include #include +#include #include #include -template -__global__ void reduce_kernel(InputIt first, InputIt last, OutputIt out, Func func) { +template +__global__ void reduce_kernel(InputIt first, InputIt last, OutputIt out, Func func) +{ auto tid = blockDim.x * blockIdx.x + threadIdx.x; - auto it = first + tid; + auto it = first + tid; if constexpr (cuda::std::is_base_of_v) { while (it < last) { - func(*reinterpret_cast*>(thrust::raw_pointer_cast(out)), *it); + func(*reinterpret_cast*>( + thrust::raw_pointer_cast(out)), + *it); it += gridDim.x * blockDim.x; } } else { @@ -45,20 +48,23 @@ __global__ void reduce_kernel(InputIt first, InputIt last, OutputIt out, Func fu } } -template -void reduce_seq(InputIt first, InputIt last, OutputIt out, Func func) { +template +void reduce_seq(InputIt first, InputIt last, OutputIt out, Func func) +{ reduce_kernel<<<1, 1>>>(first, last, out, func); cudaDeviceSynchronize(); } -template -void reduce_par(InputIt first, InputIt last, OutputIt out, Func func) { +template +void reduce_par(InputIt first, InputIt last, OutputIt out, Func func) +{ reduce_kernel<<<1, 1024>>>(first, last, out, func); cudaDeviceSynchronize(); } template -void test_case_impl(Func func, EquivFunc equiv, bool uses_external_sync) { +void test_case_impl(Func func, EquivFunc equiv, bool uses_external_sync) +{ using Value = typename Func::value_type; CHECK(cuda::std::is_base_of_v); CHECK(cuda::std::is_same_v); @@ -76,9 +82,9 @@ void test_case_impl(Func func, EquivFunc equiv, bool uses_external_sync) { reduce_seq(values.begin(), values.end(), results_d.data() + 2, equiv); thrust::host_vector results_h = results_d; - auto sequential_result = results_h[0]; - auto parallel_result = results_h[1]; - auto correct_result = results_h[2]; + auto sequential_result = results_h[0]; + auto parallel_result = results_h[1]; + auto correct_result = results_h[2]; CHECK(sequential_result == correct_result); CHECK(parallel_result == correct_result); @@ -87,31 +93,26 @@ void test_case_impl(Func func, EquivFunc equiv, bool uses_external_sync) { template struct custom_plus { - __device__ T operator()(T lhs, T rhs) const noexcept { - return lhs + rhs; - } + __device__ T operator()(T lhs, T rhs) const noexcept { return lhs + rhs; } }; template struct custom_plus_constref { - __device__ T operator()(T const& lhs, T const& rhs) const noexcept { - return lhs + rhs; - } + __device__ T operator()(T const& lhs, T const& rhs) const noexcept { return lhs + rhs; } }; template struct custom_plus_sync { template - __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept { + __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept + { return lhs.fetch_add(rhs) + rhs; } }; template struct equiv_count { - __device__ T operator()(T const& lhs, T const& /* rhs */) const noexcept { - return lhs + 1; - } + __device__ T operator()(T const& lhs, T const& /* rhs */) const noexcept { return lhs + 1; } }; TEMPLATE_TEST_CASE_SIG( @@ -129,10 +130,9 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, custom_plus, true), (int32_t, custom_plus_constref, true)) { - test_case_impl( - cuco::reduction_functor(cuco::identity_value(0)), - thrust::plus(), - UsesExternalSync); + test_case_impl(cuco::reduction_functor(cuco::identity_value(0)), + thrust::plus(), + UsesExternalSync); } TEMPLATE_TEST_CASE_SIG( @@ -147,10 +147,10 @@ TEMPLATE_TEST_CASE_SIG( (double, cuco::detail::reduce_min_impl, true), (int32_t, thrust::minimum, true)) { - test_case_impl( - cuco::reduction_functor(cuco::identity_value(std::numeric_limits::max())), - thrust::minimum(), - UsesExternalSync); + test_case_impl(cuco::reduction_functor( + cuco::identity_value(std::numeric_limits::max())), + thrust::minimum(), + UsesExternalSync); } TEMPLATE_TEST_CASE_SIG( @@ -165,10 +165,10 @@ TEMPLATE_TEST_CASE_SIG( (double, cuco::detail::reduce_max_impl, true), (int32_t, thrust::maximum, true)) { - test_case_impl( - cuco::reduction_functor(cuco::identity_value(std::numeric_limits::min())), - thrust::maximum(), - UsesExternalSync); + test_case_impl(cuco::reduction_functor( + cuco::identity_value(std::numeric_limits::min())), + thrust::maximum(), + UsesExternalSync); } TEMPLATE_TEST_CASE_SIG( @@ -180,29 +180,25 @@ TEMPLATE_TEST_CASE_SIG( (uint32_t, cuco::detail::reduce_count_impl, false), (uint64_t, cuco::detail::reduce_count_impl, false)) { - test_case_impl( - cuco::reduction_functor(cuco::identity_value(0)), - equiv_count(), - UsesExternalSync); + test_case_impl(cuco::reduction_functor(cuco::identity_value(0)), + equiv_count(), + UsesExternalSync); } -TEST_CASE( - "Test device lambda reduction functor", - "") +TEST_CASE("Test device lambda reduction functor", "") { - using Value = int; + using Value = int; auto identity = cuco::identity_value(0); - auto lambda_add = [] __device__ (Value const& lhs, Value const& rhs) noexcept { + auto lambda_add = [] __device__(Value const& lhs, Value const& rhs) noexcept { return lhs + rhs; }; - test_case_impl( - cuco::reduction_functor(identity, lambda_add), - thrust::plus(), - true); + test_case_impl(cuco::reduction_functor(identity, lambda_add), + thrust::plus(), + true); - using AtomicValue = cuda::atomic; - auto lambda_add_sync = [] __device__ (AtomicValue& lhs, Value const& rhs) noexcept { + using AtomicValue = cuda::atomic; + auto lambda_add_sync = [] __device__(AtomicValue & lhs, Value const& rhs) noexcept { return lhs.fetch_add(rhs) + rhs; }; test_case_impl( From bb78c499baf998f94a479170e0941d8586414118 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 11 Jul 2022 16:09:09 +0000 Subject: [PATCH 04/12] Prevent extended __host__ __device__/__device__ lambdas from being used as reduction functors. --- include/cuco/reduction_functors.cuh | 11 ++++---- .../reduction_functors_test.cu | 27 +++---------------- 2 files changed, 9 insertions(+), 29 deletions(-) diff --git a/include/cuco/reduction_functors.cuh b/include/cuco/reduction_functors.cuh index 074adc1f3..52df79b29 100644 --- a/include/cuco/reduction_functors.cuh +++ b/include/cuco/reduction_functors.cuh @@ -71,10 +71,6 @@ class identity_value { * auto f1 = cuco::reduction_functor, int>(identity); // synchronized via * CAS-loop auto f2 = cuco::reduction_functor, int>(identity); // implicitly * synchronized - * - * auto custom_plus_lambda = [] __device__ (int lhs, int rhs) noexcept { return lhs + rhs; }; - * auto f3 = cuco::reduction_functor(identity, - * custom_plus_lambda); * } * \endcode * @@ -140,7 +136,12 @@ class reduction_functor : detail::reduction_functor_base { cuda::atomic&, value_type>::value; - static_assert(atomic_invocable_ || naive_invocable_, "Invalid operator signature."); + static_assert(atomic_invocable_ || naive_invocable_, + "Invalid operator signature. Valid signatures are " + "(T const&, T const&)->T and (cuda::atomic&, T const&)->T."); + static_assert(!(__nv_is_extended_device_lambda_closure_type(Func) || __nv_is_extended_host_device_lambda_closure_type(Func)), + "Extended __device__/__host__ __device__ lambdas are not supported." + " Use a named function object instead."); }; /** diff --git a/tests/static_reduction_map/reduction_functors_test.cu b/tests/static_reduction_map/reduction_functors_test.cu index 20d247e84..55976e372 100644 --- a/tests/static_reduction_map/reduction_functors_test.cu +++ b/tests/static_reduction_map/reduction_functors_test.cu @@ -180,29 +180,8 @@ TEMPLATE_TEST_CASE_SIG( (uint32_t, cuco::detail::reduce_count_impl, false), (uint64_t, cuco::detail::reduce_count_impl, false)) { - test_case_impl(cuco::reduction_functor(cuco::identity_value(0)), - equiv_count(), - UsesExternalSync); -} - -TEST_CASE("Test device lambda reduction functor", "") -{ - using Value = int; - auto identity = cuco::identity_value(0); - - auto lambda_add = [] __device__(Value const& lhs, Value const& rhs) noexcept { - return lhs + rhs; - }; - test_case_impl(cuco::reduction_functor(identity, lambda_add), - thrust::plus(), - true); - - using AtomicValue = cuda::atomic; - auto lambda_add_sync = [] __device__(AtomicValue & lhs, Value const& rhs) noexcept { - return lhs.fetch_add(rhs) + rhs; - }; test_case_impl( - cuco::reduction_functor(identity, lambda_add_sync), - thrust::plus(), - false); + cuco::reduction_functor(cuco::identity_value(0)), + equiv_count(), + UsesExternalSync); } \ No newline at end of file From 905edd843a02c003706661fdacbe354c61ae7a46 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 11 Jul 2022 16:11:11 +0000 Subject: [PATCH 05/12] Remove spurious include. --- include/cuco/detail/reduction_functor_impl.cuh | 1 - 1 file changed, 1 deletion(-) diff --git a/include/cuco/detail/reduction_functor_impl.cuh b/include/cuco/detail/reduction_functor_impl.cuh index 3128bd01f..304158bb1 100644 --- a/include/cuco/detail/reduction_functor_impl.cuh +++ b/include/cuco/detail/reduction_functor_impl.cuh @@ -16,7 +16,6 @@ #pragma once -#include #include #include From 7d3aff327c2a11123903774dd928acee89064d49 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 11 Jul 2022 16:40:03 +0000 Subject: [PATCH 06/12] [pre-commit.ci] auto code formatting --- include/cuco/reduction_functors.cuh | 7 ++++--- tests/static_reduction_map/reduction_functors_test.cu | 7 +++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/cuco/reduction_functors.cuh b/include/cuco/reduction_functors.cuh index 52df79b29..a15ec5bab 100644 --- a/include/cuco/reduction_functors.cuh +++ b/include/cuco/reduction_functors.cuh @@ -137,9 +137,10 @@ class reduction_functor : detail::reduction_functor_base { value_type>::value; static_assert(atomic_invocable_ || naive_invocable_, - "Invalid operator signature. Valid signatures are " - "(T const&, T const&)->T and (cuda::atomic&, T const&)->T."); - static_assert(!(__nv_is_extended_device_lambda_closure_type(Func) || __nv_is_extended_host_device_lambda_closure_type(Func)), + "Invalid operator signature. Valid signatures are " + "(T const&, T const&)->T and (cuda::atomic&, T const&)->T."); + static_assert(!(__nv_is_extended_device_lambda_closure_type(Func) || + __nv_is_extended_host_device_lambda_closure_type(Func)), "Extended __device__/__host__ __device__ lambdas are not supported." " Use a named function object instead."); }; diff --git a/tests/static_reduction_map/reduction_functors_test.cu b/tests/static_reduction_map/reduction_functors_test.cu index 55976e372..bd59b1164 100644 --- a/tests/static_reduction_map/reduction_functors_test.cu +++ b/tests/static_reduction_map/reduction_functors_test.cu @@ -180,8 +180,7 @@ TEMPLATE_TEST_CASE_SIG( (uint32_t, cuco::detail::reduce_count_impl, false), (uint64_t, cuco::detail::reduce_count_impl, false)) { - test_case_impl( - cuco::reduction_functor(cuco::identity_value(0)), - equiv_count(), - UsesExternalSync); + test_case_impl(cuco::reduction_functor(cuco::identity_value(0)), + equiv_count(), + UsesExternalSync); } \ No newline at end of file From 9cbe8906af2c465700f4d299f99d4ddde8ed8bc0 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 12 Jul 2022 10:04:23 +0000 Subject: [PATCH 07/12] Workaround for compiler warning where nvcc is not able to see the return statement in an `if constexpr else` statement. --- .../cuco/detail/reduction_functor_impl.cuh | 102 +++++++++++++----- include/cuco/reduction_functors.cuh | 15 +-- 2 files changed, 81 insertions(+), 36 deletions(-) diff --git a/include/cuco/detail/reduction_functor_impl.cuh b/include/cuco/detail/reduction_functor_impl.cuh index 304158bb1..6107f92f4 100644 --- a/include/cuco/detail/reduction_functor_impl.cuh +++ b/include/cuco/detail/reduction_functor_impl.cuh @@ -73,15 +73,27 @@ template struct reduce_add_impl< T, typename cuda::std::enable_if::value>::type> { - template + template = true> + __device__ T operator()(cuda::atomic& lhs, T rhs) const noexcept + { + return atomicAdd_system(reinterpret_cast(&lhs), rhs) + rhs; + } + + template = true> + __device__ T operator()(cuda::atomic& lhs, T rhs) const noexcept + { + return atomicAdd(reinterpret_cast(&lhs), rhs) + rhs; + } + + template < + cuda::thread_scope Scope, + cuda::std::enable_if_t = true> __device__ T operator()(cuda::atomic& lhs, T rhs) const noexcept { - if constexpr (Scope == cuda::thread_scope_system) - return atomicAdd_system(reinterpret_cast(&lhs), rhs) + rhs; - else if constexpr (Scope == cuda::thread_scope_device) - return atomicAdd(reinterpret_cast(&lhs), rhs) + rhs; - else - return atomicAdd_block(reinterpret_cast(&lhs), rhs) + rhs; + return atomicAdd_block(reinterpret_cast(&lhs), rhs) + rhs; } }; @@ -89,18 +101,34 @@ template struct reduce_min_impl::value && cuda::std::is_signed::value>::type> { - template - __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept + private: + using internal_type = typename cuda::std::conditional::type; + + public: + template = true> + __device__ T operator()(cuda::atomic& lhs, T rhs) const noexcept + { + return min(atomicMin_system(reinterpret_cast(&lhs), rhs), + static_cast(rhs)); + } + + template = true> + __device__ T operator()(cuda::atomic& lhs, T rhs) const noexcept { - using InternalT = typename cuda::std::conditional::type; - InternalT* ptr = reinterpret_cast(&lhs); - InternalT value = rhs; - if constexpr (Scope == cuda::thread_scope_system) - return min(atomicMin_system(ptr, value), value); - else if constexpr (Scope == cuda::thread_scope_device) - return min(atomicMin(ptr, value), value); - else - return min(atomicMin_block(ptr, value), value); + return min(atomicMin(reinterpret_cast(&lhs), rhs), + static_cast(rhs)); + } + + template < + cuda::thread_scope Scope, + cuda::std::enable_if_t = true> + __device__ T operator()(cuda::atomic& lhs, T rhs) const noexcept + { + return min(atomicMin_block(reinterpret_cast(&lhs), rhs), + static_cast(rhs)); } }; @@ -108,18 +136,34 @@ template struct reduce_max_impl::value && cuda::std::is_signed::value>::type> { - template - __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept + private: + using internal_type = typename cuda::std::conditional::type; + + public: + template = true> + __device__ T operator()(cuda::atomic& lhs, T rhs) const noexcept + { + return max(atomicMax_system(reinterpret_cast(&lhs), rhs), + static_cast(rhs)); + } + + template = true> + __device__ T operator()(cuda::atomic& lhs, T rhs) const noexcept + { + return max(atomicMax(reinterpret_cast(&lhs), rhs), + static_cast(rhs)); + } + + template < + cuda::thread_scope Scope, + cuda::std::enable_if_t = true> + __device__ T operator()(cuda::atomic& lhs, T rhs) const noexcept { - using InternalT = typename cuda::std::conditional::type; - InternalT* ptr = reinterpret_cast(&lhs); - InternalT value = rhs; - if constexpr (Scope == cuda::thread_scope_system) - return max(atomicMax_system(ptr, value), value); - else if constexpr (Scope == cuda::thread_scope_device) - return max(atomicMax(ptr, value), value); - else - return max(atomicMax_block(ptr, value), value); + return max(atomicMax_block(reinterpret_cast(&lhs), rhs), + static_cast(rhs)); } }; diff --git a/include/cuco/reduction_functors.cuh b/include/cuco/reduction_functors.cuh index a15ec5bab..13478a312 100644 --- a/include/cuco/reduction_functors.cuh +++ b/include/cuco/reduction_functors.cuh @@ -65,12 +65,14 @@ class identity_value { * }; * * int main() { - * cuco::identity_value identity{0}; // define the identity value for the given reduction - * operation, i.e., op(identity, x) == x + * // define the identity value for the given reduction operation, + // i.e., op(identity, x) == x + * cuco::identity_value identity{0}; * - * auto f1 = cuco::reduction_functor, int>(identity); // synchronized via - * CAS-loop auto f2 = cuco::reduction_functor, int>(identity); // implicitly - * synchronized + * // synchronized via CAS loop + * auto f1 = cuco::reduction_functor, int>(identity); + * // implicitly synchronized + * auto f2 = cuco::reduction_functor, int>(identity); * } * \endcode * @@ -101,9 +103,8 @@ class reduction_functor : detail::reduction_functor_base { old, desired, cuda::memory_order_release, cuda::memory_order_relaxed)); return desired; - } else { - return functor_(lhs, rhs); } + if constexpr (!uses_external_sync()) { return functor_(lhs, rhs); } } __host__ __device__ value_type identity() const noexcept { return identity_.value(); } From cef8906f864332bcfc4078c5b60080e37d68cafb Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 12 Jul 2022 15:02:04 +0000 Subject: [PATCH 08/12] Fix includes und use type trait aliases. --- .../cuco/detail/reduction_functor_impl.cuh | 30 ++++++-------- include/cuco/reduction_functors.cuh | 39 +++++++++---------- .../reduction_functors_test.cu | 9 +++-- 3 files changed, 36 insertions(+), 42 deletions(-) diff --git a/include/cuco/detail/reduction_functor_impl.cuh b/include/cuco/detail/reduction_functor_impl.cuh index 6107f92f4..9547600e1 100644 --- a/include/cuco/detail/reduction_functor_impl.cuh +++ b/include/cuco/detail/reduction_functor_impl.cuh @@ -17,7 +17,7 @@ #pragma once #include -#include +#include namespace cuco { namespace detail { @@ -70,9 +70,7 @@ struct reduce_count_impl { // remove the following WAR once libcu++ extends FP atomics support and fixes signed integer atomics // https://github.com/NVIDIA/libcudacxx/pull/286 template -struct reduce_add_impl< - T, - typename cuda::std::enable_if::value>::type> { +struct reduce_add_impl>> { template = true> __device__ T operator()(cuda::atomic& lhs, T rhs) const noexcept @@ -98,11 +96,11 @@ struct reduce_add_impl< }; template -struct reduce_min_impl::value && - cuda::std::is_signed::value>::type> { +struct reduce_min_impl< + T, + typename cuda::std::enable_if_t && cuda::std::is_signed_v>> { private: - using internal_type = typename cuda::std::conditional::type; + using internal_type = typename cuda::std::conditional_t; public: template -struct reduce_max_impl::value && - cuda::std::is_signed::value>::type> { +struct reduce_max_impl< + T, + typename cuda::std::enable_if_t && cuda::std::is_signed_v>> { private: - using internal_type = typename cuda::std::conditional::type; + using internal_type = typename cuda::std::conditional_t; public: template -struct reduce_min_impl< - T, - typename cuda::std::enable_if::value>::type> { +struct reduce_min_impl>> { __device__ T operator()(T lhs, T rhs) const noexcept { return min(lhs, rhs); } }; template -struct reduce_max_impl< - T, - typename cuda::std::enable_if::value>::type> { +struct reduce_max_impl>> { __device__ T operator()(T lhs, T rhs) const noexcept { return max(lhs, rhs); } }; diff --git a/include/cuco/reduction_functors.cuh b/include/cuco/reduction_functors.cuh index 13478a312..6cf29ae23 100644 --- a/include/cuco/reduction_functors.cuh +++ b/include/cuco/reduction_functors.cuh @@ -16,12 +16,11 @@ #pragma once -#include #include #include -#include -#include +#include +#include namespace cuco { @@ -118,24 +117,24 @@ class reduction_functor : detail::reduction_functor_base { cuco::identity_value identity_; Func functor_; static constexpr bool naive_invocable_ = - std::is_invocable_r::value; + cuda::std::is_invocable_r_v; static constexpr bool atomic_invocable_ = - std::is_invocable_r&, - value_type>::value || - std::is_invocable_r&, - value_type>::value || - std::is_invocable_r&, - value_type>::value || - std::is_invocable_r&, - value_type>::value; + cuda::std::is_invocable_r_v&, + value_type> || + cuda::std::is_invocable_r_v&, + value_type> || + cuda::std::is_invocable_r_v&, + value_type> || + cuda::std::is_invocable_r_v&, + value_type>; static_assert(atomic_invocable_ || naive_invocable_, "Invalid operator signature. Valid signatures are " diff --git a/tests/static_reduction_map/reduction_functors_test.cu b/tests/static_reduction_map/reduction_functors_test.cu index bd59b1164..d2e4d2e3e 100644 --- a/tests/static_reduction_map/reduction_functors_test.cu +++ b/tests/static_reduction_map/reduction_functors_test.cu @@ -14,8 +14,6 @@ * limitations under the License. */ -#include -#include #include #include @@ -27,6 +25,9 @@ #include +#include +#include + template __global__ void reduce_kernel(InputIt first, InputIt last, OutputIt out, Func func) { @@ -148,7 +149,7 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, thrust::minimum, true)) { test_case_impl(cuco::reduction_functor( - cuco::identity_value(std::numeric_limits::max())), + cuco::identity_value(cuda::std::numeric_limits::max())), thrust::minimum(), UsesExternalSync); } @@ -166,7 +167,7 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, thrust::maximum, true)) { test_case_impl(cuco::reduction_functor( - cuco::identity_value(std::numeric_limits::min())), + cuco::identity_value(cuda::std::numeric_limits::min())), thrust::maximum(), UsesExternalSync); } From 80c15441092dff16d2ab7e628725b037ed14ba1e Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 12 Jul 2022 15:21:37 +0000 Subject: [PATCH 09/12] Reduction ops should use relaxed memory order. --- include/cuco/detail/reduction_functor_impl.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/cuco/detail/reduction_functor_impl.cuh b/include/cuco/detail/reduction_functor_impl.cuh index 9547600e1..782a4a8bd 100644 --- a/include/cuco/detail/reduction_functor_impl.cuh +++ b/include/cuco/detail/reduction_functor_impl.cuh @@ -36,7 +36,7 @@ struct reduce_add_impl { template __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept { - return lhs.fetch_add(rhs) + rhs; + return lhs.fetch_add(rhs, cuda::memory_order_relaxed) + rhs; } }; @@ -45,7 +45,7 @@ struct reduce_min_impl { template __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept { - return min(lhs.fetch_min(rhs), rhs); + return min(lhs.fetch_min(rhs, cuda::memory_order_relaxed), rhs); } }; @@ -54,7 +54,7 @@ struct reduce_max_impl { template __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept { - return max(lhs.fetch_max(rhs), rhs); + return max(lhs.fetch_max(rhs, cuda::memory_order_relaxed), rhs); } }; @@ -63,7 +63,7 @@ struct reduce_count_impl { template __device__ T operator()(cuda::atomic& lhs, T const& /* rhs */) const noexcept { - return ++lhs; + return lhs.fetch_add(1, cuda::memory_order_relaxed) + 1; } }; From 6fc5ff28515f6cb56356110290c13a990a799230 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 12 Jul 2022 15:22:17 +0000 Subject: [PATCH 10/12] Make identity_value ctor explicit. --- include/cuco/reduction_functors.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/reduction_functors.cuh b/include/cuco/reduction_functors.cuh index 6cf29ae23..0054e8ad5 100644 --- a/include/cuco/reduction_functors.cuh +++ b/include/cuco/reduction_functors.cuh @@ -33,7 +33,7 @@ template class identity_value { public: using type = T; - constexpr identity_value(T const& identity) noexcept : identity_(identity) {} + explicit constexpr identity_value(T const& identity) noexcept : identity_(identity) {} constexpr T value() const noexcept { return identity_; } private: From 99945f49a24b267f7a2c45901e045135aeeef692 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 1 Aug 2022 10:01:18 +0000 Subject: [PATCH 11/12] Internally-synced reduction functors cannot have immutable target object. --- include/cuco/reduction_functors.cuh | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/include/cuco/reduction_functors.cuh b/include/cuco/reduction_functors.cuh index 0054e8ad5..5486b1b3d 100644 --- a/include/cuco/reduction_functors.cuh +++ b/include/cuco/reduction_functors.cuh @@ -135,8 +135,25 @@ class reduction_functor : detail::reduction_functor_base { Func, cuda::atomic&, value_type>; + static constexpr bool atomic_const_invocable_ = + cuda::std::is_invocable_r_v const&, + value_type> || + cuda::std::is_invocable_r_v const&, + value_type> || + cuda::std::is_invocable_r_v const&, + value_type> || + cuda::std::is_invocable_r_v const&, + value_type>; - static_assert(atomic_invocable_ || naive_invocable_, + static_assert((atomic_invocable_ && !atomic_const_invocable_) || naive_invocable_, "Invalid operator signature. Valid signatures are " "(T const&, T const&)->T and (cuda::atomic&, T const&)->T."); static_assert(!(__nv_is_extended_device_lambda_closure_type(Func) || From 4435d05350a87f4bfd581e053c843f5ca1f63a2a Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 1 Aug 2022 10:05:33 +0000 Subject: [PATCH 12/12] Use human-readable boolean operators. --- include/cuco/reduction_functors.cuh | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/include/cuco/reduction_functors.cuh b/include/cuco/reduction_functors.cuh index 5486b1b3d..892b6e83a 100644 --- a/include/cuco/reduction_functors.cuh +++ b/include/cuco/reduction_functors.cuh @@ -98,39 +98,40 @@ class reduction_functor : detail::reduction_functor_base { do { desired = functor_(old, rhs); - } while (!lhs.compare_exchange_weak( + } while (not lhs.compare_exchange_weak( old, desired, cuda::memory_order_release, cuda::memory_order_relaxed)); return desired; } - if constexpr (!uses_external_sync()) { return functor_(lhs, rhs); } + if constexpr (not uses_external_sync()) { return functor_(lhs, rhs); } } __host__ __device__ value_type identity() const noexcept { return identity_.value(); } __host__ __device__ static constexpr bool uses_external_sync() noexcept { - return !atomic_invocable_ || naive_invocable_; + return not atomic_invocable_ or naive_invocable_; } private: cuco::identity_value identity_; Func functor_; + static constexpr bool naive_invocable_ = cuda::std::is_invocable_r_v; static constexpr bool atomic_invocable_ = cuda::std::is_invocable_r_v&, - value_type> || + value_type> or cuda::std::is_invocable_r_v&, - value_type> || + value_type> or cuda::std::is_invocable_r_v&, - value_type> || + value_type> or cuda::std::is_invocable_r_v&, @@ -139,24 +140,24 @@ class reduction_functor : detail::reduction_functor_base { cuda::std::is_invocable_r_v const&, - value_type> || + value_type> or cuda::std::is_invocable_r_v const&, - value_type> || + value_type> or cuda::std::is_invocable_r_v const&, - value_type> || + value_type> or cuda::std::is_invocable_r_v const&, value_type>; - static_assert((atomic_invocable_ && !atomic_const_invocable_) || naive_invocable_, + static_assert((atomic_invocable_ and not atomic_const_invocable_) or naive_invocable_, "Invalid operator signature. Valid signatures are " "(T const&, T const&)->T and (cuda::atomic&, T const&)->T."); - static_assert(!(__nv_is_extended_device_lambda_closure_type(Func) || + static_assert(!(__nv_is_extended_device_lambda_closure_type(Func) or __nv_is_extended_host_device_lambda_closure_type(Func)), "Extended __device__/__host__ __device__ lambdas are not supported." " Use a named function object instead.");