diff --git a/include/cuco/detail/reduction_functor_impl.cuh b/include/cuco/detail/reduction_functor_impl.cuh new file mode 100644 index 000000000..782a4a8bd --- /dev/null +++ b/include/cuco/detail/reduction_functor_impl.cuh @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cuco { +namespace detail { + +/** + * @brief Base class of all reduction functors. + * + * @warning This class should not be used directly. + * + */ +class reduction_functor_base { +}; + +template +struct reduce_add_impl { + template + __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept + { + return lhs.fetch_add(rhs, cuda::memory_order_relaxed) + rhs; + } +}; + +template +struct reduce_min_impl { + template + __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept + { + return min(lhs.fetch_min(rhs, cuda::memory_order_relaxed), rhs); + } +}; + +template +struct reduce_max_impl { + template + __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept + { + return max(lhs.fetch_max(rhs, cuda::memory_order_relaxed), rhs); + } +}; + +template +struct reduce_count_impl { + template + __device__ T operator()(cuda::atomic& lhs, T const& /* rhs */) const noexcept + { + return lhs.fetch_add(1, cuda::memory_order_relaxed) + 1; + } +}; + +// remove the following WAR once libcu++ extends FP atomics support and fixes signed integer atomics +// https://github.com/NVIDIA/libcudacxx/pull/286 +template +struct reduce_add_impl>> { + template = true> + __device__ T operator()(cuda::atomic& lhs, T rhs) const noexcept + { + return atomicAdd_system(reinterpret_cast(&lhs), rhs) + rhs; + } + + template = true> + __device__ T operator()(cuda::atomic& lhs, T rhs) const noexcept + { + return atomicAdd(reinterpret_cast(&lhs), rhs) + rhs; + } + + template < + cuda::thread_scope Scope, + cuda::std::enable_if_t = true> + __device__ T operator()(cuda::atomic& lhs, T rhs) const noexcept + { + return atomicAdd_block(reinterpret_cast(&lhs), rhs) + rhs; + } +}; + +template +struct reduce_min_impl< + T, + typename cuda::std::enable_if_t && cuda::std::is_signed_v>> { + private: + using internal_type = typename cuda::std::conditional_t; + + public: + template = true> + __device__ T operator()(cuda::atomic& lhs, T rhs) const noexcept + { + return min(atomicMin_system(reinterpret_cast(&lhs), rhs), + static_cast(rhs)); + } + + template = true> + __device__ T operator()(cuda::atomic& lhs, T rhs) const noexcept + { + return min(atomicMin(reinterpret_cast(&lhs), rhs), + static_cast(rhs)); + } + + template < + cuda::thread_scope Scope, + cuda::std::enable_if_t = true> + __device__ T operator()(cuda::atomic& lhs, T rhs) const noexcept + { + return min(atomicMin_block(reinterpret_cast(&lhs), rhs), + static_cast(rhs)); + } +}; + +template +struct reduce_max_impl< + T, + typename cuda::std::enable_if_t && cuda::std::is_signed_v>> { + private: + using internal_type = typename cuda::std::conditional_t; + + public: + template = true> + __device__ T operator()(cuda::atomic& lhs, T rhs) const noexcept + { + return max(atomicMax_system(reinterpret_cast(&lhs), rhs), + static_cast(rhs)); + } + + template = true> + __device__ T operator()(cuda::atomic& lhs, T rhs) const noexcept + { + return max(atomicMax(reinterpret_cast(&lhs), rhs), + static_cast(rhs)); + } + + template < + cuda::thread_scope Scope, + cuda::std::enable_if_t = true> + __device__ T operator()(cuda::atomic& lhs, T rhs) const noexcept + { + return max(atomicMax_block(reinterpret_cast(&lhs), rhs), + static_cast(rhs)); + } +}; + +template +struct reduce_min_impl>> { + __device__ T operator()(T lhs, T rhs) const noexcept { return min(lhs, rhs); } +}; + +template +struct reduce_max_impl>> { + __device__ T operator()(T lhs, T rhs) const noexcept { return max(lhs, rhs); } +}; + +} // namespace detail +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/reduction_functors.cuh b/include/cuco/reduction_functors.cuh new file mode 100644 index 000000000..892b6e83a --- /dev/null +++ b/include/cuco/reduction_functors.cuh @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +namespace cuco { + +/** + * @brief Wrapper for reduction identity value. + * + * @tparam T The underlying value type used for reduction + */ +template +class identity_value { + public: + using type = T; + explicit constexpr identity_value(T const& identity) noexcept : identity_(identity) {} + constexpr T value() const noexcept { return identity_; } + + private: + T identity_; +}; + +/** + * @brief Wrapper for a user-defined custom reduction operator. + * + * External synchronization, if required, + * is established via an atomic compare-and-swap loop. + * + * Example: + * \code{.cpp} + * template + * struct custom_plus { + * __device__ T operator()(T const& lhs, T const& rhs) const noexcept { + * return lhs + rhs; + * } + * }; + * + * template + * struct custom_plus_sync { + * template + * __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept { + * return lhs.fetch_add(rhs) + rhs; + * } + * }; + * + * int main() { + * // define the identity value for the given reduction operation, + // i.e., op(identity, x) == x + * cuco::identity_value identity{0}; + * + * // synchronized via CAS loop + * auto f1 = cuco::reduction_functor, int>(identity); + * // implicitly synchronized + * auto f2 = cuco::reduction_functor, int>(identity); + * } + * \endcode + * + * @tparam Func The user-defined reduction functor + * @tparam Value The value type used for reduction + */ +template +class reduction_functor : detail::reduction_functor_base { + public: + using value_type = Value; + + reduction_functor(cuco::identity_value identity, Func functor = Func{}) noexcept + : identity_(identity), functor_(functor) + { + } + + template + __device__ value_type operator()(cuda::atomic& lhs, + value_type const& rhs) const noexcept + { + if constexpr (uses_external_sync()) { + value_type old = lhs.load(cuda::memory_order_relaxed); + value_type desired; + + do { + desired = functor_(old, rhs); + } while (not lhs.compare_exchange_weak( + old, desired, cuda::memory_order_release, cuda::memory_order_relaxed)); + + return desired; + } + if constexpr (not uses_external_sync()) { return functor_(lhs, rhs); } + } + + __host__ __device__ value_type identity() const noexcept { return identity_.value(); } + + __host__ __device__ static constexpr bool uses_external_sync() noexcept + { + return not atomic_invocable_ or naive_invocable_; + } + + private: + cuco::identity_value identity_; + Func functor_; + + static constexpr bool naive_invocable_ = + cuda::std::is_invocable_r_v; + static constexpr bool atomic_invocable_ = + cuda::std::is_invocable_r_v&, + value_type> or + cuda::std::is_invocable_r_v&, + value_type> or + cuda::std::is_invocable_r_v&, + value_type> or + cuda::std::is_invocable_r_v&, + value_type>; + static constexpr bool atomic_const_invocable_ = + cuda::std::is_invocable_r_v const&, + value_type> or + cuda::std::is_invocable_r_v const&, + value_type> or + cuda::std::is_invocable_r_v const&, + value_type> or + cuda::std::is_invocable_r_v const&, + value_type>; + + static_assert((atomic_invocable_ and not atomic_const_invocable_) or naive_invocable_, + "Invalid operator signature. Valid signatures are " + "(T const&, T const&)->T and (cuda::atomic&, T const&)->T."); + static_assert(!(__nv_is_extended_device_lambda_closure_type(Func) or + __nv_is_extended_host_device_lambda_closure_type(Func)), + "Extended __device__/__host__ __device__ lambdas are not supported." + " Use a named function object instead."); +}; + +/** + * @brief Synchronized `+` reduction functor. + * + * @tparam T The value type used for reduction + */ +template +auto reduce_add() +{ + return reduction_functor(identity_value{0}, detail::reduce_add_impl{}); +}; + +/** + * @brief Synchronized `min` reduction functor. + * + * @tparam T The value type used for reduction + */ +template +auto reduce_min() +{ + return reduction_functor(identity_value{cuda::std::numeric_limits::max()}, + detail::reduce_min_impl{}); +}; + +/** + * @brief Synchronized `max` reduction functor. + * + * @tparam T The value type used for reduction + */ +template +auto reduce_max() +{ + return reduction_functor(identity_value{cuda::std::numeric_limits::lowest()}, + detail::reduce_max_impl{}); +}; + +/** + * @brief Synchronized `count` reduction functor. + * + * @tparam T The value type used for reduction + */ +template +auto reduce_count() +{ + return reduction_functor(identity_value{0}, detail::reduce_count_impl{}); +}; + +} // namespace cuco diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index dec5170c4..56776f1cf 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -23,7 +23,7 @@ include(CTest) CPMAddPackage( NAME Catch2 GITHUB_REPOSITORY catchorg/Catch2 - VERSION 2.11.1 + VERSION 2.13.9 ) if(Catch2_ADDED) @@ -86,3 +86,8 @@ ConfigureTest(STATIC_MULTIMAP_TEST static_multimap/multiplicity_test.cu static_multimap/non_match_test.cu static_multimap/pair_function_test.cu) + +################################################################################################### +# - static_reduction_map tests -------------------------------------------------------------------- +ConfigureTest(STATIC_REDUCTION_MAP_TEST + static_reduction_map/reduction_functors_test.cu) diff --git a/tests/static_reduction_map/reduction_functors_test.cu b/tests/static_reduction_map/reduction_functors_test.cu new file mode 100644 index 000000000..d2e4d2e3e --- /dev/null +++ b/tests/static_reduction_map/reduction_functors_test.cu @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include + +#include + +#include +#include + +template +__global__ void reduce_kernel(InputIt first, InputIt last, OutputIt out, Func func) +{ + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto it = first + tid; + + if constexpr (cuda::std::is_base_of_v) { + while (it < last) { + func(*reinterpret_cast*>( + thrust::raw_pointer_cast(out)), + *it); + it += gridDim.x * blockDim.x; + } + } else { + while (it < last) { + *out = func(*out, *it); + it += gridDim.x * blockDim.x; + } + } +} + +template +void reduce_seq(InputIt first, InputIt last, OutputIt out, Func func) +{ + reduce_kernel<<<1, 1>>>(first, last, out, func); + cudaDeviceSynchronize(); +} + +template +void reduce_par(InputIt first, InputIt last, OutputIt out, Func func) +{ + reduce_kernel<<<1, 1024>>>(first, last, out, func); + cudaDeviceSynchronize(); +} + +template +void test_case_impl(Func func, EquivFunc equiv, bool uses_external_sync) +{ + using Value = typename Func::value_type; + CHECK(cuda::std::is_base_of_v); + CHECK(cuda::std::is_same_v); + CHECK(func.uses_external_sync() == uses_external_sync); + + constexpr std::size_t num_items{100}; + + thrust::device_vector values(num_items); + thrust::sequence(values.begin(), values.end(), 1); + + thrust::device_vector results_d(3, func.identity()); + + reduce_seq(values.begin(), values.end(), results_d.data() + 0, func); + reduce_par(values.begin(), values.end(), results_d.data() + 1, func); + reduce_seq(values.begin(), values.end(), results_d.data() + 2, equiv); + + thrust::host_vector results_h = results_d; + auto sequential_result = results_h[0]; + auto parallel_result = results_h[1]; + auto correct_result = results_h[2]; + + CHECK(sequential_result == correct_result); + CHECK(parallel_result == correct_result); + CHECK(parallel_result == sequential_result); +} + +template +struct custom_plus { + __device__ T operator()(T lhs, T rhs) const noexcept { return lhs + rhs; } +}; + +template +struct custom_plus_constref { + __device__ T operator()(T const& lhs, T const& rhs) const noexcept { return lhs + rhs; } +}; + +template +struct custom_plus_sync { + template + __device__ T operator()(cuda::atomic& lhs, T const& rhs) const noexcept + { + return lhs.fetch_add(rhs) + rhs; + } +}; + +template +struct equiv_count { + __device__ T operator()(T const& lhs, T const& /* rhs */) const noexcept { return lhs + 1; } +}; + +TEMPLATE_TEST_CASE_SIG( + "Test '+' reduction functors", + "", + ((typename Value, typename Func, bool UsesExternalSync), Value, Func, UsesExternalSync), + (int32_t, cuco::detail::reduce_add_impl, false), + (int64_t, cuco::detail::reduce_add_impl, false), + (uint32_t, cuco::detail::reduce_add_impl, false), + (uint64_t, cuco::detail::reduce_add_impl, false), + (float, cuco::detail::reduce_add_impl, false), + (double, cuco::detail::reduce_add_impl, false), + (int32_t, thrust::plus, true), + (int32_t, custom_plus_sync, false), + (int32_t, custom_plus, true), + (int32_t, custom_plus_constref, true)) +{ + test_case_impl(cuco::reduction_functor(cuco::identity_value(0)), + thrust::plus(), + UsesExternalSync); +} + +TEMPLATE_TEST_CASE_SIG( + "Test 'min' reduction functors", + "", + ((typename Value, typename Func, bool UsesExternalSync), Value, Func, UsesExternalSync), + (int32_t, cuco::detail::reduce_min_impl, false), + (int64_t, cuco::detail::reduce_min_impl, false), + (uint32_t, cuco::detail::reduce_min_impl, false), + (uint64_t, cuco::detail::reduce_min_impl, false), + (float, cuco::detail::reduce_min_impl, true), + (double, cuco::detail::reduce_min_impl, true), + (int32_t, thrust::minimum, true)) +{ + test_case_impl(cuco::reduction_functor( + cuco::identity_value(cuda::std::numeric_limits::max())), + thrust::minimum(), + UsesExternalSync); +} + +TEMPLATE_TEST_CASE_SIG( + "Test 'max' reduction functors", + "", + ((typename Value, typename Func, bool UsesExternalSync), Value, Func, UsesExternalSync), + (int32_t, cuco::detail::reduce_max_impl, false), + (int64_t, cuco::detail::reduce_max_impl, false), + (uint32_t, cuco::detail::reduce_max_impl, false), + (uint64_t, cuco::detail::reduce_max_impl, false), + (float, cuco::detail::reduce_max_impl, true), + (double, cuco::detail::reduce_max_impl, true), + (int32_t, thrust::maximum, true)) +{ + test_case_impl(cuco::reduction_functor( + cuco::identity_value(cuda::std::numeric_limits::min())), + thrust::maximum(), + UsesExternalSync); +} + +TEMPLATE_TEST_CASE_SIG( + "Test 'count' reduction functors", + "", + ((typename Value, typename Func, bool UsesExternalSync), Value, Func, UsesExternalSync), + (int32_t, cuco::detail::reduce_count_impl, false), + (int64_t, cuco::detail::reduce_count_impl, false), + (uint32_t, cuco::detail::reduce_count_impl, false), + (uint64_t, cuco::detail::reduce_count_impl, false)) +{ + test_case_impl(cuco::reduction_functor(cuco::identity_value(0)), + equiv_count(), + UsesExternalSync); +} \ No newline at end of file