include/cuco/detail/reduction_functor_impl.cuh (new file: 179 additions, 0 deletions)
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cuda/atomic>
#include <cuda/std/type_traits>

namespace cuco {
namespace detail {

/**
* @brief Base class of all reduction functors.
*
* @warning This class should not be used directly.
*
*/
class reduction_functor_base {
};

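/**
 * @brief Functor performing a `+` reduction via `fetch_add` on the given atomic slot.
 *
 * @tparam T The value type used for reduction
 * @tparam Enable SFINAE helper used to select specializations for particular value types
 */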
template <typename T, typename Enable = void>
struct reduce_add_impl {
template <cuda::thread_scope Scope>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
{
return lhs.fetch_add(rhs, cuda::memory_order_relaxed) + rhs;
}
};

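/**
 * @brief Functor performing a `min` reduction via `fetch_min` on the given atomic slot.
 *
 * @tparam T The value type used for reduction
 * @tparam Enable SFINAE helper used to select specializations for particular value types
 */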
template <typename T, typename Enable = void>
struct reduce_min_impl {
template <cuda::thread_scope Scope>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
{
return min(lhs.fetch_min(rhs, cuda::memory_order_relaxed), rhs);
}
};

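/**
 * @brief Functor performing a `max` reduction via `fetch_max` on the given atomic slot.
 *
 * @tparam T The value type used for reduction
 * @tparam Enable SFINAE helper used to select specializations for particular value types
 */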
template <typename T, typename Enable = void>
struct reduce_max_impl {
template <cuda::thread_scope Scope>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
{
return max(lhs.fetch_max(rhs, cuda::memory_order_relaxed), rhs);
}
};

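/**
 * @brief Functor counting the number of reduced values via `fetch_add`; the right-hand-side
 * value is ignored.
 *
 * @tparam T The value type used for reduction
 * @tparam Enable SFINAE helper used to select specializations for particular value types
 */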
template <typename T, typename Enable = void>
struct reduce_count_impl {
template <cuda::thread_scope Scope>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& /* rhs */) const noexcept
{
return lhs.fetch_add(1, cuda::memory_order_relaxed) + 1;
}
};

// WAR: remove the following specializations once libcu++ extends floating-point atomics support
// and fixes signed integer atomics:
// https://github.com/NVIDIA/libcudacxx/pull/286
template <typename T>
struct reduce_add_impl<T, typename cuda::std::enable_if_t<cuda::std::is_floating_point_v<T>>> {
template <cuda::thread_scope Scope,
cuda::std::enable_if_t<Scope == cuda::thread_scope_system, bool> = true>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
{
return atomicAdd_system(reinterpret_cast<T*>(&lhs), rhs) + rhs;
}

template <cuda::thread_scope Scope,
cuda::std::enable_if_t<Scope == cuda::thread_scope_device, bool> = true>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
{
return atomicAdd(reinterpret_cast<T*>(&lhs), rhs) + rhs;
}

template <
cuda::thread_scope Scope,
cuda::std::enable_if_t<Scope != cuda::thread_scope_system && Scope != cuda::thread_scope_device,
bool> = true>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
{
return atomicAdd_block(reinterpret_cast<T*>(&lhs), rhs) + rhs;
}
};

template <typename T>
struct reduce_min_impl<
T,
typename cuda::std::enable_if_t<cuda::std::is_integral_v<T> && cuda::std::is_signed_v<T>>> {
private:
using internal_type = typename cuda::std::conditional_t<sizeof(T) == 8, long long int, int>;

public:
template <cuda::thread_scope Scope,
cuda::std::enable_if_t<Scope == cuda::thread_scope_system, bool> = true>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
{
return min(atomicMin_system(reinterpret_cast<internal_type*>(&lhs), rhs),
static_cast<internal_type>(rhs));
}

template <cuda::thread_scope Scope,
cuda::std::enable_if_t<Scope == cuda::thread_scope_device, bool> = true>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
{
return min(atomicMin(reinterpret_cast<internal_type*>(&lhs), rhs),
static_cast<internal_type>(rhs));
}

template <
cuda::thread_scope Scope,
cuda::std::enable_if_t<Scope != cuda::thread_scope_system && Scope != cuda::thread_scope_device,
bool> = true>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
{
return min(atomicMin_block(reinterpret_cast<internal_type*>(&lhs), rhs),
static_cast<internal_type>(rhs));
}
};

template <typename T>
struct reduce_max_impl<
T,
typename cuda::std::enable_if_t<cuda::std::is_integral_v<T> && cuda::std::is_signed_v<T>>> {
private:
using internal_type = typename cuda::std::conditional_t<sizeof(T) == 8, long long int, int>;

public:
template <cuda::thread_scope Scope,
cuda::std::enable_if_t<Scope == cuda::thread_scope_system, bool> = true>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
{
return max(atomicMax_system(reinterpret_cast<internal_type*>(&lhs), rhs),
static_cast<internal_type>(rhs));
}

template <cuda::thread_scope Scope,
cuda::std::enable_if_t<Scope == cuda::thread_scope_device, bool> = true>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
{
return max(atomicMax(reinterpret_cast<internal_type*>(&lhs), rhs),
static_cast<internal_type>(rhs));
}

template <
cuda::thread_scope Scope,
cuda::std::enable_if_t<Scope != cuda::thread_scope_system && Scope != cuda::thread_scope_device,
bool> = true>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
{
return max(atomicMax_block(reinterpret_cast<internal_type*>(&lhs), rhs),
static_cast<internal_type>(rhs));
}
};

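// Floating-point `min`/`max` have no native atomic read-modify-write support, so the following
// specializations take plain values; `reduction_functor` detects this signature and synchronizes
// them externally via its compare-and-swap loop.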
template <typename T>
struct reduce_min_impl<T, typename cuda::std::enable_if_t<cuda::std::is_floating_point_v<T>>> {
__device__ T operator()(T lhs, T rhs) const noexcept { return min(lhs, rhs); }
};

template <typename T>
struct reduce_max_impl<T, typename cuda::std::enable_if_t<cuda::std::is_floating_point_v<T>>> {
__device__ T operator()(T lhs, T rhs) const noexcept { return max(lhs, rhs); }
};

} // namespace detail
} // namespace cuco
include/cuco/reduction_functors.cuh (new file: 212 additions, 0 deletions)
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cuco/detail/reduction_functor_impl.cuh>

#include <cuda/atomic>
#include <cuda/std/limits>
#include <cuda/std/type_traits>

namespace cuco {

/**
* @brief Wrapper for reduction identity value.
*
* @tparam T The underlying value type used for reduction
*/
template <typename T>
class identity_value {
public:
using type = T;
explicit constexpr identity_value(T const& identity) noexcept : identity_(identity) {}
constexpr T value() const noexcept { return identity_; }

private:
T identity_;
};

/**
* @brief Wrapper for a user-defined custom reduction operator.
*
* If the user-defined functor operates on plain values rather than on a `cuda::atomic`,
* synchronization is established externally via an atomic compare-and-swap loop.
*
* Example:
* \code{.cpp}
* template <typename T>
* struct custom_plus {
* __device__ T operator()(T const& lhs, T const& rhs) const noexcept {
* return lhs + rhs;
* }
* };
*
* template <typename T>
* struct custom_plus_sync {
* template <cuda::thread_scope Scope>
* __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept {
* return lhs.fetch_add(rhs) + rhs;
* }
* };
*
* int main() {
*   // define the identity value for the given reduction operation,
*   // i.e., op(identity, x) == x
* cuco::identity_value<int> identity{0};
*
* // synchronized via CAS loop
* auto f1 = cuco::reduction_functor<custom_plus<int>, int>(identity);
* // implicitly synchronized
* auto f2 = cuco::reduction_functor<custom_plus_sync<int>, int>(identity);
* }
* \endcode
*
* @tparam Func The user-defined reduction functor
* @tparam Value The value type used for reduction
*/
template <typename Func, typename Value>
class reduction_functor : detail::reduction_functor_base {
public:
using value_type = Value;

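/**
 * @brief Constructs a reduction functor from the reduction's identity value and an instance of
 * the user-defined functor.
 *
 * @param identity Identity value of the reduction, i.e., `op(identity, x) == x`
 * @param functor Instance of the user-defined reduction functor
 */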
reduction_functor(cuco::identity_value<Value> identity, Func functor = Func{}) noexcept
: identity_(identity), functor_(functor)
{
}

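/**
 * @brief Atomically combines the value held in `lhs` with `rhs` and returns the reduced result.
 *
 * @tparam Scope CUDA thread scope of the target atomic
 *
 * @param lhs Atomic slot holding the current aggregate
 * @param rhs Value to fold into the aggregate
 * @return The aggregate after the reduction has been applied
 */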
template <cuda::thread_scope Scope>
__device__ value_type operator()(cuda::atomic<value_type, Scope>& lhs,
value_type const& rhs) const noexcept
{
if constexpr (uses_external_sync()) {
value_type old = lhs.load(cuda::memory_order_relaxed);
value_type desired;

do {
desired = functor_(old, rhs);
} while (not lhs.compare_exchange_weak(
old, desired, cuda::memory_order_release, cuda::memory_order_relaxed));

return desired;
}
if constexpr (not uses_external_sync()) { return functor_(lhs, rhs); }
}

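/**
 * @brief Returns the identity value of the reduction, i.e., `op(identity, x) == x`.
 */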
__host__ __device__ value_type identity() const noexcept { return identity_.value(); }

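/**
 * @brief Indicates whether the functor is synchronized externally via a compare-and-swap loop
 * (`true`) or synchronizes internally, e.g., via atomic read-modify-write operations (`false`).
 */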
__host__ __device__ static constexpr bool uses_external_sync() noexcept
{
return not atomic_invocable_ or naive_invocable_;
}

private:
cuco::identity_value<value_type> identity_;
Func functor_;

static constexpr bool naive_invocable_ =
cuda::std::is_invocable_r_v<value_type, Func, value_type, value_type>;
static constexpr bool atomic_invocable_ =
cuda::std::is_invocable_r_v<value_type,
Func,
cuda::atomic<value_type, cuda::thread_scope_system>&,
value_type> or
cuda::std::is_invocable_r_v<value_type,
Func,
cuda::atomic<value_type, cuda::thread_scope_device>&,
value_type> or
cuda::std::is_invocable_r_v<value_type,
Func,
cuda::atomic<value_type, cuda::thread_scope_block>&,
value_type> or
cuda::std::is_invocable_r_v<value_type,
Func,
cuda::atomic<value_type, cuda::thread_scope_thread>&,
value_type>;
static constexpr bool atomic_const_invocable_ =
cuda::std::is_invocable_r_v<value_type,
Func,
cuda::atomic<value_type, cuda::thread_scope_system> const&,
value_type> or
cuda::std::is_invocable_r_v<value_type,
Func,
cuda::atomic<value_type, cuda::thread_scope_device> const&,
value_type> or
cuda::std::is_invocable_r_v<value_type,
Func,
cuda::atomic<value_type, cuda::thread_scope_block> const&,
value_type> or
cuda::std::is_invocable_r_v<value_type,
Func,
cuda::atomic<value_type, cuda::thread_scope_thread> const&,
value_type>;

static_assert((atomic_invocable_ and not atomic_const_invocable_) or naive_invocable_,
"Invalid operator signature. Valid signatures are "
"(T const&, T const&)->T and (cuda::atomic<T, Scope>&, T const&)->T.");
static_assert(!(__nv_is_extended_device_lambda_closure_type(Func) or
__nv_is_extended_host_device_lambda_closure_type(Func)),
"Extended __device__/__host__ __device__ lambdas are not supported."
" Use a named function object instead.");
};

/**
* @brief Synchronized `+` reduction functor.
*
* @tparam T The value type used for reduction
*/
template <typename T>
auto reduce_add()
{
return reduction_functor(identity_value<T>{0}, detail::reduce_add_impl<T>{});
}

/**
* @brief Synchronized `min` reduction functor.
*
* @tparam T The value type used for reduction
*/
template <typename T>
auto reduce_min()
{
return reduction_functor(identity_value{cuda::std::numeric_limits<T>::max()},
detail::reduce_min_impl<T>{});
}

/**
* @brief Synchronized `max` reduction functor.
*
* @tparam T The value type used for reduction
*/
template <typename T>
auto reduce_max()
{
return reduction_functor(identity_value{cuda::std::numeric_limits<T>::lowest()},
detail::reduce_max_impl<T>{});
}

/**
* @brief Synchronized `count` reduction functor.
*
* @tparam T The value type used for reduction
*/
template <typename T>
auto reduce_count()
{
return reduction_functor(identity_value<T>{0}, detail::reduce_count_impl<T>{});
}
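
// Example (an illustrative sketch only, not part of this header): the pre-defined functors are
// constructed on the host and passed to device code by value. The kernel `sum_kernel` and its
// launch configuration below are hypothetical.
//
//   template <typename Functor>
//   __global__ void sum_kernel(Functor functor,
//                              cuda::atomic<int, cuda::thread_scope_device>* sum,
//                              int const* values,
//                              std::size_t n)
//   {
//     auto const tid = blockDim.x * blockIdx.x + threadIdx.x;
//     if (tid < n) { functor(*sum, values[tid]); }  // relaxed-order atomic `+`
//   }
//
//   // host side:
//   // sum_kernel<<<grid, block>>>(cuco::reduce_add<int>(), sum, values, n);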

} // namespace cuco