From 2998d9fcd49b36ae8524a4d716a681142284e041 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Mon, 11 Jul 2022 12:38:27 +0000
Subject: [PATCH 01/12] Update Catch2 to v2.13.9.

---
 tests/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index dec5170c4..92dd5a34d 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -23,7 +23,7 @@ include(CTest)
 CPMAddPackage(
   NAME Catch2
   GITHUB_REPOSITORY catchorg/Catch2
-  VERSION 2.11.1
+  VERSION 2.13.9
 )
 
 if(Catch2_ADDED)

From 2a8a50f2d0a403164310e04f76479dd2f6d33ca6 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Mon, 11 Jul 2022 12:49:04 +0000
Subject: [PATCH 02/12] Added reduction functors that can be used by
 static_reduction_map.

---
 .../cuco/detail/reduction_functor_impl.cuh    | 128 +++++++++++
 include/cuco/reduction_functors.cuh           | 157 +++++++++++++
 tests/CMakeLists.txt                          |   5 +
 .../reduction_functors_test.cu                | 212 ++++++++++++++++++
 4 files changed, 502 insertions(+)
 create mode 100644 include/cuco/detail/reduction_functor_impl.cuh
 create mode 100644 include/cuco/reduction_functors.cuh
 create mode 100644 tests/static_reduction_map/reduction_functors_test.cu

diff --git a/include/cuco/detail/reduction_functor_impl.cuh b/include/cuco/detail/reduction_functor_impl.cuh
new file mode 100644
index 000000000..cc771dafb
--- /dev/null
+++ b/include/cuco/detail/reduction_functor_impl.cuh
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <atomic>
+#include <cuda/atomic>
+#include <type_traits>
+
+namespace cuco {
+namespace detail {
+
+/**
+ * @brief Base class of all reduction functors.
+ *
+ * @warning This class should not be used directly.
+ *
+ */
+class reduction_functor_base {};
+
+template <typename T, typename Enable = void>
+struct reduce_add_impl {
+  template <cuda::thread_scope Scope>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept {
+    return lhs.fetch_add(rhs) + rhs;
+  }
+};
+
+template <typename T, typename Enable = void>
+struct reduce_min_impl {
+  template <cuda::thread_scope Scope>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept {
+    return min(lhs.fetch_min(rhs), rhs);
+  }
+};
+
+template <typename T, typename Enable = void>
+struct reduce_max_impl {
+  template <cuda::thread_scope Scope>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept {
+    return max(lhs.fetch_max(rhs), rhs);
+  }
+};
+
+template <typename T, typename Enable = void>
+struct reduce_count_impl {
+  template <cuda::thread_scope Scope>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& /* rhs */) const noexcept {
+    return ++lhs;
+  }
+};
+
+// remove the following WAR once libcu++ extends FP atomics support and fixes signed integer atomics
+// https://github.com/NVIDIA/libcudacxx/pull/286
+template <typename T>
+struct reduce_add_impl<T, typename cuda::std::enable_if<cuda::std::is_floating_point<T>::value>::type> {
+  template <cuda::thread_scope Scope>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept {
+    if constexpr (Scope == cuda::thread_scope_system)
+      return atomicAdd_system(reinterpret_cast<T*>(&lhs), rhs) + rhs;
+    else if constexpr (Scope == cuda::thread_scope_device)
+      return atomicAdd(reinterpret_cast<T*>(&lhs), rhs) + rhs;
+    else
+      return atomicAdd_block(reinterpret_cast<T*>(&lhs), rhs) + rhs;
+  }
+};
+
+template <typename T>
+struct reduce_min_impl<T, typename cuda::std::enable_if<cuda::std::is_integral<T>::value && cuda::std::is_signed<T>::value>::type> {
+  template <cuda::thread_scope Scope>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept {
+    using InternalT = typename cuda::std::conditional<sizeof(T) == 8, long long int, int>::type;
+    InternalT * ptr = reinterpret_cast<InternalT*>(&lhs);
+    InternalT value = rhs;
+    if constexpr (Scope == cuda::thread_scope_system)
+      return min(atomicMin_system(ptr, value), value);
+    else if constexpr (Scope == cuda::thread_scope_device)
+      return min(atomicMin(ptr, value), value);
+    else
+      return min(atomicMin_block(ptr, value), value);
+  }
+};
+
+template <typename T>
+struct reduce_max_impl<T, typename cuda::std::enable_if<cuda::std::is_integral<T>::value && cuda::std::is_signed<T>::value>::type> {
+  template <cuda::thread_scope Scope>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept {
+    using InternalT = typename cuda::std::conditional<sizeof(T) == 8, long long int, int>::type;
+    InternalT * ptr = reinterpret_cast<InternalT*>(&lhs);
+    InternalT value = rhs;
+    if constexpr (Scope == cuda::thread_scope_system)
+      return max(atomicMax_system(ptr, value), value);
+    else if constexpr (Scope == cuda::thread_scope_device)
+      return max(atomicMax(ptr, value), value);
+    else
+      return max(atomicMax_block(ptr, value), value);
+  }
+};
+
+template <typename T>
+struct reduce_min_impl<T, typename cuda::std::enable_if<cuda::std::is_floating_point<T>::value>::type> {
+  __device__ T operator()(T lhs, T rhs) const noexcept {
+    return min(lhs, rhs);
+  }
+};
+
+template <typename T>
+struct reduce_max_impl<T, typename cuda::std::enable_if<cuda::std::is_floating_point<T>::value>::type> {
+  __device__ T operator()(T lhs, T rhs) const noexcept {
+    return max(lhs, rhs);
+  }
+};
+
+} // namespace detail
+} // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/reduction_functors.cuh b/include/cuco/reduction_functors.cuh
new file mode 100644
index 000000000..94888f6b2
--- /dev/null
+++ b/include/cuco/reduction_functors.cuh
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <atomic>
+#include <cuco/detail/reduction_functor_impl.cuh>
+
+#include <cuda/atomic>
+#include <limits>
+#include <type_traits>
+
+namespace cuco {
+
+/**
+ * @brief Wrapper for reduction identity value.
+ *
+ * @tparam T The underlying value type used for reduction
+ */
+template <typename T>
+class identity_value {
+ public:
+  using type = T;
+  constexpr identity_value(T const& identity) noexcept : identity_(identity) {}
+  constexpr T value() const noexcept { return identity_; }
+ private:
+  T identity_;
+};
+
+/**
+ * @brief Wrapper for a user-defined custom reduction operator.
+ *
+ * External synchronization, if required,
+ * is established via an atomic compare-and-swap loop.
+ *
+ * Example:
+ * \code{.cpp}
+ * template <typename T>
+ * struct custom_plus {
+ *   __device__ T operator()(T const& lhs, T const& rhs) const noexcept {
+ *     return lhs + rhs;
+ *   }
+ * };
+ *
+ * template <typename T>
+ * struct custom_plus_sync {
+ *   template <cuda::thread_scope Scope>
+ *   __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept {
+ *     return lhs.fetch_add(rhs) + rhs;
+ *   }
+ * };
+ *
+ * int main() {
+ *   cuco::identity_value<int> identity{0}; // define the identity value for the given reduction operation, i.e., op(identity, x) == x
+ *
+ *   auto f1 = cuco::reduction_functor<custom_plus<int>, int>(identity); // synchronized via CAS-loop
+ *   auto f2 = cuco::reduction_functor<custom_plus_sync<int>, int>(identity); // implicitly synchronized
+ *
+ *   auto custom_plus_lambda = [] __device__ (int lhs, int rhs) noexcept { return lhs + rhs; };
+ *   auto f3 = cuco::reduction_functor<decltype(custom_plus_lambda), int>(identity, custom_plus_lambda);
+ * }
+ * \endcode
+ *
+ * @tparam Func The user-defined reduction functor
+ * @tparam Value The value type used for reduction
+ */
+template <typename Func, typename Value>
+class reduction_functor : detail::reduction_functor_base {
+ public:
+  using value_type = Value;
+
+  reduction_functor(cuco::identity_value<Value> identity, Func functor = Func{}) noexcept : identity_(identity), functor_(functor) {}
+
+  template <cuda::thread_scope Scope>
+  __device__ value_type operator()(cuda::atomic<value_type, Scope>& lhs, value_type const& rhs) const noexcept
+  {
+    if constexpr (uses_external_sync()) {
+      value_type old = lhs.load(cuda::memory_order_relaxed);
+      value_type desired;
+
+      do {
+        desired = functor_(old, rhs);
+      } while (!lhs.compare_exchange_weak(old, desired, cuda::memory_order_release, cuda::memory_order_relaxed));
+
+      return desired;
+    } else {
+      return functor_(lhs, rhs);
+    }
+  }
+
+  __host__ __device__ value_type identity() const noexcept {
+    return identity_.value();
+  }
+
+  __host__ __device__ static constexpr bool uses_external_sync() noexcept {
+    return !atomic_invocable_ || naive_invocable_;
+  }
+
+ private:
+  cuco::identity_value<value_type> identity_;
+  Func functor_;
+  static constexpr bool naive_invocable_ = std::is_invocable_r<value_type, Func, value_type, value_type>::value;
+  static constexpr bool atomic_invocable_ =
+    std::is_invocable_r<value_type, Func, cuda::atomic<value_type, cuda::thread_scope_system>&, value_type>::value ||
+    std::is_invocable_r<value_type, Func, cuda::atomic<value_type, cuda::thread_scope_device>&, value_type>::value ||
+    std::is_invocable_r<value_type, Func, cuda::atomic<value_type, cuda::thread_scope_block>&,  value_type>::value ||
+    std::is_invocable_r<value_type, Func, cuda::atomic<value_type, cuda::thread_scope_thread>&, value_type>::value;
+
+  static_assert(atomic_invocable_ || naive_invocable_, "Invalid operator signature.");
+};
+
+/**
+ * @brief Synchronized `+` reduction functor.
+ *
+ * @tparam T The value type used for reduction
+ */
+template <typename T>
+auto reduce_add() { return reduction_functor(identity_value<T>{0}, detail::reduce_add_impl<T>{}); };
+
+/**
+ * @brief Synchronized `min` reduction functor.
+ *
+ * @tparam T The value type used for reduction
+ */
+template <typename T>
+auto reduce_min() { return reduction_functor(identity_value{cuda::std::numeric_limits<T>::max()}, detail::reduce_min_impl<T>{}); };
+
+/**
+ * @brief Synchronized `max` reduction functor.
+ *
+ * @tparam T The value type used for reduction
+ */
+template <typename T>
+auto reduce_max() { return reduction_functor(identity_value{cuda::std::numeric_limits<T>::lowest()}, detail::reduce_max_impl<T>{}); };
+
+/**
+ * @brief Synchronized `count` reduction functor.
+ *
+ * @tparam T The value type used for reduction
+ */
+template <typename T>
+auto reduce_count() { return reduction_functor(identity_value<T>{0}, detail::reduce_count_impl<T>{}); };
+
+}  // namespace cuco
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 92dd5a34d..56776f1cf 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -86,3 +86,8 @@ ConfigureTest(STATIC_MULTIMAP_TEST
     static_multimap/multiplicity_test.cu
     static_multimap/non_match_test.cu
     static_multimap/pair_function_test.cu)
+
+###################################################################################################
+# - static_reduction_map tests --------------------------------------------------------------------
+ConfigureTest(STATIC_REDUCTION_MAP_TEST
+    static_reduction_map/reduction_functors_test.cu)
diff --git a/tests/static_reduction_map/reduction_functors_test.cu b/tests/static_reduction_map/reduction_functors_test.cu
new file mode 100644
index 000000000..4e0fffc04
--- /dev/null
+++ b/tests/static_reduction_map/reduction_functors_test.cu
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <limits>
+#include <type_traits>
+#include <utils.hpp>
+
+#include <cuco/reduction_functors.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/functional.h>
+#include <thrust/sequence.h>
+
+#include <catch2/catch.hpp>
+
+template<typename InputIt, typename OutputIt, typename Func>
+__global__ void reduce_kernel(InputIt first, InputIt last, OutputIt out, Func func) {
+  auto tid = blockDim.x * blockIdx.x + threadIdx.x;
+  auto it = first + tid;
+
+  if constexpr (cuda::std::is_base_of_v<cuco::detail::reduction_functor_base, Func>) {
+    while (it < last) {
+      func(*reinterpret_cast<cuda::atomic<typename Func::value_type, cuda::thread_scope_device>*>(thrust::raw_pointer_cast(out)), *it);
+      it += gridDim.x * blockDim.x;
+    }
+  } else {
+    while (it < last) {
+      *out = func(*out, *it);
+      it += gridDim.x * blockDim.x;
+    }
+  }
+}
+
+template<typename InputIt, typename OutputIt, typename Func>
+void reduce_seq(InputIt first, InputIt last, OutputIt out, Func func) {
+  reduce_kernel<<<1, 1>>>(first, last, out, func);
+  cudaDeviceSynchronize();
+}
+
+template<typename InputIt, typename OutputIt, typename Func>
+void reduce_par(InputIt first, InputIt last, OutputIt out, Func func) {
+  reduce_kernel<<<1, 1024>>>(first, last, out, func);
+  cudaDeviceSynchronize();
+}
+
+template <typename Func, typename EquivFunc>
+void test_case_impl(Func func, EquivFunc equiv, bool uses_external_sync) {
+  using Value = typename Func::value_type;
+  CHECK(cuda::std::is_base_of_v<cuco::detail::reduction_functor_base, decltype(func)>);
+  CHECK(cuda::std::is_same_v<typename decltype(func)::value_type, Value>);
+  CHECK(func.uses_external_sync() == uses_external_sync);
+
+  constexpr std::size_t num_items{100};
+
+  thrust::device_vector<Value> values(num_items);
+  thrust::sequence(values.begin(), values.end(), 1);
+
+  thrust::device_vector<Value> results_d(3, func.identity());
+
+  reduce_seq(values.begin(), values.end(), results_d.data() + 0, func);
+  reduce_par(values.begin(), values.end(), results_d.data() + 1, func);
+  reduce_seq(values.begin(), values.end(), results_d.data() + 2, equiv);
+
+  thrust::host_vector<Value> results_h = results_d;
+  auto sequential_result = results_h[0];
+  auto parallel_result = results_h[1];
+  auto correct_result = results_h[2];
+
+  CHECK(sequential_result == correct_result);
+  CHECK(parallel_result == correct_result);
+  CHECK(parallel_result == sequential_result);
+}
+
+template <typename T>
+struct custom_plus {
+  __device__ T operator()(T lhs, T rhs) const noexcept {
+    return lhs + rhs;
+  }
+};
+
+template <typename T>
+struct custom_plus_constref {
+  __device__ T operator()(T const& lhs, T const& rhs) const noexcept {
+    return lhs + rhs;
+  }
+};
+
+template <typename T>
+struct custom_plus_sync {
+  template <cuda::thread_scope Scope>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept {
+    return lhs.fetch_add(rhs) + rhs;
+  }
+};
+
+template <typename T>
+struct equiv_count {
+  __device__ T operator()(T const& lhs, T const& /* rhs */) const noexcept {
+    return lhs + 1;
+  }
+};
+
+TEMPLATE_TEST_CASE_SIG(
+  "Test '+' reduction functors",
+  "",
+  ((typename Value, typename Func, bool UsesExternalSync), Value, Func, UsesExternalSync),
+  (int32_t, cuco::detail::reduce_add_impl<int32_t>, false),
+  (int64_t, cuco::detail::reduce_add_impl<int64_t>, false),
+  (uint32_t, cuco::detail::reduce_add_impl<uint32_t>, false),
+  (uint64_t, cuco::detail::reduce_add_impl<uint64_t>, false),
+  (float, cuco::detail::reduce_add_impl<float>, false),
+  (double, cuco::detail::reduce_add_impl<double>, false),
+  (int32_t, thrust::plus<int32_t>, true),
+  (int32_t, custom_plus_sync<int32_t>, false),
+  (int32_t, custom_plus<int32_t>, true),
+  (int32_t, custom_plus_constref<int32_t>, true))
+{
+  test_case_impl(
+    cuco::reduction_functor<Func, Value>(cuco::identity_value<Value>(0)),
+    thrust::plus<Value>(),
+    UsesExternalSync);
+}
+
+TEMPLATE_TEST_CASE_SIG(
+  "Test 'min' reduction functors",
+  "",
+  ((typename Value, typename Func, bool UsesExternalSync), Value, Func, UsesExternalSync),
+  (int32_t, cuco::detail::reduce_min_impl<int32_t>, false),
+  (int64_t, cuco::detail::reduce_min_impl<int64_t>, false),
+  (uint32_t, cuco::detail::reduce_min_impl<uint32_t>, false),
+  (uint64_t, cuco::detail::reduce_min_impl<uint64_t>, false),
+  (float, cuco::detail::reduce_min_impl<float>, true),
+  (double, cuco::detail::reduce_min_impl<double>, true),
+  (int32_t, thrust::minimum<int32_t>, true))
+{
+  test_case_impl(
+    cuco::reduction_functor<Func, Value>(cuco::identity_value<Value>(std::numeric_limits<Value>::max())),
+    thrust::minimum<Value>(),
+    UsesExternalSync);
+}
+
+TEMPLATE_TEST_CASE_SIG(
+  "Test 'max' reduction functors",
+  "",
+  ((typename Value, typename Func, bool UsesExternalSync), Value, Func, UsesExternalSync),
+  (int32_t, cuco::detail::reduce_max_impl<int32_t>, false),
+  (int64_t, cuco::detail::reduce_max_impl<int64_t>, false),
+  (uint32_t, cuco::detail::reduce_max_impl<uint32_t>, false),
+  (uint64_t, cuco::detail::reduce_max_impl<uint64_t>, false),
+  (float, cuco::detail::reduce_max_impl<float>, true),
+  (double, cuco::detail::reduce_max_impl<double>, true),
+  (int32_t, thrust::maximum<int32_t>, true))
+{
+  test_case_impl(
+    cuco::reduction_functor<Func, Value>(cuco::identity_value<Value>(std::numeric_limits<Value>::min())),
+    thrust::maximum<Value>(),
+    UsesExternalSync);
+}
+
+TEMPLATE_TEST_CASE_SIG(
+  "Test 'count' reduction functors",
+  "",
+  ((typename Value, typename Func, bool UsesExternalSync), Value, Func, UsesExternalSync),
+  (int32_t, cuco::detail::reduce_count_impl<int32_t>, false),
+  (int64_t, cuco::detail::reduce_count_impl<int64_t>, false),
+  (uint32_t, cuco::detail::reduce_count_impl<uint32_t>, false),
+  (uint64_t, cuco::detail::reduce_count_impl<uint64_t>, false))
+{
+  test_case_impl(
+    cuco::reduction_functor<Func, Value>(cuco::identity_value<Value>(0)),
+    equiv_count<Value>(),
+    UsesExternalSync);
+}
+
+TEST_CASE(
+  "Test device lambda reduction functor",
+  "")
+{
+  using Value = int;
+  auto identity = cuco::identity_value<Value>(0);
+
+  auto lambda_add = [] __device__ (Value const& lhs, Value const& rhs) noexcept {
+    return lhs + rhs;
+  };
+  test_case_impl(
+    cuco::reduction_functor<decltype(lambda_add), Value>(identity, lambda_add),
+    thrust::plus<Value>(),
+    true);
+
+  using AtomicValue = cuda::atomic<int, cuda::thread_scope_device>;
+  auto lambda_add_sync = [] __device__ (AtomicValue& lhs, Value const& rhs) noexcept {
+    return lhs.fetch_add(rhs) + rhs;
+  };
+  test_case_impl(
+    cuco::reduction_functor<decltype(lambda_add_sync), Value>(identity, lambda_add_sync),
+    thrust::plus<Value>(),
+    false);
+}
\ No newline at end of file

From e4adb05dbb214c4bca49bb9fe6f070f3c4f2a9d3 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 11 Jul 2022 13:43:48 +0000
Subject: [PATCH 03/12] [pre-commit.ci] auto code formatting

---
 .../cuco/detail/reduction_functor_impl.cuh    | 60 +++++++-----
 include/cuco/reduction_functors.cuh           | 75 +++++++++++----
 .../reduction_functors_test.cu                | 96 +++++++++----------
 3 files changed, 138 insertions(+), 93 deletions(-)

diff --git a/include/cuco/detail/reduction_functor_impl.cuh b/include/cuco/detail/reduction_functor_impl.cuh
index cc771dafb..3128bd01f 100644
--- a/include/cuco/detail/reduction_functor_impl.cuh
+++ b/include/cuco/detail/reduction_functor_impl.cuh
@@ -29,12 +29,14 @@ namespace detail {
  * @warning This class should not be used directly.
  *
  */
-class reduction_functor_base {};
+class reduction_functor_base {
+};
 
 template <typename T, typename Enable = void>
 struct reduce_add_impl {
   template <cuda::thread_scope Scope>
-  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept {
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
+  {
     return lhs.fetch_add(rhs) + rhs;
   }
 };
@@ -42,7 +44,8 @@ struct reduce_add_impl {
 template <typename T, typename Enable = void>
 struct reduce_min_impl {
   template <cuda::thread_scope Scope>
-  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept {
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
+  {
     return min(lhs.fetch_min(rhs), rhs);
   }
 };
@@ -50,7 +53,8 @@ struct reduce_min_impl {
 template <typename T, typename Enable = void>
 struct reduce_max_impl {
   template <cuda::thread_scope Scope>
-  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept {
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
+  {
     return max(lhs.fetch_max(rhs), rhs);
   }
 };
@@ -58,7 +62,8 @@ struct reduce_max_impl {
 template <typename T, typename Enable = void>
 struct reduce_count_impl {
   template <cuda::thread_scope Scope>
-  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& /* rhs */) const noexcept {
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& /* rhs */) const noexcept
+  {
     return ++lhs;
   }
 };
@@ -66,9 +71,12 @@ struct reduce_count_impl {
 // remove the following WAR once libcu++ extends FP atomics support and fixes signed integer atomics
 // https://github.com/NVIDIA/libcudacxx/pull/286
 template <typename T>
-struct reduce_add_impl<T, typename cuda::std::enable_if<cuda::std::is_floating_point<T>::value>::type> {
+struct reduce_add_impl<
+  T,
+  typename cuda::std::enable_if<cuda::std::is_floating_point<T>::value>::type> {
   template <cuda::thread_scope Scope>
-  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept {
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
+  {
     if constexpr (Scope == cuda::thread_scope_system)
       return atomicAdd_system(reinterpret_cast<T*>(&lhs), rhs) + rhs;
     else if constexpr (Scope == cuda::thread_scope_device)
@@ -79,11 +87,14 @@ struct reduce_add_impl<T, typename cuda::std::enable_if<cuda::std::is_floating_p
 };
 
 template <typename T>
-struct reduce_min_impl<T, typename cuda::std::enable_if<cuda::std::is_integral<T>::value && cuda::std::is_signed<T>::value>::type> {
+struct reduce_min_impl<T,
+                       typename cuda::std::enable_if<cuda::std::is_integral<T>::value &&
+                                                     cuda::std::is_signed<T>::value>::type> {
   template <cuda::thread_scope Scope>
-  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept {
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
+  {
     using InternalT = typename cuda::std::conditional<sizeof(T) == 8, long long int, int>::type;
-    InternalT * ptr = reinterpret_cast<InternalT*>(&lhs);
+    InternalT* ptr  = reinterpret_cast<InternalT*>(&lhs);
     InternalT value = rhs;
     if constexpr (Scope == cuda::thread_scope_system)
       return min(atomicMin_system(ptr, value), value);
@@ -95,11 +106,14 @@ struct reduce_min_impl<T, typename cuda::std::enable_if<cuda::std::is_integral<T
 };
 
 template <typename T>
-struct reduce_max_impl<T, typename cuda::std::enable_if<cuda::std::is_integral<T>::value && cuda::std::is_signed<T>::value>::type> {
+struct reduce_max_impl<T,
+                       typename cuda::std::enable_if<cuda::std::is_integral<T>::value &&
+                                                     cuda::std::is_signed<T>::value>::type> {
   template <cuda::thread_scope Scope>
-  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept {
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
+  {
     using InternalT = typename cuda::std::conditional<sizeof(T) == 8, long long int, int>::type;
-    InternalT * ptr = reinterpret_cast<InternalT*>(&lhs);
+    InternalT* ptr  = reinterpret_cast<InternalT*>(&lhs);
     InternalT value = rhs;
     if constexpr (Scope == cuda::thread_scope_system)
       return max(atomicMax_system(ptr, value), value);
@@ -111,18 +125,18 @@ struct reduce_max_impl<T, typename cuda::std::enable_if<cuda::std::is_integral<T
 };
 
 template <typename T>
-struct reduce_min_impl<T, typename cuda::std::enable_if<cuda::std::is_floating_point<T>::value>::type> {
-  __device__ T operator()(T lhs, T rhs) const noexcept {
-    return min(lhs, rhs);
-  }
+struct reduce_min_impl<
+  T,
+  typename cuda::std::enable_if<cuda::std::is_floating_point<T>::value>::type> {
+  __device__ T operator()(T lhs, T rhs) const noexcept { return min(lhs, rhs); }
 };
 
 template <typename T>
-struct reduce_max_impl<T, typename cuda::std::enable_if<cuda::std::is_floating_point<T>::value>::type> {
-  __device__ T operator()(T lhs, T rhs) const noexcept {
-    return max(lhs, rhs);
-  }
+struct reduce_max_impl<
+  T,
+  typename cuda::std::enable_if<cuda::std::is_floating_point<T>::value>::type> {
+  __device__ T operator()(T lhs, T rhs) const noexcept { return max(lhs, rhs); }
 };
 
-} // namespace detail
-} // namespace cuco
\ No newline at end of file
+}  // namespace detail
+}  // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/reduction_functors.cuh b/include/cuco/reduction_functors.cuh
index 94888f6b2..074adc1f3 100644
--- a/include/cuco/reduction_functors.cuh
+++ b/include/cuco/reduction_functors.cuh
@@ -36,6 +36,7 @@ class identity_value {
   using type = T;
   constexpr identity_value(T const& identity) noexcept : identity_(identity) {}
   constexpr T value() const noexcept { return identity_; }
+
  private:
   T identity_;
 };
@@ -64,13 +65,16 @@ class identity_value {
  * };
  *
  * int main() {
- *   cuco::identity_value<int> identity{0}; // define the identity value for the given reduction operation, i.e., op(identity, x) == x
+ *   cuco::identity_value<int> identity{0}; // define the identity value for the given reduction
+ * operation, i.e., op(identity, x) == x
  *
- *   auto f1 = cuco::reduction_functor<custom_plus<int>, int>(identity); // synchronized via CAS-loop
- *   auto f2 = cuco::reduction_functor<custom_plus_sync<int>, int>(identity); // implicitly synchronized
+ *   auto f1 = cuco::reduction_functor<custom_plus<int>, int>(identity); // synchronized via
+ * CAS-loop auto f2 = cuco::reduction_functor<custom_plus_sync<int>, int>(identity); // implicitly
+ * synchronized
  *
  *   auto custom_plus_lambda = [] __device__ (int lhs, int rhs) noexcept { return lhs + rhs; };
- *   auto f3 = cuco::reduction_functor<decltype(custom_plus_lambda), int>(identity, custom_plus_lambda);
+ *   auto f3 = cuco::reduction_functor<decltype(custom_plus_lambda), int>(identity,
+ * custom_plus_lambda);
  * }
  * \endcode
  *
@@ -82,10 +86,14 @@ class reduction_functor : detail::reduction_functor_base {
  public:
   using value_type = Value;
 
-  reduction_functor(cuco::identity_value<Value> identity, Func functor = Func{}) noexcept : identity_(identity), functor_(functor) {}
+  reduction_functor(cuco::identity_value<Value> identity, Func functor = Func{}) noexcept
+    : identity_(identity), functor_(functor)
+  {
+  }
 
   template <cuda::thread_scope Scope>
-  __device__ value_type operator()(cuda::atomic<value_type, Scope>& lhs, value_type const& rhs) const noexcept
+  __device__ value_type operator()(cuda::atomic<value_type, Scope>& lhs,
+                                   value_type const& rhs) const noexcept
   {
     if constexpr (uses_external_sync()) {
       value_type old = lhs.load(cuda::memory_order_relaxed);
@@ -93,7 +101,8 @@ class reduction_functor : detail::reduction_functor_base {
 
       do {
         desired = functor_(old, rhs);
-      } while (!lhs.compare_exchange_weak(old, desired, cuda::memory_order_release, cuda::memory_order_relaxed));
+      } while (!lhs.compare_exchange_weak(
+        old, desired, cuda::memory_order_release, cuda::memory_order_relaxed));
 
       return desired;
     } else {
@@ -101,23 +110,35 @@ class reduction_functor : detail::reduction_functor_base {
     }
   }
 
-  __host__ __device__ value_type identity() const noexcept {
-    return identity_.value();
-  }
+  __host__ __device__ value_type identity() const noexcept { return identity_.value(); }
 
-  __host__ __device__ static constexpr bool uses_external_sync() noexcept {
+  __host__ __device__ static constexpr bool uses_external_sync() noexcept
+  {
     return !atomic_invocable_ || naive_invocable_;
   }
 
  private:
   cuco::identity_value<value_type> identity_;
   Func functor_;
-  static constexpr bool naive_invocable_ = std::is_invocable_r<value_type, Func, value_type, value_type>::value;
+  static constexpr bool naive_invocable_ =
+    std::is_invocable_r<value_type, Func, value_type, value_type>::value;
   static constexpr bool atomic_invocable_ =
-    std::is_invocable_r<value_type, Func, cuda::atomic<value_type, cuda::thread_scope_system>&, value_type>::value ||
-    std::is_invocable_r<value_type, Func, cuda::atomic<value_type, cuda::thread_scope_device>&, value_type>::value ||
-    std::is_invocable_r<value_type, Func, cuda::atomic<value_type, cuda::thread_scope_block>&,  value_type>::value ||
-    std::is_invocable_r<value_type, Func, cuda::atomic<value_type, cuda::thread_scope_thread>&, value_type>::value;
+    std::is_invocable_r<value_type,
+                        Func,
+                        cuda::atomic<value_type, cuda::thread_scope_system>&,
+                        value_type>::value ||
+    std::is_invocable_r<value_type,
+                        Func,
+                        cuda::atomic<value_type, cuda::thread_scope_device>&,
+                        value_type>::value ||
+    std::is_invocable_r<value_type,
+                        Func,
+                        cuda::atomic<value_type, cuda::thread_scope_block>&,
+                        value_type>::value ||
+    std::is_invocable_r<value_type,
+                        Func,
+                        cuda::atomic<value_type, cuda::thread_scope_thread>&,
+                        value_type>::value;
 
   static_assert(atomic_invocable_ || naive_invocable_, "Invalid operator signature.");
 };
@@ -128,7 +149,10 @@ class reduction_functor : detail::reduction_functor_base {
  * @tparam T The value type used for reduction
  */
 template <typename T>
-auto reduce_add() { return reduction_functor(identity_value<T>{0}, detail::reduce_add_impl<T>{}); };
+auto reduce_add()
+{
+  return reduction_functor(identity_value<T>{0}, detail::reduce_add_impl<T>{});
+};
 
 /**
  * @brief Synchronized `min` reduction functor.
@@ -136,7 +160,11 @@ auto reduce_add() { return reduction_functor(identity_value<T>{0}, detail::reduc
  * @tparam T The value type used for reduction
  */
 template <typename T>
-auto reduce_min() { return reduction_functor(identity_value{cuda::std::numeric_limits<T>::max()}, detail::reduce_min_impl<T>{}); };
+auto reduce_min()
+{
+  return reduction_functor(identity_value{cuda::std::numeric_limits<T>::max()},
+                           detail::reduce_min_impl<T>{});
+};
 
 /**
  * @brief Synchronized `max` reduction functor.
@@ -144,7 +172,11 @@ auto reduce_min() { return reduction_functor(identity_value{cuda::std::numeric_l
  * @tparam T The value type used for reduction
  */
 template <typename T>
-auto reduce_max() { return reduction_functor(identity_value{cuda::std::numeric_limits<T>::lowest()}, detail::reduce_max_impl<T>{}); };
+auto reduce_max()
+{
+  return reduction_functor(identity_value{cuda::std::numeric_limits<T>::lowest()},
+                           detail::reduce_max_impl<T>{});
+};
 
 /**
  * @brief Synchronized `count` reduction functor.
@@ -152,6 +184,9 @@ auto reduce_max() { return reduction_functor(identity_value{cuda::std::numeric_l
  * @tparam T The value type used for reduction
  */
 template <typename T>
-auto reduce_count() { return reduction_functor(identity_value<T>{0}, detail::reduce_count_impl<T>{}); };
+auto reduce_count()
+{
+  return reduction_functor(identity_value<T>{0}, detail::reduce_count_impl<T>{});
+};
 
 }  // namespace cuco
diff --git a/tests/static_reduction_map/reduction_functors_test.cu b/tests/static_reduction_map/reduction_functors_test.cu
index 4e0fffc04..20d247e84 100644
--- a/tests/static_reduction_map/reduction_functors_test.cu
+++ b/tests/static_reduction_map/reduction_functors_test.cu
@@ -21,20 +21,23 @@
 #include <cuco/reduction_functors.cuh>
 
 #include <thrust/device_vector.h>
-#include <thrust/host_vector.h>
 #include <thrust/functional.h>
+#include <thrust/host_vector.h>
 #include <thrust/sequence.h>
 
 #include <catch2/catch.hpp>
 
-template<typename InputIt, typename OutputIt, typename Func>
-__global__ void reduce_kernel(InputIt first, InputIt last, OutputIt out, Func func) {
+template <typename InputIt, typename OutputIt, typename Func>
+__global__ void reduce_kernel(InputIt first, InputIt last, OutputIt out, Func func)
+{
   auto tid = blockDim.x * blockIdx.x + threadIdx.x;
-  auto it = first + tid;
+  auto it  = first + tid;
 
   if constexpr (cuda::std::is_base_of_v<cuco::detail::reduction_functor_base, Func>) {
     while (it < last) {
-      func(*reinterpret_cast<cuda::atomic<typename Func::value_type, cuda::thread_scope_device>*>(thrust::raw_pointer_cast(out)), *it);
+      func(*reinterpret_cast<cuda::atomic<typename Func::value_type, cuda::thread_scope_device>*>(
+             thrust::raw_pointer_cast(out)),
+           *it);
       it += gridDim.x * blockDim.x;
     }
   } else {
@@ -45,20 +48,23 @@ __global__ void reduce_kernel(InputIt first, InputIt last, OutputIt out, Func fu
   }
 }
 
-template<typename InputIt, typename OutputIt, typename Func>
-void reduce_seq(InputIt first, InputIt last, OutputIt out, Func func) {
+template <typename InputIt, typename OutputIt, typename Func>
+void reduce_seq(InputIt first, InputIt last, OutputIt out, Func func)
+{
   reduce_kernel<<<1, 1>>>(first, last, out, func);
   cudaDeviceSynchronize();
 }
 
-template<typename InputIt, typename OutputIt, typename Func>
-void reduce_par(InputIt first, InputIt last, OutputIt out, Func func) {
+template <typename InputIt, typename OutputIt, typename Func>
+void reduce_par(InputIt first, InputIt last, OutputIt out, Func func)
+{
   reduce_kernel<<<1, 1024>>>(first, last, out, func);
   cudaDeviceSynchronize();
 }
 
 template <typename Func, typename EquivFunc>
-void test_case_impl(Func func, EquivFunc equiv, bool uses_external_sync) {
+void test_case_impl(Func func, EquivFunc equiv, bool uses_external_sync)
+{
   using Value = typename Func::value_type;
   CHECK(cuda::std::is_base_of_v<cuco::detail::reduction_functor_base, decltype(func)>);
   CHECK(cuda::std::is_same_v<typename decltype(func)::value_type, Value>);
@@ -76,9 +82,9 @@ void test_case_impl(Func func, EquivFunc equiv, bool uses_external_sync) {
   reduce_seq(values.begin(), values.end(), results_d.data() + 2, equiv);
 
   thrust::host_vector<Value> results_h = results_d;
-  auto sequential_result = results_h[0];
-  auto parallel_result = results_h[1];
-  auto correct_result = results_h[2];
+  auto sequential_result               = results_h[0];
+  auto parallel_result                 = results_h[1];
+  auto correct_result                  = results_h[2];
 
   CHECK(sequential_result == correct_result);
   CHECK(parallel_result == correct_result);
@@ -87,31 +93,26 @@ void test_case_impl(Func func, EquivFunc equiv, bool uses_external_sync) {
 
 template <typename T>
 struct custom_plus {
-  __device__ T operator()(T lhs, T rhs) const noexcept {
-    return lhs + rhs;
-  }
+  __device__ T operator()(T lhs, T rhs) const noexcept { return lhs + rhs; }
 };
 
 template <typename T>
 struct custom_plus_constref {
-  __device__ T operator()(T const& lhs, T const& rhs) const noexcept {
-    return lhs + rhs;
-  }
+  __device__ T operator()(T const& lhs, T const& rhs) const noexcept { return lhs + rhs; }
 };
 
 template <typename T>
 struct custom_plus_sync {
   template <cuda::thread_scope Scope>
-  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept {
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
+  {
     return lhs.fetch_add(rhs) + rhs;
   }
 };
 
 template <typename T>
 struct equiv_count {
-  __device__ T operator()(T const& lhs, T const& /* rhs */) const noexcept {
-    return lhs + 1;
-  }
+  __device__ T operator()(T const& lhs, T const& /* rhs */) const noexcept { return lhs + 1; }
 };
 
 TEMPLATE_TEST_CASE_SIG(
@@ -129,10 +130,9 @@ TEMPLATE_TEST_CASE_SIG(
   (int32_t, custom_plus<int32_t>, true),
   (int32_t, custom_plus_constref<int32_t>, true))
 {
-  test_case_impl(
-    cuco::reduction_functor<Func, Value>(cuco::identity_value<Value>(0)),
-    thrust::plus<Value>(),
-    UsesExternalSync);
+  test_case_impl(cuco::reduction_functor<Func, Value>(cuco::identity_value<Value>(0)),
+                 thrust::plus<Value>(),
+                 UsesExternalSync);
 }
 
 TEMPLATE_TEST_CASE_SIG(
@@ -147,10 +147,10 @@ TEMPLATE_TEST_CASE_SIG(
   (double, cuco::detail::reduce_min_impl<double>, true),
   (int32_t, thrust::minimum<int32_t>, true))
 {
-  test_case_impl(
-    cuco::reduction_functor<Func, Value>(cuco::identity_value<Value>(std::numeric_limits<Value>::max())),
-    thrust::minimum<Value>(),
-    UsesExternalSync);
+  test_case_impl(cuco::reduction_functor<Func, Value>(
+                   cuco::identity_value<Value>(std::numeric_limits<Value>::max())),
+                 thrust::minimum<Value>(),
+                 UsesExternalSync);
 }
 
 TEMPLATE_TEST_CASE_SIG(
@@ -165,10 +165,10 @@ TEMPLATE_TEST_CASE_SIG(
   (double, cuco::detail::reduce_max_impl<double>, true),
   (int32_t, thrust::maximum<int32_t>, true))
 {
-  test_case_impl(
-    cuco::reduction_functor<Func, Value>(cuco::identity_value<Value>(std::numeric_limits<Value>::min())),
-    thrust::maximum<Value>(),
-    UsesExternalSync);
+  test_case_impl(cuco::reduction_functor<Func, Value>(
+                   cuco::identity_value<Value>(std::numeric_limits<Value>::min())),
+                 thrust::maximum<Value>(),
+                 UsesExternalSync);
 }
 
 TEMPLATE_TEST_CASE_SIG(
@@ -180,29 +180,25 @@ TEMPLATE_TEST_CASE_SIG(
   (uint32_t, cuco::detail::reduce_count_impl<uint32_t>, false),
   (uint64_t, cuco::detail::reduce_count_impl<uint64_t>, false))
 {
-  test_case_impl(
-    cuco::reduction_functor<Func, Value>(cuco::identity_value<Value>(0)),
-    equiv_count<Value>(),
-    UsesExternalSync);
+  test_case_impl(cuco::reduction_functor<Func, Value>(cuco::identity_value<Value>(0)),
+                 equiv_count<Value>(),
+                 UsesExternalSync);
 }
 
-TEST_CASE(
-  "Test device lambda reduction functor",
-  "")
+TEST_CASE("Test device lambda reduction functor", "")
 {
-  using Value = int;
+  using Value   = int;
   auto identity = cuco::identity_value<Value>(0);
 
-  auto lambda_add = [] __device__ (Value const& lhs, Value const& rhs) noexcept {
+  auto lambda_add = [] __device__(Value const& lhs, Value const& rhs) noexcept {
     return lhs + rhs;
   };
-  test_case_impl(
-    cuco::reduction_functor<decltype(lambda_add), Value>(identity, lambda_add),
-    thrust::plus<Value>(),
-    true);
+  test_case_impl(cuco::reduction_functor<decltype(lambda_add), Value>(identity, lambda_add),
+                 thrust::plus<Value>(),
+                 true);
 
-  using AtomicValue = cuda::atomic<int, cuda::thread_scope_device>;
-  auto lambda_add_sync = [] __device__ (AtomicValue& lhs, Value const& rhs) noexcept {
+  using AtomicValue    = cuda::atomic<int, cuda::thread_scope_device>;
+  auto lambda_add_sync = [] __device__(AtomicValue & lhs, Value const& rhs) noexcept {
     return lhs.fetch_add(rhs) + rhs;
   };
   test_case_impl(

From bb78c499baf998f94a479170e0941d8586414118 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Mon, 11 Jul 2022 16:09:09 +0000
Subject: [PATCH 04/12] Prevent extended __host__ __device__/__device__ lambdas
 from being used as reduction functors.

---
 include/cuco/reduction_functors.cuh           | 11 ++++----
 .../reduction_functors_test.cu                | 27 +++----------------
 2 files changed, 9 insertions(+), 29 deletions(-)

diff --git a/include/cuco/reduction_functors.cuh b/include/cuco/reduction_functors.cuh
index 074adc1f3..52df79b29 100644
--- a/include/cuco/reduction_functors.cuh
+++ b/include/cuco/reduction_functors.cuh
@@ -71,10 +71,6 @@ class identity_value {
  *   auto f1 = cuco::reduction_functor<custom_plus<int>, int>(identity); // synchronized via
  * CAS-loop auto f2 = cuco::reduction_functor<custom_plus_sync<int>, int>(identity); // implicitly
  * synchronized
- *
- *   auto custom_plus_lambda = [] __device__ (int lhs, int rhs) noexcept { return lhs + rhs; };
- *   auto f3 = cuco::reduction_functor<decltype(custom_plus_lambda), int>(identity,
- * custom_plus_lambda);
  * }
  * \endcode
  *
@@ -140,7 +136,12 @@ class reduction_functor : detail::reduction_functor_base {
                         cuda::atomic<value_type, cuda::thread_scope_thread>&,
                         value_type>::value;
 
-  static_assert(atomic_invocable_ || naive_invocable_, "Invalid operator signature.");
+  static_assert(atomic_invocable_ || naive_invocable_,
+               "Invalid operator signature. Valid signatures are "
+               "(T const&, T const&)->T and (cuda::atomic<T, Scope>&, T const&)->T.");
+  static_assert(!(__nv_is_extended_device_lambda_closure_type(Func) || __nv_is_extended_host_device_lambda_closure_type(Func)),
+                "Extended __device__/__host__ __device__ lambdas are not supported."
+                " Use a named function object instead.");
 };
 
 /**
diff --git a/tests/static_reduction_map/reduction_functors_test.cu b/tests/static_reduction_map/reduction_functors_test.cu
index 20d247e84..55976e372 100644
--- a/tests/static_reduction_map/reduction_functors_test.cu
+++ b/tests/static_reduction_map/reduction_functors_test.cu
@@ -180,29 +180,8 @@ TEMPLATE_TEST_CASE_SIG(
   (uint32_t, cuco::detail::reduce_count_impl<uint32_t>, false),
   (uint64_t, cuco::detail::reduce_count_impl<uint64_t>, false))
 {
-  test_case_impl(cuco::reduction_functor<Func, Value>(cuco::identity_value<Value>(0)),
-                 equiv_count<Value>(),
-                 UsesExternalSync);
-}
-
-TEST_CASE("Test device lambda reduction functor", "")
-{
-  using Value   = int;
-  auto identity = cuco::identity_value<Value>(0);
-
-  auto lambda_add = [] __device__(Value const& lhs, Value const& rhs) noexcept {
-    return lhs + rhs;
-  };
-  test_case_impl(cuco::reduction_functor<decltype(lambda_add), Value>(identity, lambda_add),
-                 thrust::plus<Value>(),
-                 true);
-
-  using AtomicValue    = cuda::atomic<int, cuda::thread_scope_device>;
-  auto lambda_add_sync = [] __device__(AtomicValue & lhs, Value const& rhs) noexcept {
-    return lhs.fetch_add(rhs) + rhs;
-  };
   test_case_impl(
-    cuco::reduction_functor<decltype(lambda_add_sync), Value>(identity, lambda_add_sync),
-    thrust::plus<Value>(),
-    false);
+    cuco::reduction_functor<Func, Value>(cuco::identity_value<Value>(0)),
+    equiv_count<Value>(),
+    UsesExternalSync);
 }
\ No newline at end of file

From 905edd843a02c003706661fdacbe354c61ae7a46 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Mon, 11 Jul 2022 16:11:11 +0000
Subject: [PATCH 05/12] Remove spurious include.

---
 include/cuco/detail/reduction_functor_impl.cuh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/cuco/detail/reduction_functor_impl.cuh b/include/cuco/detail/reduction_functor_impl.cuh
index 3128bd01f..304158bb1 100644
--- a/include/cuco/detail/reduction_functor_impl.cuh
+++ b/include/cuco/detail/reduction_functor_impl.cuh
@@ -16,7 +16,6 @@
 
 #pragma once
 
-#include <atomic>
 #include <cuda/atomic>
 #include <type_traits>
 

From 7d3aff327c2a11123903774dd928acee89064d49 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 11 Jul 2022 16:40:03 +0000
Subject: [PATCH 06/12] [pre-commit.ci] auto code formatting

---
 include/cuco/reduction_functors.cuh                   | 7 ++++---
 tests/static_reduction_map/reduction_functors_test.cu | 7 +++----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/cuco/reduction_functors.cuh b/include/cuco/reduction_functors.cuh
index 52df79b29..a15ec5bab 100644
--- a/include/cuco/reduction_functors.cuh
+++ b/include/cuco/reduction_functors.cuh
@@ -137,9 +137,10 @@ class reduction_functor : detail::reduction_functor_base {
                         value_type>::value;
 
   static_assert(atomic_invocable_ || naive_invocable_,
-               "Invalid operator signature. Valid signatures are "
-               "(T const&, T const&)->T and (cuda::atomic<T, Scope>&, T const&)->T.");
-  static_assert(!(__nv_is_extended_device_lambda_closure_type(Func) || __nv_is_extended_host_device_lambda_closure_type(Func)),
+                "Invalid operator signature. Valid signatures are "
+                "(T const&, T const&)->T and (cuda::atomic<T, Scope>&, T const&)->T.");
+  static_assert(!(__nv_is_extended_device_lambda_closure_type(Func) ||
+                  __nv_is_extended_host_device_lambda_closure_type(Func)),
                 "Extended __device__/__host__ __device__ lambdas are not supported."
                 " Use a named function object instead.");
 };
diff --git a/tests/static_reduction_map/reduction_functors_test.cu b/tests/static_reduction_map/reduction_functors_test.cu
index 55976e372..bd59b1164 100644
--- a/tests/static_reduction_map/reduction_functors_test.cu
+++ b/tests/static_reduction_map/reduction_functors_test.cu
@@ -180,8 +180,7 @@ TEMPLATE_TEST_CASE_SIG(
   (uint32_t, cuco::detail::reduce_count_impl<uint32_t>, false),
   (uint64_t, cuco::detail::reduce_count_impl<uint64_t>, false))
 {
-  test_case_impl(
-    cuco::reduction_functor<Func, Value>(cuco::identity_value<Value>(0)),
-    equiv_count<Value>(),
-    UsesExternalSync);
+  test_case_impl(cuco::reduction_functor<Func, Value>(cuco::identity_value<Value>(0)),
+                 equiv_count<Value>(),
+                 UsesExternalSync);
 }
\ No newline at end of file

From 9cbe8906af2c465700f4d299f99d4ddde8ed8bc0 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Tue, 12 Jul 2022 10:04:23 +0000
Subject: [PATCH 07/12] Workaround for compiler warning where nvcc is not able
 to see the return statement in an `if constexpr else` statement.

---
 .../cuco/detail/reduction_functor_impl.cuh    | 102 +++++++++++++-----
 include/cuco/reduction_functors.cuh           |  15 +--
 2 files changed, 81 insertions(+), 36 deletions(-)

diff --git a/include/cuco/detail/reduction_functor_impl.cuh b/include/cuco/detail/reduction_functor_impl.cuh
index 304158bb1..6107f92f4 100644
--- a/include/cuco/detail/reduction_functor_impl.cuh
+++ b/include/cuco/detail/reduction_functor_impl.cuh
@@ -73,15 +73,27 @@ template <typename T>
 struct reduce_add_impl<
   T,
   typename cuda::std::enable_if<cuda::std::is_floating_point<T>::value>::type> {
-  template <cuda::thread_scope Scope>
+  template <cuda::thread_scope Scope,
+            cuda::std::enable_if_t<Scope == cuda::thread_scope_system, bool> = true>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
+  {
+    return atomicAdd_system(reinterpret_cast<T*>(&lhs), rhs) + rhs;
+  }
+
+  template <cuda::thread_scope Scope,
+            cuda::std::enable_if_t<Scope == cuda::thread_scope_device, bool> = true>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
+  {
+    return atomicAdd(reinterpret_cast<T*>(&lhs), rhs) + rhs;
+  }
+
+  template <
+    cuda::thread_scope Scope,
+    cuda::std::enable_if_t<Scope != cuda::thread_scope_system && Scope != cuda::thread_scope_device,
+                           bool> = true>
   __device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
   {
-    if constexpr (Scope == cuda::thread_scope_system)
-      return atomicAdd_system(reinterpret_cast<T*>(&lhs), rhs) + rhs;
-    else if constexpr (Scope == cuda::thread_scope_device)
-      return atomicAdd(reinterpret_cast<T*>(&lhs), rhs) + rhs;
-    else
-      return atomicAdd_block(reinterpret_cast<T*>(&lhs), rhs) + rhs;
+    return atomicAdd_block(reinterpret_cast<T*>(&lhs), rhs) + rhs;
   }
 };
 
@@ -89,18 +101,34 @@ template <typename T>
 struct reduce_min_impl<T,
                        typename cuda::std::enable_if<cuda::std::is_integral<T>::value &&
                                                      cuda::std::is_signed<T>::value>::type> {
-  template <cuda::thread_scope Scope>
-  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
+ private:
+  using internal_type = typename cuda::std::conditional<sizeof(T) == 8, long long int, int>::type;
+
+ public:
+  template <cuda::thread_scope Scope,
+            cuda::std::enable_if_t<Scope == cuda::thread_scope_system, bool> = true>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
+  {
+    return min(atomicMin_system(reinterpret_cast<internal_type*>(&lhs), rhs),
+               static_cast<internal_type>(rhs));
+  }
+
+  template <cuda::thread_scope Scope,
+            cuda::std::enable_if_t<Scope == cuda::thread_scope_device, bool> = true>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
   {
-    using InternalT = typename cuda::std::conditional<sizeof(T) == 8, long long int, int>::type;
-    InternalT* ptr  = reinterpret_cast<InternalT*>(&lhs);
-    InternalT value = rhs;
-    if constexpr (Scope == cuda::thread_scope_system)
-      return min(atomicMin_system(ptr, value), value);
-    else if constexpr (Scope == cuda::thread_scope_device)
-      return min(atomicMin(ptr, value), value);
-    else
-      return min(atomicMin_block(ptr, value), value);
+    return min(atomicMin(reinterpret_cast<internal_type*>(&lhs), rhs),
+               static_cast<internal_type>(rhs));
+  }
+
+  template <
+    cuda::thread_scope Scope,
+    cuda::std::enable_if_t<Scope != cuda::thread_scope_system && Scope != cuda::thread_scope_device,
+                           bool> = true>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
+  {
+    return min(atomicMin_block(reinterpret_cast<internal_type*>(&lhs), rhs),
+               static_cast<internal_type>(rhs));
   }
 };
 
@@ -108,18 +136,34 @@ template <typename T>
 struct reduce_max_impl<T,
                        typename cuda::std::enable_if<cuda::std::is_integral<T>::value &&
                                                      cuda::std::is_signed<T>::value>::type> {
-  template <cuda::thread_scope Scope>
-  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
+ private:
+  using internal_type = typename cuda::std::conditional<sizeof(T) == 8, long long int, int>::type;
+
+ public:
+  template <cuda::thread_scope Scope,
+            cuda::std::enable_if_t<Scope == cuda::thread_scope_system, bool> = true>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
+  {
+    return max(atomicMax_system(reinterpret_cast<internal_type*>(&lhs), rhs),
+               static_cast<internal_type>(rhs));
+  }
+
+  template <cuda::thread_scope Scope,
+            cuda::std::enable_if_t<Scope == cuda::thread_scope_device, bool> = true>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
+  {
+    return max(atomicMax(reinterpret_cast<internal_type*>(&lhs), rhs),
+               static_cast<internal_type>(rhs));
+  }
+
+  template <
+    cuda::thread_scope Scope,
+    cuda::std::enable_if_t<Scope != cuda::thread_scope_system && Scope != cuda::thread_scope_device,
+                           bool> = true>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
   {
-    using InternalT = typename cuda::std::conditional<sizeof(T) == 8, long long int, int>::type;
-    InternalT* ptr  = reinterpret_cast<InternalT*>(&lhs);
-    InternalT value = rhs;
-    if constexpr (Scope == cuda::thread_scope_system)
-      return max(atomicMax_system(ptr, value), value);
-    else if constexpr (Scope == cuda::thread_scope_device)
-      return max(atomicMax(ptr, value), value);
-    else
-      return max(atomicMax_block(ptr, value), value);
+    return max(atomicMax_block(reinterpret_cast<internal_type*>(&lhs), rhs),
+               static_cast<internal_type>(rhs));
   }
 };
 
diff --git a/include/cuco/reduction_functors.cuh b/include/cuco/reduction_functors.cuh
index a15ec5bab..13478a312 100644
--- a/include/cuco/reduction_functors.cuh
+++ b/include/cuco/reduction_functors.cuh
@@ -65,12 +65,14 @@ class identity_value {
  * };
  *
  * int main() {
- *   cuco::identity_value<int> identity{0}; // define the identity value for the given reduction
- * operation, i.e., op(identity, x) == x
+ *   // define the identity value for the given reduction operation,
+     // i.e., op(identity, x) == x
+ *   cuco::identity_value<int> identity{0};
  *
- *   auto f1 = cuco::reduction_functor<custom_plus<int>, int>(identity); // synchronized via
- * CAS-loop auto f2 = cuco::reduction_functor<custom_plus_sync<int>, int>(identity); // implicitly
- * synchronized
+ *   // synchronized via CAS loop
+ *   auto f1 = cuco::reduction_functor<custom_plus<int>, int>(identity);
+ *   // implicitly synchronized
+ *   auto f2 = cuco::reduction_functor<custom_plus_sync<int>, int>(identity);
  * }
  * \endcode
  *
@@ -101,9 +103,8 @@ class reduction_functor : detail::reduction_functor_base {
         old, desired, cuda::memory_order_release, cuda::memory_order_relaxed));
 
       return desired;
-    } else {
-      return functor_(lhs, rhs);
     }
+    if constexpr (!uses_external_sync()) { return functor_(lhs, rhs); }
   }
 
   __host__ __device__ value_type identity() const noexcept { return identity_.value(); }

From cef8906f864332bcfc4078c5b60080e37d68cafb Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Tue, 12 Jul 2022 15:02:04 +0000
Subject: [PATCH 08/12] Fix includes und use type trait aliases.

---
 .../cuco/detail/reduction_functor_impl.cuh    | 30 ++++++--------
 include/cuco/reduction_functors.cuh           | 39 +++++++++----------
 .../reduction_functors_test.cu                |  9 +++--
 3 files changed, 36 insertions(+), 42 deletions(-)

diff --git a/include/cuco/detail/reduction_functor_impl.cuh b/include/cuco/detail/reduction_functor_impl.cuh
index 6107f92f4..9547600e1 100644
--- a/include/cuco/detail/reduction_functor_impl.cuh
+++ b/include/cuco/detail/reduction_functor_impl.cuh
@@ -17,7 +17,7 @@
 #pragma once
 
 #include <cuda/atomic>
-#include <type_traits>
+#include <cuda/std/type_traits>
 
 namespace cuco {
 namespace detail {
@@ -70,9 +70,7 @@ struct reduce_count_impl {
 // remove the following WAR once libcu++ extends FP atomics support and fixes signed integer atomics
 // https://github.com/NVIDIA/libcudacxx/pull/286
 template <typename T>
-struct reduce_add_impl<
-  T,
-  typename cuda::std::enable_if<cuda::std::is_floating_point<T>::value>::type> {
+struct reduce_add_impl<T, typename cuda::std::enable_if_t<cuda::std::is_floating_point_v<T>>> {
   template <cuda::thread_scope Scope,
             cuda::std::enable_if_t<Scope == cuda::thread_scope_system, bool> = true>
   __device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
@@ -98,11 +96,11 @@ struct reduce_add_impl<
 };
 
 template <typename T>
-struct reduce_min_impl<T,
-                       typename cuda::std::enable_if<cuda::std::is_integral<T>::value &&
-                                                     cuda::std::is_signed<T>::value>::type> {
+struct reduce_min_impl<
+  T,
+  typename cuda::std::enable_if_t<cuda::std::is_integral_v<T> && cuda::std::is_signed_v<T>>> {
  private:
-  using internal_type = typename cuda::std::conditional<sizeof(T) == 8, long long int, int>::type;
+  using internal_type = typename cuda::std::conditional_t<sizeof(T) == 8, long long int, int>;
 
  public:
   template <cuda::thread_scope Scope,
@@ -133,11 +131,11 @@ struct reduce_min_impl<T,
 };
 
 template <typename T>
-struct reduce_max_impl<T,
-                       typename cuda::std::enable_if<cuda::std::is_integral<T>::value &&
-                                                     cuda::std::is_signed<T>::value>::type> {
+struct reduce_max_impl<
+  T,
+  typename cuda::std::enable_if_t<cuda::std::is_integral_v<T> && cuda::std::is_signed_v<T>>> {
  private:
-  using internal_type = typename cuda::std::conditional<sizeof(T) == 8, long long int, int>::type;
+  using internal_type = typename cuda::std::conditional_t<sizeof(T) == 8, long long int, int>;
 
  public:
   template <cuda::thread_scope Scope,
@@ -168,16 +166,12 @@ struct reduce_max_impl<T,
 };
 
 template <typename T>
-struct reduce_min_impl<
-  T,
-  typename cuda::std::enable_if<cuda::std::is_floating_point<T>::value>::type> {
+struct reduce_min_impl<T, typename cuda::std::enable_if_t<cuda::std::is_floating_point_v<T>>> {
   __device__ T operator()(T lhs, T rhs) const noexcept { return min(lhs, rhs); }
 };
 
 template <typename T>
-struct reduce_max_impl<
-  T,
-  typename cuda::std::enable_if<cuda::std::is_floating_point<T>::value>::type> {
+struct reduce_max_impl<T, typename cuda::std::enable_if_t<cuda::std::is_floating_point_v<T>>> {
   __device__ T operator()(T lhs, T rhs) const noexcept { return max(lhs, rhs); }
 };
 
diff --git a/include/cuco/reduction_functors.cuh b/include/cuco/reduction_functors.cuh
index 13478a312..6cf29ae23 100644
--- a/include/cuco/reduction_functors.cuh
+++ b/include/cuco/reduction_functors.cuh
@@ -16,12 +16,11 @@
 
 #pragma once
 
-#include <atomic>
 #include <cuco/detail/reduction_functor_impl.cuh>
 
 #include <cuda/atomic>
-#include <limits>
-#include <type_traits>
+#include <cuda/std/limits>
+#include <cuda/std/type_traits>
 
 namespace cuco {
 
@@ -118,24 +117,24 @@ class reduction_functor : detail::reduction_functor_base {
   cuco::identity_value<value_type> identity_;
   Func functor_;
   static constexpr bool naive_invocable_ =
-    std::is_invocable_r<value_type, Func, value_type, value_type>::value;
+    cuda::std::is_invocable_r_v<value_type, Func, value_type, value_type>;
   static constexpr bool atomic_invocable_ =
-    std::is_invocable_r<value_type,
-                        Func,
-                        cuda::atomic<value_type, cuda::thread_scope_system>&,
-                        value_type>::value ||
-    std::is_invocable_r<value_type,
-                        Func,
-                        cuda::atomic<value_type, cuda::thread_scope_device>&,
-                        value_type>::value ||
-    std::is_invocable_r<value_type,
-                        Func,
-                        cuda::atomic<value_type, cuda::thread_scope_block>&,
-                        value_type>::value ||
-    std::is_invocable_r<value_type,
-                        Func,
-                        cuda::atomic<value_type, cuda::thread_scope_thread>&,
-                        value_type>::value;
+    cuda::std::is_invocable_r_v<value_type,
+                                Func,
+                                cuda::atomic<value_type, cuda::thread_scope_system>&,
+                                value_type> ||
+    cuda::std::is_invocable_r_v<value_type,
+                                Func,
+                                cuda::atomic<value_type, cuda::thread_scope_device>&,
+                                value_type> ||
+    cuda::std::is_invocable_r_v<value_type,
+                                Func,
+                                cuda::atomic<value_type, cuda::thread_scope_block>&,
+                                value_type> ||
+    cuda::std::is_invocable_r_v<value_type,
+                                Func,
+                                cuda::atomic<value_type, cuda::thread_scope_thread>&,
+                                value_type>;
 
   static_assert(atomic_invocable_ || naive_invocable_,
                 "Invalid operator signature. Valid signatures are "
diff --git a/tests/static_reduction_map/reduction_functors_test.cu b/tests/static_reduction_map/reduction_functors_test.cu
index bd59b1164..d2e4d2e3e 100644
--- a/tests/static_reduction_map/reduction_functors_test.cu
+++ b/tests/static_reduction_map/reduction_functors_test.cu
@@ -14,8 +14,6 @@
  * limitations under the License.
  */
 
-#include <limits>
-#include <type_traits>
 #include <utils.hpp>
 
 #include <cuco/reduction_functors.cuh>
@@ -27,6 +25,9 @@
 
 #include <catch2/catch.hpp>
 
+#include <cuda/std/limits>
+#include <cuda/std/type_traits>
+
 template <typename InputIt, typename OutputIt, typename Func>
 __global__ void reduce_kernel(InputIt first, InputIt last, OutputIt out, Func func)
 {
@@ -148,7 +149,7 @@ TEMPLATE_TEST_CASE_SIG(
   (int32_t, thrust::minimum<int32_t>, true))
 {
   test_case_impl(cuco::reduction_functor<Func, Value>(
-                   cuco::identity_value<Value>(std::numeric_limits<Value>::max())),
+                   cuco::identity_value<Value>(cuda::std::numeric_limits<Value>::max())),
                  thrust::minimum<Value>(),
                  UsesExternalSync);
 }
@@ -166,7 +167,7 @@ TEMPLATE_TEST_CASE_SIG(
   (int32_t, thrust::maximum<int32_t>, true))
 {
   test_case_impl(cuco::reduction_functor<Func, Value>(
-                   cuco::identity_value<Value>(std::numeric_limits<Value>::min())),
+                   cuco::identity_value<Value>(cuda::std::numeric_limits<Value>::min())),
                  thrust::maximum<Value>(),
                  UsesExternalSync);
 }

From 80c15441092dff16d2ab7e628725b037ed14ba1e Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Tue, 12 Jul 2022 15:21:37 +0000
Subject: [PATCH 09/12] Reduction ops should use relaxed memory order.

---
 include/cuco/detail/reduction_functor_impl.cuh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/cuco/detail/reduction_functor_impl.cuh b/include/cuco/detail/reduction_functor_impl.cuh
index 9547600e1..782a4a8bd 100644
--- a/include/cuco/detail/reduction_functor_impl.cuh
+++ b/include/cuco/detail/reduction_functor_impl.cuh
@@ -36,7 +36,7 @@ struct reduce_add_impl {
   template <cuda::thread_scope Scope>
   __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
   {
-    return lhs.fetch_add(rhs) + rhs;
+    return lhs.fetch_add(rhs, cuda::memory_order_relaxed) + rhs;
   }
 };
 
@@ -45,7 +45,7 @@ struct reduce_min_impl {
   template <cuda::thread_scope Scope>
   __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
   {
-    return min(lhs.fetch_min(rhs), rhs);
+    return min(lhs.fetch_min(rhs, cuda::memory_order_relaxed), rhs);
   }
 };
 
@@ -54,7 +54,7 @@ struct reduce_max_impl {
   template <cuda::thread_scope Scope>
   __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
   {
-    return max(lhs.fetch_max(rhs), rhs);
+    return max(lhs.fetch_max(rhs, cuda::memory_order_relaxed), rhs);
   }
 };
 
@@ -63,7 +63,7 @@ struct reduce_count_impl {
   template <cuda::thread_scope Scope>
   __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& /* rhs */) const noexcept
   {
-    return ++lhs;
+    return lhs.fetch_add(1, cuda::memory_order_relaxed) + 1;
   }
 };
 

From 6fc5ff28515f6cb56356110290c13a990a799230 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Tue, 12 Jul 2022 15:22:17 +0000
Subject: [PATCH 10/12] Make identity_value ctor explicit.

---
 include/cuco/reduction_functors.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/cuco/reduction_functors.cuh b/include/cuco/reduction_functors.cuh
index 6cf29ae23..0054e8ad5 100644
--- a/include/cuco/reduction_functors.cuh
+++ b/include/cuco/reduction_functors.cuh
@@ -33,7 +33,7 @@ template <typename T>
 class identity_value {
  public:
   using type = T;
-  constexpr identity_value(T const& identity) noexcept : identity_(identity) {}
+  explicit constexpr identity_value(T const& identity) noexcept : identity_(identity) {}
   constexpr T value() const noexcept { return identity_; }
 
  private:

From 99945f49a24b267f7a2c45901e045135aeeef692 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Mon, 1 Aug 2022 10:01:18 +0000
Subject: [PATCH 11/12] Internally-synced reduction functors cannot have
 immutable target object.

---
 include/cuco/reduction_functors.cuh | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/include/cuco/reduction_functors.cuh b/include/cuco/reduction_functors.cuh
index 0054e8ad5..5486b1b3d 100644
--- a/include/cuco/reduction_functors.cuh
+++ b/include/cuco/reduction_functors.cuh
@@ -135,8 +135,25 @@ class reduction_functor : detail::reduction_functor_base {
                                 Func,
                                 cuda::atomic<value_type, cuda::thread_scope_thread>&,
                                 value_type>;
+  static constexpr bool atomic_const_invocable_ =
+    cuda::std::is_invocable_r_v<value_type,
+                                Func,
+                                cuda::atomic<value_type, cuda::thread_scope_system> const&,
+                                value_type> ||
+    cuda::std::is_invocable_r_v<value_type,
+                                Func,
+                                cuda::atomic<value_type, cuda::thread_scope_device> const&,
+                                value_type> ||
+    cuda::std::is_invocable_r_v<value_type,
+                                Func,
+                                cuda::atomic<value_type, cuda::thread_scope_block> const&,
+                                value_type> ||
+    cuda::std::is_invocable_r_v<value_type,
+                                Func,
+                                cuda::atomic<value_type, cuda::thread_scope_thread> const&,
+                                value_type>;
 
-  static_assert(atomic_invocable_ || naive_invocable_,
+  static_assert((atomic_invocable_ && !atomic_const_invocable_) || naive_invocable_,
                 "Invalid operator signature. Valid signatures are "
                 "(T const&, T const&)->T and (cuda::atomic<T, Scope>&, T const&)->T.");
   static_assert(!(__nv_is_extended_device_lambda_closure_type(Func) ||

From 4435d05350a87f4bfd581e053c843f5ca1f63a2a Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Mon, 1 Aug 2022 10:05:33 +0000
Subject: [PATCH 12/12] Use human-readable boolean operators.

---
 include/cuco/reduction_functors.cuh | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/include/cuco/reduction_functors.cuh b/include/cuco/reduction_functors.cuh
index 5486b1b3d..892b6e83a 100644
--- a/include/cuco/reduction_functors.cuh
+++ b/include/cuco/reduction_functors.cuh
@@ -98,39 +98,40 @@ class reduction_functor : detail::reduction_functor_base {
 
       do {
         desired = functor_(old, rhs);
-      } while (!lhs.compare_exchange_weak(
+      } while (not lhs.compare_exchange_weak(
         old, desired, cuda::memory_order_release, cuda::memory_order_relaxed));
 
       return desired;
     }
-    if constexpr (!uses_external_sync()) { return functor_(lhs, rhs); }
+    if constexpr (not uses_external_sync()) { return functor_(lhs, rhs); }
   }
 
   __host__ __device__ value_type identity() const noexcept { return identity_.value(); }
 
   __host__ __device__ static constexpr bool uses_external_sync() noexcept
   {
-    return !atomic_invocable_ || naive_invocable_;
+    return not atomic_invocable_ or naive_invocable_;
   }
 
  private:
   cuco::identity_value<value_type> identity_;
   Func functor_;
+
   static constexpr bool naive_invocable_ =
     cuda::std::is_invocable_r_v<value_type, Func, value_type, value_type>;
   static constexpr bool atomic_invocable_ =
     cuda::std::is_invocable_r_v<value_type,
                                 Func,
                                 cuda::atomic<value_type, cuda::thread_scope_system>&,
-                                value_type> ||
+                                value_type> or
     cuda::std::is_invocable_r_v<value_type,
                                 Func,
                                 cuda::atomic<value_type, cuda::thread_scope_device>&,
-                                value_type> ||
+                                value_type> or
     cuda::std::is_invocable_r_v<value_type,
                                 Func,
                                 cuda::atomic<value_type, cuda::thread_scope_block>&,
-                                value_type> ||
+                                value_type> or
     cuda::std::is_invocable_r_v<value_type,
                                 Func,
                                 cuda::atomic<value_type, cuda::thread_scope_thread>&,
@@ -139,24 +140,24 @@ class reduction_functor : detail::reduction_functor_base {
     cuda::std::is_invocable_r_v<value_type,
                                 Func,
                                 cuda::atomic<value_type, cuda::thread_scope_system> const&,
-                                value_type> ||
+                                value_type> or
     cuda::std::is_invocable_r_v<value_type,
                                 Func,
                                 cuda::atomic<value_type, cuda::thread_scope_device> const&,
-                                value_type> ||
+                                value_type> or
     cuda::std::is_invocable_r_v<value_type,
                                 Func,
                                 cuda::atomic<value_type, cuda::thread_scope_block> const&,
-                                value_type> ||
+                                value_type> or
     cuda::std::is_invocable_r_v<value_type,
                                 Func,
                                 cuda::atomic<value_type, cuda::thread_scope_thread> const&,
                                 value_type>;
 
-  static_assert((atomic_invocable_ && !atomic_const_invocable_) || naive_invocable_,
+  static_assert((atomic_invocable_ and not atomic_const_invocable_) or naive_invocable_,
                 "Invalid operator signature. Valid signatures are "
                 "(T const&, T const&)->T and (cuda::atomic<T, Scope>&, T const&)->T.");
-  static_assert(!(__nv_is_extended_device_lambda_closure_type(Func) ||
+  static_assert(!(__nv_is_extended_device_lambda_closure_type(Func) or
                   __nv_is_extended_host_device_lambda_closure_type(Func)),
                 "Extended __device__/__host__ __device__ lambdas are not supported."
                 " Use a named function object instead.");