diff --git a/include/cuco/detail/reduction_functor_impl.cuh b/include/cuco/detail/reduction_functor_impl.cuh
new file mode 100644
index 000000000..782a4a8bd
--- /dev/null
+++ b/include/cuco/detail/reduction_functor_impl.cuh
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuda/atomic>
+#include <cuda/std/type_traits>
+
+namespace cuco {
+namespace detail {
+
+/**
+ * @brief Base class of all reduction functors.
+ *
+ * @warning This class should not be used directly.
+ *
+ */
+class reduction_functor_base {
+};
+
+template <typename T, typename Enable = void>
+struct reduce_add_impl {
+  template <cuda::thread_scope Scope>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
+  {
+    return lhs.fetch_add(rhs, cuda::memory_order_relaxed) + rhs;
+  }
+};
+
+template <typename T, typename Enable = void>
+struct reduce_min_impl {
+  template <cuda::thread_scope Scope>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
+  {
+    return min(lhs.fetch_min(rhs, cuda::memory_order_relaxed), rhs);
+  }
+};
+
+template <typename T, typename Enable = void>
+struct reduce_max_impl {
+  template <cuda::thread_scope Scope>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
+  {
+    return max(lhs.fetch_max(rhs, cuda::memory_order_relaxed), rhs);
+  }
+};
+
+template <typename T, typename Enable = void>
+struct reduce_count_impl {
+  template <cuda::thread_scope Scope>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& /* rhs */) const noexcept
+  {
+    return lhs.fetch_add(1, cuda::memory_order_relaxed) + 1;
+  }
+};
+
+// remove the following WAR once libcu++ extends FP atomics support and fixes signed integer atomics
+// https://github.com/NVIDIA/libcudacxx/pull/286
+template <typename T>
+struct reduce_add_impl<T, typename cuda::std::enable_if_t<cuda::std::is_floating_point_v<T>>> {
+  template <cuda::thread_scope Scope,
+            cuda::std::enable_if_t<Scope == cuda::thread_scope_system, bool> = true>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
+  {
+    return atomicAdd_system(reinterpret_cast<T*>(&lhs), rhs) + rhs;
+  }
+
+  template <cuda::thread_scope Scope,
+            cuda::std::enable_if_t<Scope == cuda::thread_scope_device, bool> = true>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
+  {
+    return atomicAdd(reinterpret_cast<T*>(&lhs), rhs) + rhs;
+  }
+
+  template <
+    cuda::thread_scope Scope,
+    cuda::std::enable_if_t<Scope != cuda::thread_scope_system && Scope != cuda::thread_scope_device,
+                           bool> = true>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
+  {
+    return atomicAdd_block(reinterpret_cast<T*>(&lhs), rhs) + rhs;
+  }
+};
+
+template <typename T>
+struct reduce_min_impl<
+  T,
+  typename cuda::std::enable_if_t<cuda::std::is_integral_v<T> && cuda::std::is_signed_v<T>>> {
+ private:
+  using internal_type = typename cuda::std::conditional_t<sizeof(T) == 8, long long int, int>;
+
+ public:
+  template <cuda::thread_scope Scope,
+            cuda::std::enable_if_t<Scope == cuda::thread_scope_system, bool> = true>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
+  {
+    return min(atomicMin_system(reinterpret_cast<internal_type*>(&lhs), rhs),
+               static_cast<internal_type>(rhs));
+  }
+
+  template <cuda::thread_scope Scope,
+            cuda::std::enable_if_t<Scope == cuda::thread_scope_device, bool> = true>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
+  {
+    return min(atomicMin(reinterpret_cast<internal_type*>(&lhs), rhs),
+               static_cast<internal_type>(rhs));
+  }
+
+  template <
+    cuda::thread_scope Scope,
+    cuda::std::enable_if_t<Scope != cuda::thread_scope_system && Scope != cuda::thread_scope_device,
+                           bool> = true>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
+  {
+    return min(atomicMin_block(reinterpret_cast<internal_type*>(&lhs), rhs),
+               static_cast<internal_type>(rhs));
+  }
+};
+
+template <typename T>
+struct reduce_max_impl<
+  T,
+  typename cuda::std::enable_if_t<cuda::std::is_integral_v<T> && cuda::std::is_signed_v<T>>> {
+ private:
+  using internal_type = typename cuda::std::conditional_t<sizeof(T) == 8, long long int, int>;
+
+ public:
+  template <cuda::thread_scope Scope,
+            cuda::std::enable_if_t<Scope == cuda::thread_scope_system, bool> = true>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
+  {
+    return max(atomicMax_system(reinterpret_cast<internal_type*>(&lhs), rhs),
+               static_cast<internal_type>(rhs));
+  }
+
+  template <cuda::thread_scope Scope,
+            cuda::std::enable_if_t<Scope == cuda::thread_scope_device, bool> = true>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
+  {
+    return max(atomicMax(reinterpret_cast<internal_type*>(&lhs), rhs),
+               static_cast<internal_type>(rhs));
+  }
+
+  template <
+    cuda::thread_scope Scope,
+    cuda::std::enable_if_t<Scope != cuda::thread_scope_system && Scope != cuda::thread_scope_device,
+                           bool> = true>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
+  {
+    return max(atomicMax_block(reinterpret_cast<internal_type*>(&lhs), rhs),
+               static_cast<internal_type>(rhs));
+  }
+};
+
+template <typename T>
+struct reduce_min_impl<T, typename cuda::std::enable_if_t<cuda::std::is_floating_point_v<T>>> {
+  __device__ T operator()(T lhs, T rhs) const noexcept { return min(lhs, rhs); }
+};
+
+template <typename T>
+struct reduce_max_impl<T, typename cuda::std::enable_if_t<cuda::std::is_floating_point_v<T>>> {
+  __device__ T operator()(T lhs, T rhs) const noexcept { return max(lhs, rhs); }
+};
+
+}  // namespace detail
+}  // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/reduction_functors.cuh b/include/cuco/reduction_functors.cuh
new file mode 100644
index 000000000..892b6e83a
--- /dev/null
+++ b/include/cuco/reduction_functors.cuh
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuco/detail/reduction_functor_impl.cuh>
+
+#include <cuda/atomic>
+#include <cuda/std/limits>
+#include <cuda/std/type_traits>
+
+namespace cuco {
+
+/**
+ * @brief Wrapper for reduction identity value.
+ *
+ * @tparam T The underlying value type used for reduction
+ */
+template <typename T>
+class identity_value {
+ public:
+  using type = T;
+  explicit constexpr identity_value(T const& identity) noexcept : identity_(identity) {}
+  constexpr T value() const noexcept { return identity_; }
+
+ private:
+  T identity_;
+};
+
+/**
+ * @brief Wrapper for a user-defined custom reduction operator.
+ *
+ * External synchronization, if required,
+ * is established via an atomic compare-and-swap loop.
+ *
+ * Example:
+ * \code{.cpp}
+ * template <typename T>
+ * struct custom_plus {
+ *   __device__ T operator()(T const& lhs, T const& rhs) const noexcept {
+ *     return lhs + rhs;
+ *   }
+ * };
+ *
+ * template <typename T>
+ * struct custom_plus_sync {
+ *   template <cuda::thread_scope Scope>
+ *   __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept {
+ *     return lhs.fetch_add(rhs) + rhs;
+ *   }
+ * };
+ *
+ * int main() {
+ *   // define the identity value for the given reduction operation,
+     // i.e., op(identity, x) == x
+ *   cuco::identity_value<int> identity{0};
+ *
+ *   // synchronized via CAS loop
+ *   auto f1 = cuco::reduction_functor<custom_plus<int>, int>(identity);
+ *   // implicitly synchronized
+ *   auto f2 = cuco::reduction_functor<custom_plus_sync<int>, int>(identity);
+ * }
+ * \endcode
+ *
+ * @tparam Func The user-defined reduction functor
+ * @tparam Value The value type used for reduction
+ */
+template <typename Func, typename Value>
+class reduction_functor : detail::reduction_functor_base {
+ public:
+  using value_type = Value;
+
+  reduction_functor(cuco::identity_value<Value> identity, Func functor = Func{}) noexcept
+    : identity_(identity), functor_(functor)
+  {
+  }
+
+  template <cuda::thread_scope Scope>
+  __device__ value_type operator()(cuda::atomic<value_type, Scope>& lhs,
+                                   value_type const& rhs) const noexcept
+  {
+    if constexpr (uses_external_sync()) {
+      value_type old = lhs.load(cuda::memory_order_relaxed);
+      value_type desired;
+
+      do {
+        desired = functor_(old, rhs);
+      } while (not lhs.compare_exchange_weak(
+        old, desired, cuda::memory_order_release, cuda::memory_order_relaxed));
+
+      return desired;
+    }
+    if constexpr (not uses_external_sync()) { return functor_(lhs, rhs); }
+  }
+
+  __host__ __device__ value_type identity() const noexcept { return identity_.value(); }
+
+  __host__ __device__ static constexpr bool uses_external_sync() noexcept
+  {
+    return not atomic_invocable_ or naive_invocable_;
+  }
+
+ private:
+  cuco::identity_value<value_type> identity_;
+  Func functor_;
+
+  static constexpr bool naive_invocable_ =
+    cuda::std::is_invocable_r_v<value_type, Func, value_type, value_type>;
+  static constexpr bool atomic_invocable_ =
+    cuda::std::is_invocable_r_v<value_type,
+                                Func,
+                                cuda::atomic<value_type, cuda::thread_scope_system>&,
+                                value_type> or
+    cuda::std::is_invocable_r_v<value_type,
+                                Func,
+                                cuda::atomic<value_type, cuda::thread_scope_device>&,
+                                value_type> or
+    cuda::std::is_invocable_r_v<value_type,
+                                Func,
+                                cuda::atomic<value_type, cuda::thread_scope_block>&,
+                                value_type> or
+    cuda::std::is_invocable_r_v<value_type,
+                                Func,
+                                cuda::atomic<value_type, cuda::thread_scope_thread>&,
+                                value_type>;
+  static constexpr bool atomic_const_invocable_ =
+    cuda::std::is_invocable_r_v<value_type,
+                                Func,
+                                cuda::atomic<value_type, cuda::thread_scope_system> const&,
+                                value_type> or
+    cuda::std::is_invocable_r_v<value_type,
+                                Func,
+                                cuda::atomic<value_type, cuda::thread_scope_device> const&,
+                                value_type> or
+    cuda::std::is_invocable_r_v<value_type,
+                                Func,
+                                cuda::atomic<value_type, cuda::thread_scope_block> const&,
+                                value_type> or
+    cuda::std::is_invocable_r_v<value_type,
+                                Func,
+                                cuda::atomic<value_type, cuda::thread_scope_thread> const&,
+                                value_type>;
+
+  static_assert((atomic_invocable_ and not atomic_const_invocable_) or naive_invocable_,
+                "Invalid operator signature. Valid signatures are "
+                "(T const&, T const&)->T and (cuda::atomic<T, Scope>&, T const&)->T.");
+  static_assert(!(__nv_is_extended_device_lambda_closure_type(Func) or
+                  __nv_is_extended_host_device_lambda_closure_type(Func)),
+                "Extended __device__/__host__ __device__ lambdas are not supported."
+                " Use a named function object instead.");
+};
+
+/**
+ * @brief Synchronized `+` reduction functor.
+ *
+ * @tparam T The value type used for reduction
+ */
+template <typename T>
+auto reduce_add()
+{
+  return reduction_functor(identity_value<T>{0}, detail::reduce_add_impl<T>{});
+};
+
+/**
+ * @brief Synchronized `min` reduction functor.
+ *
+ * @tparam T The value type used for reduction
+ */
+template <typename T>
+auto reduce_min()
+{
+  return reduction_functor(identity_value{cuda::std::numeric_limits<T>::max()},
+                           detail::reduce_min_impl<T>{});
+};
+
+/**
+ * @brief Synchronized `max` reduction functor.
+ *
+ * @tparam T The value type used for reduction
+ */
+template <typename T>
+auto reduce_max()
+{
+  return reduction_functor(identity_value{cuda::std::numeric_limits<T>::lowest()},
+                           detail::reduce_max_impl<T>{});
+};
+
+/**
+ * @brief Synchronized `count` reduction functor.
+ *
+ * @tparam T The value type used for reduction
+ */
+template <typename T>
+auto reduce_count()
+{
+  return reduction_functor(identity_value<T>{0}, detail::reduce_count_impl<T>{});
+};
+
+}  // namespace cuco
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index dec5170c4..56776f1cf 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -23,7 +23,7 @@ include(CTest)
 CPMAddPackage(
   NAME Catch2
   GITHUB_REPOSITORY catchorg/Catch2
-  VERSION 2.11.1
+  VERSION 2.13.9
 )
 
 if(Catch2_ADDED)
@@ -86,3 +86,8 @@ ConfigureTest(STATIC_MULTIMAP_TEST
     static_multimap/multiplicity_test.cu
     static_multimap/non_match_test.cu
     static_multimap/pair_function_test.cu)
+
+###################################################################################################
+# - static_reduction_map tests --------------------------------------------------------------------
+ConfigureTest(STATIC_REDUCTION_MAP_TEST
+    static_reduction_map/reduction_functors_test.cu)
diff --git a/tests/static_reduction_map/reduction_functors_test.cu b/tests/static_reduction_map/reduction_functors_test.cu
new file mode 100644
index 000000000..d2e4d2e3e
--- /dev/null
+++ b/tests/static_reduction_map/reduction_functors_test.cu
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <utils.hpp>
+
+#include <cuco/reduction_functors.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/functional.h>
+#include <thrust/host_vector.h>
+#include <thrust/sequence.h>
+
+#include <catch2/catch.hpp>
+
+#include <cuda/std/limits>
+#include <cuda/std/type_traits>
+
+template <typename InputIt, typename OutputIt, typename Func>
+__global__ void reduce_kernel(InputIt first, InputIt last, OutputIt out, Func func)
+{
+  auto tid = blockDim.x * blockIdx.x + threadIdx.x;
+  auto it  = first + tid;
+
+  if constexpr (cuda::std::is_base_of_v<cuco::detail::reduction_functor_base, Func>) {
+    while (it < last) {
+      func(*reinterpret_cast<cuda::atomic<typename Func::value_type, cuda::thread_scope_device>*>(
+             thrust::raw_pointer_cast(out)),
+           *it);
+      it += gridDim.x * blockDim.x;
+    }
+  } else {
+    while (it < last) {
+      *out = func(*out, *it);
+      it += gridDim.x * blockDim.x;
+    }
+  }
+}
+
+template <typename InputIt, typename OutputIt, typename Func>
+void reduce_seq(InputIt first, InputIt last, OutputIt out, Func func)
+{
+  reduce_kernel<<<1, 1>>>(first, last, out, func);
+  cudaDeviceSynchronize();
+}
+
+template <typename InputIt, typename OutputIt, typename Func>
+void reduce_par(InputIt first, InputIt last, OutputIt out, Func func)
+{
+  reduce_kernel<<<1, 1024>>>(first, last, out, func);
+  cudaDeviceSynchronize();
+}
+
+template <typename Func, typename EquivFunc>
+void test_case_impl(Func func, EquivFunc equiv, bool uses_external_sync)
+{
+  using Value = typename Func::value_type;
+  CHECK(cuda::std::is_base_of_v<cuco::detail::reduction_functor_base, decltype(func)>);
+  CHECK(cuda::std::is_same_v<typename decltype(func)::value_type, Value>);
+  CHECK(func.uses_external_sync() == uses_external_sync);
+
+  constexpr std::size_t num_items{100};
+
+  thrust::device_vector<Value> values(num_items);
+  thrust::sequence(values.begin(), values.end(), 1);
+
+  thrust::device_vector<Value> results_d(3, func.identity());
+
+  reduce_seq(values.begin(), values.end(), results_d.data() + 0, func);
+  reduce_par(values.begin(), values.end(), results_d.data() + 1, func);
+  reduce_seq(values.begin(), values.end(), results_d.data() + 2, equiv);
+
+  thrust::host_vector<Value> results_h = results_d;
+  auto sequential_result               = results_h[0];
+  auto parallel_result                 = results_h[1];
+  auto correct_result                  = results_h[2];
+
+  CHECK(sequential_result == correct_result);
+  CHECK(parallel_result == correct_result);
+  CHECK(parallel_result == sequential_result);
+}
+
+template <typename T>
+struct custom_plus {
+  __device__ T operator()(T lhs, T rhs) const noexcept { return lhs + rhs; }
+};
+
+template <typename T>
+struct custom_plus_constref {
+  __device__ T operator()(T const& lhs, T const& rhs) const noexcept { return lhs + rhs; }
+};
+
+template <typename T>
+struct custom_plus_sync {
+  template <cuda::thread_scope Scope>
+  __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
+  {
+    return lhs.fetch_add(rhs) + rhs;
+  }
+};
+
+template <typename T>
+struct equiv_count {
+  __device__ T operator()(T const& lhs, T const& /* rhs */) const noexcept { return lhs + 1; }
+};
+
+TEMPLATE_TEST_CASE_SIG(
+  "Test '+' reduction functors",
+  "",
+  ((typename Value, typename Func, bool UsesExternalSync), Value, Func, UsesExternalSync),
+  (int32_t, cuco::detail::reduce_add_impl<int32_t>, false),
+  (int64_t, cuco::detail::reduce_add_impl<int64_t>, false),
+  (uint32_t, cuco::detail::reduce_add_impl<uint32_t>, false),
+  (uint64_t, cuco::detail::reduce_add_impl<uint64_t>, false),
+  (float, cuco::detail::reduce_add_impl<float>, false),
+  (double, cuco::detail::reduce_add_impl<double>, false),
+  (int32_t, thrust::plus<int32_t>, true),
+  (int32_t, custom_plus_sync<int32_t>, false),
+  (int32_t, custom_plus<int32_t>, true),
+  (int32_t, custom_plus_constref<int32_t>, true))
+{
+  test_case_impl(cuco::reduction_functor<Func, Value>(cuco::identity_value<Value>(0)),
+                 thrust::plus<Value>(),
+                 UsesExternalSync);
+}
+
+TEMPLATE_TEST_CASE_SIG(
+  "Test 'min' reduction functors",
+  "",
+  ((typename Value, typename Func, bool UsesExternalSync), Value, Func, UsesExternalSync),
+  (int32_t, cuco::detail::reduce_min_impl<int32_t>, false),
+  (int64_t, cuco::detail::reduce_min_impl<int64_t>, false),
+  (uint32_t, cuco::detail::reduce_min_impl<uint32_t>, false),
+  (uint64_t, cuco::detail::reduce_min_impl<uint64_t>, false),
+  (float, cuco::detail::reduce_min_impl<float>, true),
+  (double, cuco::detail::reduce_min_impl<double>, true),
+  (int32_t, thrust::minimum<int32_t>, true))
+{
+  test_case_impl(cuco::reduction_functor<Func, Value>(
+                   cuco::identity_value<Value>(cuda::std::numeric_limits<Value>::max())),
+                 thrust::minimum<Value>(),
+                 UsesExternalSync);
+}
+
+TEMPLATE_TEST_CASE_SIG(
+  "Test 'max' reduction functors",
+  "",
+  ((typename Value, typename Func, bool UsesExternalSync), Value, Func, UsesExternalSync),
+  (int32_t, cuco::detail::reduce_max_impl<int32_t>, false),
+  (int64_t, cuco::detail::reduce_max_impl<int64_t>, false),
+  (uint32_t, cuco::detail::reduce_max_impl<uint32_t>, false),
+  (uint64_t, cuco::detail::reduce_max_impl<uint64_t>, false),
+  (float, cuco::detail::reduce_max_impl<float>, true),
+  (double, cuco::detail::reduce_max_impl<double>, true),
+  (int32_t, thrust::maximum<int32_t>, true))
+{
+  test_case_impl(cuco::reduction_functor<Func, Value>(
+                   cuco::identity_value<Value>(cuda::std::numeric_limits<Value>::min())),
+                 thrust::maximum<Value>(),
+                 UsesExternalSync);
+}
+
+TEMPLATE_TEST_CASE_SIG(
+  "Test 'count' reduction functors",
+  "",
+  ((typename Value, typename Func, bool UsesExternalSync), Value, Func, UsesExternalSync),
+  (int32_t, cuco::detail::reduce_count_impl<int32_t>, false),
+  (int64_t, cuco::detail::reduce_count_impl<int64_t>, false),
+  (uint32_t, cuco::detail::reduce_count_impl<uint32_t>, false),
+  (uint64_t, cuco::detail::reduce_count_impl<uint64_t>, false))
+{
+  test_case_impl(cuco::reduction_functor<Func, Value>(cuco::identity_value<Value>(0)),
+                 equiv_count<Value>(),
+                 UsesExternalSync);
+}
\ No newline at end of file