diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 0e7411922..ee0a5244f 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -28,6 +28,8 @@ if("${GPU_ARCHS}" STREQUAL "") evaluate_gpu_archs(GPU_ARCHS) endif() +message("GPU_ARCHS = ${GPU_ARCHS}") + ################################################################################################### # - compiler function ----------------------------------------------------------------------------- @@ -35,7 +37,7 @@ function(ConfigureBench BENCH_NAME BENCH_SRC) add_executable(${BENCH_NAME} "${BENCH_SRC}") set_target_properties(${BENCH_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON - CUDA_ARCHITECTURES ${GPU_ARCHS} + CUDA_ARCHITECTURES "${GPU_ARCHS}" RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/gbenchmarks") target_include_directories(${BENCH_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") @@ -58,6 +60,9 @@ ConfigureBench(DYNAMIC_MAP_BENCH "${DYNAMIC_MAP_BENCH_SRC}") set(STATIC_MAP_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_map_bench.cu") ConfigureBench(STATIC_MAP_BENCH "${STATIC_MAP_BENCH_SRC}") +################################################################################################### +ConfigureBench(STATIC_REDUCTION_MAP_BENCH "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_reduction_map_bench.cu") + ################################################################################################### set(RBK_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reduce_by_key/reduce_by_key.cu") ConfigureBench(RBK_BENCH "${RBK_BENCH_SRC}") diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu index 165465518..563769df6 100644 --- a/benchmarks/hash_table/static_map_bench.cu +++ b/benchmarks/hash_table/static_map_bench.cu @@ -15,40 +15,38 @@ */ #include -#include "cuco/static_map.cuh" -#include #include -#include +#include +#include #include +#include #include +#include "cuco/static_map.cuh" -enum class dist_type { - UNIQUE, - 
UNIFORM, - GAUSSIAN -}; +enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; -template -static void generate_keys(OutputIt output_begin, OutputIt output_end) { +template +static void generate_keys(OutputIt output_begin, OutputIt output_end) +{ auto num_keys = std::distance(output_begin, output_end); - + std::random_device rd; std::mt19937 gen{rd()}; - switch(Dist) { + switch (Dist) { case dist_type::UNIQUE: - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = i; } break; case dist_type::UNIFORM: - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = std::abs(static_cast(gen())); } break; case dist_type::GAUSSIAN: std::normal_distribution<> dg{1e9, 1e7}; - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = std::abs(static_cast(dg(gen))); } break; @@ -59,88 +57,84 @@ static void generate_keys(OutputIt output_begin, OutputIt output_end) { * @brief Generates input sizes and hash table occupancies * */ -static void generate_size_and_occupancy(benchmark::internal::Benchmark* b) { - for (auto size = 100'000'000; size <= 100'000'000; size *= 10) { - for (auto occupancy = 10; occupancy <= 90; occupancy += 10) { +static void generate_size_and_occupancy(benchmark::internal::Benchmark* b) +{ + for (auto size = 4096; size <= 1 << 28; size *= 2) { + for (auto occupancy = 60; occupancy <= 60; occupancy += 10) { b->Args({size, occupancy}); } } } - - template -static void BM_static_map_insert(::benchmark::State& state) { +static void BM_static_map_insert(::benchmark::State& state) +{ using map_type = cuco::static_map; - + std::size_t num_keys = state.range(0); - float occupancy = state.range(1) / float{100}; - std::size_t size = num_keys / occupancy; + float occupancy = state.range(1) / float{100}; + std::size_t size = num_keys / occupancy; + + std::vector h_keys(num_keys); + std::vector> h_pairs(num_keys); - std::vector h_keys( num_keys ); - 
std::vector> h_pairs( num_keys ); - generate_keys(h_keys.begin(), h_keys.end()); - - for(auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; + + for (auto i = 0; i < num_keys; ++i) { + Key key = h_keys[i]; + Value val = h_keys[i]; + h_pairs[i].first = key; h_pairs[i].second = val; } - thrust::device_vector> d_pairs( h_pairs ); + thrust::device_vector> d_pairs(h_pairs); - for(auto _ : state) { - state.ResumeTiming(); - state.PauseTiming(); + for (auto _ : state) { map_type map{size, -1, -1}; - state.ResumeTiming(); - - map.insert(d_pairs.begin(), d_pairs.end()); - state.PauseTiming(); + { + cuda_event_timer raii{state}; + map.insert(d_pairs.begin(), d_pairs.end()); + } } - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * - int64_t(state.iterations()) * + state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * int64_t(state.range(0))); } - - template -static void BM_static_map_search_all(::benchmark::State& state) { +static void BM_static_map_search_all(::benchmark::State& state) +{ using map_type = cuco::static_map; - + std::size_t num_keys = state.range(0); - float occupancy = state.range(1) / float{100}; - std::size_t size = num_keys / occupancy; + float occupancy = state.range(1) / float{100}; + std::size_t size = num_keys / occupancy; map_type map{size, -1, -1}; auto view = map.get_device_mutable_view(); - std::vector h_keys( num_keys ); - std::vector h_values( num_keys ); - std::vector> h_pairs ( num_keys ); - std::vector h_results (num_keys); + std::vector h_keys(num_keys); + std::vector h_values(num_keys); + std::vector> h_pairs(num_keys); + std::vector h_results(num_keys); generate_keys(h_keys.begin(), h_keys.end()); - - for(auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; + + for (auto i = 0; i < num_keys; ++i) { + Key key = h_keys[i]; + Value val = h_keys[i]; + h_pairs[i].first = key; h_pairs[i].second = val; 
} - thrust::device_vector d_keys( h_keys ); - thrust::device_vector d_results( num_keys); - thrust::device_vector> d_pairs( h_pairs ); + thrust::device_vector d_keys(h_keys); + thrust::device_vector d_results(num_keys); + thrust::device_vector> d_pairs(h_pairs); map.insert(d_pairs.begin(), d_pairs.end()); - - for(auto _ : state) { + + for (auto _ : state) { map.find(d_keys.begin(), d_keys.end(), d_results.begin()); } @@ -148,52 +142,62 @@ static void BM_static_map_search_all(::benchmark::State& state) { int64_t(state.range(0))); } - - BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIFORM) 
->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) + ->UseManualTime() ->Apply(generate_size_and_occupancy); \ No newline at end of file diff --git a/benchmarks/hash_table/static_reduction_map_bench.cu b/benchmarks/hash_table/static_reduction_map_bench.cu new file mode 100644 index 000000000..92a2ab788 --- /dev/null +++ b/benchmarks/hash_table/static_reduction_map_bench.cu @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include "cuco/static_reduction_map.cuh" + +enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; + +template +static void generate_keys(OutputIt output_begin, OutputIt output_end) +{ + auto num_keys = std::distance(output_begin, output_end); + + std::random_device rd; + std::mt19937 gen{rd()}; + + switch (Dist) { + case dist_type::UNIQUE: + for (auto i = 0; i < num_keys; ++i) { + output_begin[i] = i; + } + break; + case dist_type::UNIFORM: + for (auto i = 0; i < num_keys; ++i) { + output_begin[i] = std::abs(static_cast(gen())); + } + break; + case dist_type::GAUSSIAN: + std::normal_distribution<> dg{1e9, 1e7}; + for (auto i = 0; i < num_keys; ++i) { + output_begin[i] = std::abs(static_cast(dg(gen))); + } + break; + } +} + +/** + * @brief Generates input sizes and hash table occupancies + * + */ +static void generate_size_and_occupancy(benchmark::internal::Benchmark* b) +{ + for (auto size = 4096; size <= 1 << 28; size *= 2) { + for (auto occupancy = 60; occupancy <= 60; occupancy += 10) { + b->Args({size, occupancy}); + } + } +} + +template typename ReductionOp> +static void BM_static_map_insert(::benchmark::State& state) +{ + using map_type = cuco::static_reduction_map, Key, Value>; + + std::size_t num_keys = state.range(0); + float occupancy = state.range(1) / float{100}; + std::size_t size = num_keys / occupancy; + + std::vector h_keys(num_keys); + std::vector> h_pairs(num_keys); + + generate_keys(h_keys.begin(), h_keys.end()); + + thrust::device_vector d_keys(h_keys); + thrust::device_vector d_values(h_keys); + + auto pairs_begin = + thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); + auto pairs_end = pairs_begin + num_keys; + + for (auto _ : state) { + map_type map{size, -1}; + { + cuda_event_timer raii{state}; + map.insert(pairs_begin, pairs_end); + } + } + + state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * + 
int64_t(state.range(0))); +} + +BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIQUE, cuco::reduce_add) + ->Unit(benchmark::kMillisecond) + ->UseManualTime() + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIFORM, cuco::reduce_add) + ->Unit(benchmark::kMillisecond) + ->UseManualTime() + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::GAUSSIAN, cuco::reduce_add) + ->Unit(benchmark::kMillisecond) + ->UseManualTime() + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIQUE, cuco::reduce_add) + ->Unit(benchmark::kMillisecond) + ->UseManualTime() + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIFORM, cuco::reduce_add) + ->Unit(benchmark::kMillisecond) + ->UseManualTime() + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::GAUSSIAN, cuco::reduce_add) + ->Unit(benchmark::kMillisecond) + ->UseManualTime() + ->Apply(generate_size_and_occupancy); \ No newline at end of file diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a70b53da8..be1a760e6 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -15,7 +15,7 @@ endif() function(ConfigureExample EXAMPLE_NAME EXAMPLE_SRC) add_executable(${EXAMPLE_NAME} "${EXAMPLE_SRC}") set_target_properties(${EXAMPLE_NAME} PROPERTIES - CUDA_ARCHITECTURES ${GPU_ARCHS} + CUDA_ARCHITECTURES "${GPU_ARCHS}" RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/examples") target_include_directories(${EXAMPLE_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") @@ -28,3 +28,5 @@ endfunction(ConfigureExample) ################################################################################################### ConfigureExample(STATIC_MAP_EXAMPLE 
"${CMAKE_CURRENT_SOURCE_DIR}/static_map/static_map_example.cu") + +ConfigureExample(STATIC_REDUCTION_MAP_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_reduction_map.cu") diff --git a/examples/static_reduction_map.cu b/examples/static_reduction_map.cu new file mode 100644 index 000000000..f152ceb78 --- /dev/null +++ b/examples/static_reduction_map.cu @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +/** + * @file host_bulk_example.cu + * @brief Demonstrates usage of the static_map "bulk" host APIs. + * + * The bulk APIs are only invocable from the host and are used for doing operations like insert or + * find on a set of keys. + * + */ + +int main(void) +{ + using Key = int; + using Value = int; + + // Empty slots are represented by reserved "sentinel" values. These values should be selected such + // that they never occur in your input data. + Key const empty_key_sentinel = -1; + + // Number of key/value pairs to be inserted + std::size_t num_keys = 257; + + // Compute capacity based on a 50% load factor + auto const load_factor = 0.5; + std::size_t const capacity = std::ceil(num_keys / load_factor); + + // Constructs a map each key with "capacity" slots using -1 as the + // empty key sentinel. 
The initial payload value for empty slots is determined by the identity of + // the reduction operation. By using the `reduce_add` operation, all values associated with a + // given key will be summed. + cuco::static_reduction_map, Key, Value> map{capacity, empty_key_sentinel}; + + // Create a sequence of random keys in `[0, num_keys/2]` + thrust::device_vector insert_keys(num_keys); + thrust::transform(thrust::device, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(insert_keys.size()), + insert_keys.begin(), + [=] __device__(auto i) { + thrust::default_random_engine rng; + thrust::uniform_int_distribution dist{0, 10}; + rng.discard(i); + return dist(rng); + }); + + // Insert each key with a payload of `1` to count the number of times each key was inserted by + // using the `reduce_add` op + auto zipped = thrust::make_zip_iterator( + thrust::make_tuple(insert_keys.begin(), thrust::make_constant_iterator(1))); + + // Inserts all pairs into the map, accumulating the payloads with the `reduce_add` operation + map.insert(zipped, zipped + insert_keys.size()); + + std::cout << "Num unique keys: " << map.get_size() << std::endl; + + thrust::device_vector unique_keys(map.get_size()); + thrust::device_vector count_per_key(map.get_size()); + + map.retrieve_all(unique_keys.begin(), count_per_key.begin()); + + for (int i = 0; i < unique_keys.size(); ++i) { + std::cout << "Key: " << unique_keys[i] << " Count: " << count_per_key[i] << std::endl; + } +} \ No newline at end of file diff --git a/include/cuco/detail/pair.cuh b/include/cuco/detail/pair.cuh index da50f7258..dfdf7632e 100644 --- a/include/cuco/detail/pair.cuh +++ b/include/cuco/detail/pair.cuh @@ -68,8 +68,8 @@ struct is_thrust_pair_like_impl : std::false_type { template struct is_thrust_pair_like_impl(std::declval())), - decltype(thrust::get<1>(std::declval()))>> + std::void_t(std::declval())), + decltype(thrust::get<1>(std::declval()))>> : std::conditional_t::value == 2, std::true_type, 
std::false_type> { }; diff --git a/include/cuco/detail/static_reduction_map.inl b/include/cuco/detail/static_reduction_map.inl new file mode 100644 index 000000000..bd9907ebc --- /dev/null +++ b/include/cuco/detail/static_reduction_map.inl @@ -0,0 +1,460 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +namespace cuco { + +/**---------------------------------------------------------------------------* + * @brief Enumeration of the possible results of attempting to insert into + *a hash bucket + *---------------------------------------------------------------------------**/ +enum class insert_result { + CONTINUE, ///< Insert did not succeed, continue trying to insert + SUCCESS, ///< New pair inserted successfully + DUPLICATE ///< Insert did not succeed, key is already present +}; + +template +static_reduction_map::static_reduction_map( + std::size_t capacity, Key empty_key_sentinel, ReductionOp reduction_op, Allocator const& alloc) + : capacity_{capacity}, + empty_key_sentinel_{empty_key_sentinel}, + empty_value_sentinel_{ReductionOp::identity}, + op_{reduction_op}, + slot_allocator_{alloc} +{ + slots_ = std::allocator_traits::allocate(slot_allocator_, capacity); + + auto constexpr block_size = 256; + auto constexpr stride = 4; + auto const grid_size = (capacity + stride * block_size - 1) / (stride * block_size); + detail::initialize<<>>( + slots_, get_empty_key_sentinel(), get_empty_value_sentinel(), 
get_capacity()); + + CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type))); +} + +template +static_reduction_map::~static_reduction_map() +{ + std::allocator_traits::deallocate(slot_allocator_, slots_, capacity_); + CUCO_CUDA_TRY(cudaFree(num_successes_)); +} + +template +template +void static_reduction_map::insert(InputIt first, + InputIt last, + Hash hash, + KeyEqual key_equal) +{ + auto num_keys = std::distance(first, last); + auto const block_size = 128; + auto const stride = 1; + auto const tile_size = 4; + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + auto view = get_device_mutable_view(); + + *num_successes_ = 0; + int device_id; + CUCO_CUDA_TRY(cudaGetDevice(&device_id)); + CUCO_CUDA_TRY(cudaMemPrefetchAsync(num_successes_, sizeof(atomic_ctr_type), device_id)); + + detail::insert + <<>>(first, first + num_keys, num_successes_, view, hash, key_equal); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + + size_ += num_successes_->load(cuda::std::memory_order_relaxed); +} + +template +template +void static_reduction_map::find( + InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) noexcept +{ + auto num_keys = std::distance(first, last); + auto const block_size = 128; + auto const stride = 1; + auto const tile_size = 4; + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + auto view = get_device_view(); + + detail::find + <<>>(first, last, output_begin, view, hash, key_equal); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); +} + +namespace detail { +template +struct slot_to_tuple { + template + __device__ thrust::tuple operator()(S const& s) + { + return thrust::tuple(s.first, s.second); + } +}; + +template +struct slot_is_filled { + Key empty_key_sentinel; + template + __device__ bool operator()(S const& s) + { + return thrust::get<0>(s) != empty_key_sentinel; + } +}; +} // namespace detail + +template +template +void 
static_reduction_map::retrieve_all( + KeyOut keys_out, ValueOut values_out) +{ + // Convert pair_type to thrust::tuple to allow assigning to a zip iterator + auto begin = thrust::make_transform_iterator(raw_slots_begin(), detail::slot_to_tuple{}); + auto end = begin + get_capacity(); + auto filled = detail::slot_is_filled{get_empty_key_sentinel()}; + auto zipped_out = thrust::make_zip_iterator(thrust::make_tuple(keys_out, values_out)); + + thrust::copy_if(thrust::device, begin, end, zipped_out, filled); +} + +template +template +void static_reduction_map::contains( + InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) noexcept +{ + auto num_keys = std::distance(first, last); + auto const block_size = 128; + auto const stride = 1; + auto const tile_size = 4; + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + auto view = get_device_view(); + + detail::contains + <<>>(first, last, output_begin, view, hash, key_equal); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); +} + +template +template +__device__ Value +static_reduction_map::device_mutable_view::insert( + value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept +{ + auto current_slot{initial_slot(insert_pair.first, hash)}; + + while (true) { + using cuda::std::memory_order_relaxed; + auto expected_key = this->get_empty_key_sentinel(); + auto expected_value = this->get_empty_value_sentinel(); + auto& slot_key = current_slot->first; + auto& slot_value = current_slot->second; + + auto const key_success = + slot_key.compare_exchange_strong(expected_key, insert_pair.first, memory_order_relaxed); + + if (key_success or key_equal(insert_pair.first, expected_key)) { + return this->get_op().apply(slot_value, insert_pair.second); + } + + // if we couldn't insert the key, but it wasn't a duplicate, then there must + // have been some other key there, so we keep looking for a slot + current_slot = next_slot(current_slot); + } +} + 
+template +template +__device__ bool +static_reduction_map::device_mutable_view::insert( + CG g, value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept +{ + auto current_slot = initial_slot(g, insert_pair.first, hash); + + while (true) { + auto& slot_key = current_slot->first; + auto& slot_value = current_slot->second; + auto const current_key = slot_key.load(cuda::std::memory_order_relaxed); + + // The user provided `key_equal` should never be used to compare against `empty_key_sentinel` as + // the sentinel is not a valid key value. Therefore, first check for the sentinel + // TODO: Use memcmp + auto const slot_is_empty = (current_key == this->get_empty_key_sentinel()); + + auto const key_exists = not slot_is_empty and key_equal(current_key, insert_pair.first); + + // Key already exists, aggregate with it's value + if (key_exists) { this->get_op().apply(slot_value, insert_pair.second); } + + // If key already exists in the CG window, all threads exit + if (g.ballot(key_exists)) { return false; } + + auto const window_empty_mask = g.ballot(slot_is_empty); + + if (window_empty_mask) { + // the first lane in the group with an empty slot will attempt the insert + auto const src_lane = __ffs(window_empty_mask) - 1; + + auto const attempt_update = [&]() { + auto expected_key = this->get_empty_key_sentinel(); + + auto const key_success = slot_key.compare_exchange_strong( + expected_key, insert_pair.first, cuda::memory_order_relaxed); + + if (key_success or key_equal(insert_pair.first, expected_key)) { + this->get_op().apply(slot_value, insert_pair.second); + return key_success ? insert_result::SUCCESS : insert_result::DUPLICATE; + } + return insert_result::CONTINUE; + }; + + auto const update_result = + (g.thread_rank() == src_lane) ? 
attempt_update() : insert_result::CONTINUE; + + auto const window_result = g.shfl(update_result, src_lane); + + // If the update succeeded, the thread group exits + if (window_result != insert_result::CONTINUE) { + return (window_result == insert_result::SUCCESS); + } + + // A different key took the current slot. Look for an empty slot in the current window + } else { + // No empty slots in the current window, move onto the next window + current_slot = next_slot(g, current_slot); + } + } +} + +template +template +__device__ + typename static_reduction_map::device_view::iterator + static_reduction_map::device_view::find( + Key const& k, Hash hash, KeyEqual key_equal) noexcept +{ + auto current_slot = initial_slot(k, hash); + + while (true) { + auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); + // Key doesn't exist, return end() + if (existing_key == this->get_empty_key_sentinel()) { return this->end(); } + + // Key exists, return iterator to location + if (key_equal(existing_key, k)) { return current_slot; } + + current_slot = next_slot(current_slot); + } +} + +template +template +__device__ typename static_reduction_map::device_view:: + const_iterator + static_reduction_map::device_view::find( + Key const& k, Hash hash, KeyEqual key_equal) const noexcept +{ + auto current_slot = initial_slot(k, hash); + + while (true) { + auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); + // Key doesn't exist, return end() + if (existing_key == this->get_empty_key_sentinel()) { return this->end(); } + + // Key exists, return iterator to location + if (key_equal(existing_key, k)) { return current_slot; } + + current_slot = next_slot(current_slot); + } +} + +template +template +__device__ + typename static_reduction_map::device_view::iterator + static_reduction_map::device_view::find( + CG g, Key const& k, Hash hash, KeyEqual key_equal) noexcept +{ + auto current_slot = initial_slot(g, k, hash); + + while (true) 
{ + auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); + + // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as + // the sentinel is not a valid key value. Therefore, first check for the sentinel + auto const slot_is_empty = (existing_key == this->get_empty_key_sentinel()); + + // the key we were searching for was found by one of the threads, + // so we return an iterator to the entry + auto const exists = g.ballot(not slot_is_empty and key_equal(existing_key, k)); + if (exists) { + uint32_t src_lane = __ffs(exists) - 1; + // TODO: This shouldn't cast an iterator to an int to shuffle. Instead, get the index of the + // current_slot and shuffle that instead. + intptr_t res_slot = g.shfl(reinterpret_cast(current_slot), src_lane); + return reinterpret_cast(res_slot); + } + + // we found an empty slot, meaning that the key we're searching for isn't present + if (g.ballot(slot_is_empty)) { return this->end(); } + + // otherwise, all slots in the current window are full with other keys, so we move onto the + // next window + current_slot = next_slot(g, current_slot); + } +} + +template +template +__device__ typename static_reduction_map::device_view:: + const_iterator + static_reduction_map::device_view::find( + CG g, Key const& k, Hash hash, KeyEqual key_equal) const noexcept +{ + auto current_slot = initial_slot(g, k, hash); + + while (true) { + auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); + + // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as + // the sentinel is not a valid key value. 
Therefore, first check for the sentinel + auto const slot_is_empty = (existing_key == this->get_empty_key_sentinel()); + + // the key we were searching for was found by one of the threads, so we return an iterator to + // the entry + auto const exists = g.ballot(not slot_is_empty and key_equal(existing_key, k)); + if (exists) { + uint32_t src_lane = __ffs(exists) - 1; + // TODO: This shouldn't cast an iterator to an int to shuffle. Instead, get the index of the + // current_slot and shuffle that instead. + intptr_t res_slot = g.shfl(reinterpret_cast(current_slot), src_lane); + return reinterpret_cast(res_slot); + } + + // we found an empty slot, meaning that the key we're searching + // for isn't in this submap, so we should move onto the next one + if (g.ballot(slot_is_empty)) { return this->end(); } + + // otherwise, all slots in the current window are full with other keys, + // so we move onto the next window in the current submap + + current_slot = next_slot(g, current_slot); + } +} + +template +template +__device__ bool +static_reduction_map::device_view::contains( + Key const& k, Hash hash, KeyEqual key_equal) noexcept +{ + auto current_slot = initial_slot(k, hash); + + while (true) { + auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); + + if (existing_key == empty_key_sentinel_) { return false; } + + if (key_equal(existing_key, k)) { return true; } + + current_slot = next_slot(current_slot); + } +} + +template +template +__device__ bool +static_reduction_map::device_view::contains( + CG g, Key const& k, Hash hash, KeyEqual key_equal) noexcept +{ + auto current_slot = initial_slot(g, k, hash); + + while (true) { + key_type const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); + + // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as + // the sentinel is not a valid key value. 
Therefore, first check for the sentinel + auto const slot_is_empty = (existing_key == this->get_empty_key_sentinel()); + + // the key we were searching for was found by one of the threads, so we return an iterator to + // the entry + if (g.ballot(not slot_is_empty and key_equal(existing_key, k))) { return true; } + + // we found an empty slot, meaning that the key we're searching for isn't present + if (g.ballot(slot_is_empty)) { return false; } + + // otherwise, all slots in the current window are full with other keys, so we move onto the + // next window + current_slot = next_slot(g, current_slot); + } +} +} // namespace cuco diff --git a/include/cuco/detail/static_reduction_map_kernels.cuh b/include/cuco/detail/static_reduction_map_kernels.cuh new file mode 100644 index 000000000..9849efb44 --- /dev/null +++ b/include/cuco/detail/static_reduction_map_kernels.cuh @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +namespace cuco { +namespace detail { +namespace cg = cooperative_groups; + +/** + * @brief Initializes each slot in the flat `slots` storage to contain `k` and `v`. + * + * Each space in `slots` that can hold a key value pair is initialized to a + * `pair_atomic_type` containing the key `k` and the value `v`. 
+ * + * @tparam atomic_key_type Type of the `Key` atomic container + * @tparam atomic_mapped_type Type of the `Value` atomic container + * @tparam Key key type + * @tparam Value value type + * @tparam pair_atomic_type key/value pair type + * @param slots Pointer to flat storage for the map's key/value pairs + * @param k Key to which all keys in `slots` are initialized + * @param v Value to which all values in `slots` are initialized + * @param size Size of the storage pointed to by `slots` + */ +template +__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, std::size_t size) +{ + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + while (tid < size) { + new (&slots[tid].first) atomic_key_type{k}; + new (&slots[tid].second) atomic_mapped_type{v}; + tid += gridDim.x * blockDim.x; + } +} + +/** + * @brief Inserts all key/value pairs in the range `[first, last)`. + * + * If multiple keys in `[first, last)` compare equal, it is unspecified which + * element is inserted. + * + * @tparam block_size + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `value_type` + * @tparam atomicT Type of atomic storage + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param num_successes The number of successfully inserted key/value pairs + * @param view Mutable device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function used to compare two keys for equality + */ +template +__global__ void insert( + InputIt first, InputIt last, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + 
std::size_t thread_num_successes = 0; + + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto it = first + tid; + + while (it < last) { + typename viewT::value_type const insert_pair{*it}; + if (view.insert(insert_pair, hash, key_equal)) { thread_num_successes++; } + it += gridDim.x * blockDim.x; + } + + // compute number of successfully inserted elements for each block + // and atomically add to the grand total + std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { *num_successes += block_num_successes; } +} + +/** + * @brief Inserts all key/value pairs in the range `[first, last)`. + * + * If multiple keys in `[first, last)` compare equal, it is unspecified which + * element is inserted. Uses the CUDA Cooperative Groups API to leverage groups + * of multiple threads to perform each key/value insertion. This provides a + * significant boost in throughput compared to the non Cooperative Group + * `insert` at moderate to high load factors. 
+ * + * @tparam block_size + * @tparam tile_size The number of threads in the Cooperative Groups used to perform + * inserts + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `value_type` + * @tparam atomicT Type of atomic storage + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param num_successes The number of successfully inserted key/value pairs + * @param view Mutable device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function used to compare two keys for equality + */ +template +__global__ void insert( + InputIt first, InputIt last, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + std::size_t thread_num_successes = 0; + + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto it = first + tid / tile_size; + + while (it < last) { + // force conversion to value_type + typename viewT::value_type const insert_pair{ + static_cast(thrust::get<0>(*it)), + static_cast(thrust::get<1>(*it))}; + + if (view.insert(tile, insert_pair, hash, key_equal) && tile.thread_rank() == 0) { + thread_num_successes++; + } + it += (gridDim.x * blockDim.x) / tile_size; + } + + // compute number of successfully inserted elements for each block + // and atomically add to the grand total + std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { *num_successes += block_num_successes; } +} + +/** + * @brief Finds the values corresponding to all keys in the range `[first, last)`. 
+ * + * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`. + * Else, copies the empty value sentinel. + * @tparam block_size The size of the thread block + * @tparam Value The type of the mapped value for the map + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam OutputIt Device accessible output iterator whose `value_type` is + * convertible to the map's `mapped_type` + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of values retrieved for each key + * @param view Device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ +template +__global__ void find( + InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) +{ + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid; + __shared__ Value writeBuffer[block_size]; + + while (first + key_idx < last) { + auto key = *(first + key_idx); + auto found = view.find(key, hash, key_equal); + + /* + * The ld.relaxed.gpu instruction used in view.find causes L1 to + * flush more frequently, causing increased sector stores from L2 to global memory. + * By writing results to shared memory and then synchronizing before writing back + * to global, we no longer rely on L1, preventing the increase in sector stores from + * L2 to global and improving performance. 
+ */ + writeBuffer[threadIdx.x] = found->second.load(cuda::std::memory_order_relaxed); + __syncthreads(); + *(output_begin + key_idx) = writeBuffer[threadIdx.x]; + key_idx += gridDim.x * blockDim.x; + } +} + +/** + * @brief Finds the values corresponding to all keys in the range `[first, last)`. + * + * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`. + * Else, copies the empty value sentinel. Uses the CUDA Cooperative Groups API to leverage groups + * of multiple threads to find each key. This provides a significant boost in throughput compared + * to the non Cooperative Group `find` at moderate to high load factors. + * + * @tparam block_size The size of the thread block + * @tparam tile_size The number of threads in the Cooperative Groups used to perform + * inserts + * @tparam Value The type of the mapped value for the map + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam OutputIt Device accessible output iterator whose `value_type` is + * convertible to the map's `mapped_type` + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of values retrieved for each key + * @param view Device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ +template +__global__ void find( + InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) +{ + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; + __shared__ Value writeBuffer[block_size]; + + while 
(first + key_idx < last) { + auto key = *(first + key_idx); + auto found = view.find(tile, key, hash, key_equal); + + /* + * The ld.relaxed.gpu instruction used in view.find causes L1 to + * flush more frequently, causing increased sector stores from L2 to global memory. + * By writing results to shared memory and then synchronizing before writing back + * to global, we no longer rely on L1, preventing the increase in sector stores from + * L2 to global and improving performance. + */ + if (tile.thread_rank() == 0) { + writeBuffer[threadIdx.x / tile_size] = found->second.load(cuda::std::memory_order_relaxed); + } + __syncthreads(); + if (tile.thread_rank() == 0) { + *(output_begin + key_idx) = writeBuffer[threadIdx.x / tile_size]; + } + key_idx += (gridDim.x * blockDim.x) / tile_size; + } +} + +/** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. + * + * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. + * + * @tparam block_size The size of the thread block + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam OutputIt Device accessible output iterator whose `value_type` is + * convertible to the map's `mapped_type` + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param view Device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ +template +__global__ void contains( + InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) +{ + auto tid = 
blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid; + __shared__ bool writeBuffer[block_size]; + + while (first + key_idx < last) { + auto key = *(first + key_idx); + + /* + * The ld.relaxed.gpu instruction used in view.find causes L1 to + * flush more frequently, causing increased sector stores from L2 to global memory. + * By writing results to shared memory and then synchronizing before writing back + * to global, we no longer rely on L1, preventing the increase in sector stores from + * L2 to global and improving performance. + */ + writeBuffer[threadIdx.x] = view.contains(key, hash, key_equal); + __syncthreads(); + *(output_begin + key_idx) = writeBuffer[threadIdx.x]; + key_idx += gridDim.x * blockDim.x; + } +} + +/** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. + * + * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. + * Uses the CUDA Cooperative Groups API to leverage groups of multiple threads to perform the + * contains operation for each key. This provides a significant boost in throughput compared + * to the non Cooperative Group `contains` at moderate to high load factors. 
+ * + * @tparam block_size The size of the thread block + * @tparam tile_size The number of threads in the Cooperative Groups used to perform + * inserts + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam OutputIt Device accessible output iterator whose `value_type` is + * convertible to the map's `mapped_type` + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param view Device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ +template +__global__ void contains( + InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) +{ + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; + __shared__ bool writeBuffer[block_size]; + + while (first + key_idx < last) { + auto key = *(first + key_idx); + auto found = view.contains(tile, key, hash, key_equal); + + /* + * The ld.relaxed.gpu instruction used in view.find causes L1 to + * flush more frequently, causing increased sector stores from L2 to global memory. + * By writing results to shared memory and then synchronizing before writing back + * to global, we no longer rely on L1, preventing the increase in sector stores from + * L2 to global and improving performance. 
+ */ + if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found; } + __syncthreads(); + if (tile.thread_rank() == 0) { + *(output_begin + key_idx) = writeBuffer[threadIdx.x / tile_size]; + } + key_idx += (gridDim.x * blockDim.x) / tile_size; + } +} + +} // namespace detail +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/static_reduction_map.cuh b/include/cuco/static_reduction_map.cuh new file mode 100644 index 000000000..c66958fb8 --- /dev/null +++ b/include/cuco/static_reduction_map.cuh @@ -0,0 +1,1022 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#ifndef CUDART_VERSION +#error CUDART_VERSION Undefined! 
#elif (CUDART_VERSION >= 11000)  // including with CUDA 10.2 leads to compilation errors
// NOTE(review): include target stripped in extraction — confirm upstream.
#include <cuda/std/tuple>
#endif

// NOTE(review): the four include targets below were stripped in extraction;
// reconstructed guesses based on names used in this header — confirm upstream.
#include <cooperative_groups.h>
#include <cub/cub.cuh>
#include <thrust/functional.h>
#include <thrust/pair.h>

namespace cuco {

/**
 * @brief Reduction functor that accumulates values into a slot via atomic addition.
 *
 * `identity` is the value empty slots are initialized to; it is the identity
 * element of the reduction (0 for addition).
 *
 * @tparam T Value type stored in the map's slots
 */
// NOTE(review): all functor template parameter lists and the atomic template
// arguments below were stripped in extraction; reconstructed so `Scope` and
// the value argument type `T2` are deduced — confirm against upstream.
template <typename T>
struct reduce_add {
  using value_type            = T;
  static constexpr T identity = 0;

  /// Atomically performs `slot += value`; returns the pre-update slot value.
  template <cuda::thread_scope Scope, typename T2>
  __device__ T apply(cuda::atomic<T, Scope>& slot, T2 const& value) const
  {
    return slot.fetch_add(value, cuda::memory_order_relaxed);
  }
};

/**
 * @brief Reduction functor that accumulates values into a slot via atomic subtraction.
 *
 * @tparam T Value type stored in the map's slots
 */
template <typename T>
struct reduce_sub {
  using value_type            = T;
  static constexpr T identity = 0;

  /// Atomically performs `slot -= value`; returns the pre-update slot value.
  template <cuda::thread_scope Scope, typename T2>
  __device__ T apply(cuda::atomic<T, Scope>& slot, T2 const& value) const
  {
    return slot.fetch_sub(value, cuda::memory_order_relaxed);
  }
};

/**
 * @brief Reduction functor that keeps the minimum of all values reduced into a slot.
 *
 * `identity` is `numeric_limits<T>::max()` so any real value replaces an empty slot.
 *
 * @tparam T Value type stored in the map's slots
 */
// NOTE(review): fetch_min/fetch_max availability depends on libcu++'s
// cuda::atomic support for T — confirm T is an integral type upstream.
template <typename T>
struct reduce_min {
  using value_type            = T;
  static constexpr T identity = std::numeric_limits<T>::max();

  /// Atomically performs `slot = min(slot, value)`; returns the pre-update slot value.
  template <cuda::thread_scope Scope, typename T2>
  __device__ T apply(cuda::atomic<T, Scope>& slot, T2 const& value) const
  {
    return slot.fetch_min(value, cuda::memory_order_relaxed);
  }
};

/**
 * @brief Reduction functor that keeps the maximum of all values reduced into a slot.
 *
 * `identity` is `numeric_limits<T>::lowest()` so any real value replaces an empty slot.
 *
 * @tparam T Value type stored in the map's slots
 */
template <typename T>
struct reduce_max {
  using value_type            = T;
  static constexpr T identity = std::numeric_limits<T>::lowest();

  /// Atomically performs `slot = max(slot, value)`; returns the pre-update slot value.
  template <cuda::thread_scope Scope, typename T2>
  __device__ T apply(cuda::atomic<T, Scope>& slot, T2 const& value) const
  {
    return slot.fetch_max(value, cuda::memory_order_relaxed);
  }
};

/**
 * @brief Reduction functor that applies an arbitrary binary operation `Op`
 * via a compare-and-swap retry loop.
 *
 * @tparam T Value type stored in the map's slots
 * @tparam Identity Identity element of `Op`, used to initialize empty slots
 * @tparam Op Binary callable computing `op(current, value)`
 */
// NOTE(review): parameter order reconstructed (list stripped in extraction) —
// confirm against upstream.
template <typename T, T Identity, typename Op>
struct custom_op {
  using value_type            = T;
  static constexpr T identity = Identity;

  Op op;

  /**
   * @brief Atomically replaces `slot` with `op(slot, value)`.
   *
   * Uses `compare_exchange_weak` in the retry loop: a spurious failure simply
   * retries with the refreshed `old`, and the weak form can be cheaper than
   * the strong form inside a loop.
   *
   * @return The slot value observed immediately before the successful update
   */
  template <cuda::thread_scope Scope, typename T2>
  __device__ T apply(cuda::atomic<T, Scope>& slot, T2 const& value) const
  {
    auto old = slot.load(cuda::memory_order_relaxed);
    // compare_exchange refreshes `old` on failure, so `op` is always applied
    // to the latest observed slot value.
    while (not slot.compare_exchange_weak(old, op(old, value), cuda::memory_order_relaxed)) {}
    return old;
  }
};

/**
 * @brief A GPU-accelerated, unordered, associative container of key-value
 * pairs with unique keys.
 *
 * Allows constant time concurrent inserts or concurrent find operations (not
 * concurrent insert and find) from threads in device code.
+ * + * Current limitations: + * - Requires keys that are Arithmetic + * - Does not support erasing keys + * - Capacity is fixed and will not grow automatically + * - Requires the user to specify sentinel values for both key and mapped value + * to indicate empty slots + * - Does not support concurrent insert and find operations + * + * The `static_reduction_map` supports two types of operations: + * - Host-side "bulk" operations + * - Device-side "singular" operations + * + * The host-side bulk operations include `insert`, `find`, and `contains`. These + * APIs should be used when there are a large number of keys to insert or lookup + * in the map. For example, given a range of keys specified by device-accessible + * iterators, the bulk `insert` function will insert all keys into the map. + * + * The singular device-side operations allow individual threads to to perform + * independent insert or find/contains operations from device code. These + * operations are accessed through non-owning, trivially copyable "view" types: + * `device_view` and `mutable_device_view`. The `device_view` class is an + * immutable view that allows only non-modifying operations such as `find` or + * `contains`. The `mutable_device_view` class only allows `insert` operations. + * The two types are separate to prevent erroneous concurrent insert/find + * operations. + * + * Example: + * \code{.cpp} + * int empty_key_sentinel = -1; + * int empty_value_sentine = -1; + * + * // Constructs a map with 100,000 slots using -1 and -1 as the empty key/value + * // sentinels. Note the capacity is chosen knowing we will insert 50,000 keys, + * // for an load factor of 50%. + * static_reduction_map m{100'000, empty_key_sentinel, empty_value_sentinel}; + * + * // Create a sequence of pairs {{0,0}, {1,1}, ... 
{i,i}} + * thrust::device_vector> pairs(50,000); + * thrust::transform(thrust::make_counting_iterator(0), + * thrust::make_counting_iterator(pairs.size()), + * pairs.begin(), + * []__device__(auto i){ return thrust::make_pair(i,i); }; + * + * + * // Inserts all pairs into the map + * m.insert(pairs.begin(), pairs.end()); + * + * // Get a `device_view` and passes it to a kernel where threads may perform + * // `find/contains` lookups + * kernel<<<...>>>(m.get_device_view()); + * \endcode + * + * + * @tparam Key Arithmetic type used for key + * @tparam Value Type of the mapped values + * @tparam Scope The scope in which insert/find operations will be performed by + * individual threads. + * @tparam Allocator Type of allocator used for device storage + */ +template > +class static_reduction_map { + static_assert(std::is_arithmetic::value, "Unsupported, non-arithmetic key type."); + static_assert(std::is_same::value, + "Type mismatch between ReductionOp::value_type and Value"); + + public: + using value_type = cuco::pair_type; + using key_type = Key; + using mapped_type = Value; + using atomic_key_type = cuda::atomic; + using atomic_mapped_type = cuda::atomic; + using pair_atomic_type = cuco::pair_type; + using atomic_ctr_type = cuda::atomic; + using allocator_type = Allocator; + using slot_allocator_type = + typename std::allocator_traits::rebind_alloc; + + static_reduction_map(static_reduction_map const&) = delete; + static_reduction_map(static_reduction_map&&) = delete; + static_reduction_map& operator=(static_reduction_map const&) = delete; + static_reduction_map& operator=(static_reduction_map&&) = delete; + + /** + * @brief Construct a fixed-size map with the specified capacity and sentinel values. + * @brief Construct a statically sized map with the specified number of slots + * and sentinel values. + * + * The capacity of the map is fixed. Insert operations will not automatically + * grow the map. 
Attempting to insert more unique keys than the capacity of + * the map results in undefined behavior. + * + * Performance begins to degrade significantly beyond a load factor of ~70%. + * For best performance, choose a capacity that will keep the load factor + * below 70%. E.g., if inserting `N` unique keys, choose a capacity of + * `N * (1/0.7)`. + * + * The `empty_key_sentinel` and `empty_value_sentinel` values are reserved and + * undefined behavior results from attempting to insert any key/value pair + * that contains either. + * + * @param capacity The total number of slots in the map + * @param empty_key_sentinel The reserved key value for empty slots + * @param empty_value_sentinel The reserved mapped value for empty slots + * @param alloc Allocator used for allocating device storage + */ + static_reduction_map(std::size_t capacity, + Key empty_key_sentinel, + ReductionOp reduction_op = {}, + Allocator const& alloc = Allocator{}); + + /** + * @brief Destroys the map and frees its contents. + * + */ + ~static_reduction_map(); + + /** + * @brief Inserts all key/value pairs in the range `[first, last)`. + * + * If multiple keys in `[first, last)` compare equal, it is unspecified which + * element is inserted. + * + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `value_type` + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ + template , + typename KeyEqual = thrust::equal_to> + void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}); + + /** + * @brief Finds the values corresponding to all keys in the range `[first, last)`. 
+ * + * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + + * i)`. Else, copies the empty value sentinel. + * + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam OutputIt Device accessible output iterator whose `value_type` is + * convertible to the map's `mapped_type` + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of values retrieved for each key + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ + template , + typename KeyEqual = thrust::equal_to> + void find(InputIt first, + InputIt last, + OutputIt output_begin, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; + + /** + * @brief Retrieves all of the keys and their associated values. + * + * The order in which keys are returned is implementation defined and not guaranteed to be + * consistent between subsequent calls to `retrieve_all`. + * + * Behavior is undefined if the range beginning at `keys_out` or `values_out` is not large enough + * to contain the number of keys in the map. + * + * @tparam KeyOut Device accessible random access output iterator whose `value_type` is + * convertible from `key_type`. + * @tparam ValueOut Device accesible random access output iterator whose `value_type` is + * convertible from `mapped_type`. + * @param keys_out Beginning output iterator for keys + * @param values_out Beginning output iterator for values + */ + template + void retrieve_all(KeyOut keys_out, ValueOut values_out); + + /** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. + * + * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. 
+ * + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam OutputIt Device accessible output iterator whose `value_type` is + * convertible to the map's `mapped_type` + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ + template , + typename KeyEqual = thrust::equal_to> + void contains(InputIt first, + InputIt last, + OutputIt output_begin, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; + + private: + class device_view_base { + protected: + // Import member type definitions from `static_reduction_map` + using value_type = value_type; + using key_type = Key; + using mapped_type = Value; + using iterator = pair_atomic_type*; + using const_iterator = pair_atomic_type const*; + + private: + pair_atomic_type* slots_{}; ///< Pointer to flat slots storage + std::size_t capacity_{}; ///< Total number of slots + Key empty_key_sentinel_{}; ///< Key value that represents an empty slot + Value empty_value_sentinel_{}; ///< Initial Value of empty slot + ReductionOp op_{}; ///< Binary operation reduction function object + + protected: + __host__ __device__ device_view_base(pair_atomic_type* slots, + std::size_t capacity, + Key empty_key_sentinel, + ReductionOp reduction_op) noexcept + : slots_{slots}, + capacity_{capacity}, + empty_key_sentinel_{empty_key_sentinel}, + empty_value_sentinel_{ReductionOp::identity}, + op_{reduction_op} + { + } + + /** + * @brief Gets the binary op + * + */ + __device__ ReductionOp get_op() const { return op_; } + + /** + * @brief Gets slots array. 
+ * + * @return Slots array + */ + __device__ pair_atomic_type* get_slots() noexcept { return slots_; } + + /** + * @brief Gets slots array. + * + * @return Slots array + */ + __device__ pair_atomic_type const* get_slots() const noexcept { return slots_; } + + /** + * @brief Returns the initial slot for a given key `k` + * + * @tparam Hash Unary callable type + * @param k The key to get the slot for + * @param hash The unary callable used to hash the key + * @return Pointer to the initial slot for `k` + */ + template + __device__ iterator initial_slot(Key const& k, Hash hash) noexcept + { + return &slots_[hash(k) % capacity_]; + } + + /** + * @brief Returns the initial slot for a given key `k` + * + * @tparam Hash Unary callable type + * @param k The key to get the slot for + * @param hash The unary callable used to hash the key + * @return Pointer to the initial slot for `k` + */ + template + __device__ const_iterator initial_slot(Key const& k, Hash hash) const noexcept + { + return &slots_[hash(k) % capacity_]; + } + + /** + * @brief Returns the initial slot for a given key `k` + * + * To be used for Cooperative Group based probing. + * + * @tparam CG Cooperative Group type + * @tparam Hash Unary callable type + * @param g the Cooperative Group for which the initial slot is needed + * @param k The key to get the slot for + * @param hash The unary callable used to hash the key + * @return Pointer to the initial slot for `k` + */ + template + __device__ iterator initial_slot(CG g, Key const& k, Hash hash) noexcept + { + return &slots_[(hash(k) + g.thread_rank()) % capacity_]; + } + + /** + * @brief Returns the initial slot for a given key `k` + * + * To be used for Cooperative Group based probing. 
+ * + * @tparam CG Cooperative Group type + * @tparam Hash Unary callable type + * @param g the Cooperative Group for which the initial slot is needed + * @param k The key to get the slot for + * @param hash The unary callable used to hash the key + * @return Pointer to the initial slot for `k` + */ + template + __device__ const_iterator initial_slot(CG g, Key const& k, Hash hash) const noexcept + { + return &slots_[(hash(k) + g.thread_rank()) % capacity_]; + } + + /** + * @brief Given a slot `s`, returns the next slot. + * + * If `s` is the last slot, wraps back around to the first slot. + * + * @param s The slot to advance + * @return The next slot after `s` + */ + __device__ iterator next_slot(iterator s) noexcept { return (++s < end()) ? s : begin_slot(); } + + /** + * @brief Given a slot `s`, returns the next slot. + * + * If `s` is the last slot, wraps back around to the first slot. + * + * @param s The slot to advance + * @return The next slot after `s` + */ + __device__ const_iterator next_slot(const_iterator s) const noexcept + { + return (++s < end()) ? s : begin_slot(); + } + + /** + * @brief Given a slot `s`, returns the next slot. + * + * If `s` is the last slot, wraps back around to the first slot. To + * be used for Cooperative Group based probing. + * + * @tparam CG The Cooperative Group type + * @param g The Cooperative Group for which the next slot is needed + * @param s The slot to advance + * @return The next slot after `s` + */ + template + __device__ iterator next_slot(CG g, iterator s) noexcept + { + uint32_t index = s - slots_; + return &slots_[(index + g.size()) % capacity_]; + } + + /** + * @brief Given a slot `s`, returns the next slot. + * + * If `s` is the last slot, wraps back around to the first slot. To + * be used for Cooperative Group based probing. 
+ * + * @tparam CG The Cooperative Group type + * @param g The Cooperative Group for which the next slot is needed + * @param s The slot to advance + * @return The next slot after `s` + */ + template + __device__ const_iterator next_slot(CG g, const_iterator s) const noexcept + { + uint32_t index = s - slots_; + return &slots_[(index + g.size()) % capacity_]; + } + + public: + /** + * @brief Gets the maximum number of elements the hash map can hold. + * + * @return The maximum number of elements the hash map can hold + */ + __host__ __device__ std::size_t get_capacity() const noexcept { return capacity_; } + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + __host__ __device__ Key get_empty_key_sentinel() const noexcept { return empty_key_sentinel_; } + + /** + * @brief Gets the sentinel value used to represent an empty value slot. + * + * @return The sentinel value used to represent an empty value slot + */ + __host__ __device__ Value get_empty_value_sentinel() const noexcept + { + return empty_value_sentinel_; + } + + /** + * @brief Returns iterator to the first slot. + * + * @note Unlike `std::map::begin()`, the `begin_slot()` iterator does _not_ point to the first + * occupied slot. Instead, it refers to the first slot in the array of contiguous slot storage. + * Iterating from `begin_slot()` to `end_slot()` will iterate over all slots, including those + * both empty and filled. + * + * There is no `begin()` iterator to avoid confusion as it is not possible to provide an + * iterator over only the filled slots. + * + * @return Iterator to the first slot + */ + __device__ iterator begin_slot() noexcept { return slots_; } + + /** + * @brief Returns iterator to the first slot. + * + * @note Unlike `std::map::begin()`, the `begin_slot()` iterator does _not_ point to the first + * occupied slot. 
Instead, it refers to the first slot in the array of contiguous slot storage. + * Iterating from `begin_slot()` to `end_slot()` will iterate over all slots, including those + * both empty and filled. + * + * There is no `begin()` iterator to avoid confusion as it is not possible to provide an + * iterator over only the filled slots. + * + * @return Iterator to the first slot + */ + __device__ const_iterator begin_slot() const noexcept { return slots_; } + + /** + * @brief Returns a const_iterator to one past the last slot. + * + * @return A const_iterator to one past the last slot + */ + __host__ __device__ const_iterator end_slot() const noexcept { return slots_ + capacity_; } + + /** + * @brief Returns an iterator to one past the last slot. + * + * @return An iterator to one past the last slot + */ + __host__ __device__ iterator end_slot() noexcept { return slots_ + capacity_; } + + /** + * @brief Returns a const_iterator to one past the last slot. + * + * `end()` calls `end_slot()` and is provided for convenience for those familiar with checking + * an iterator returned from `find()` against the `end()` iterator. + * + * @return A const_iterator to one past the last slot + */ + __host__ __device__ const_iterator end() const noexcept { return end_slot(); } + + /** + * @brief Returns an iterator to one past the last slot. + * + * `end()` calls `end_slot()` and is provided for convenience for those familiar with checking + * an iterator returned from `find()` against the `end()` iterator. + * + * @return An iterator to one past the last slot + */ + __host__ __device__ iterator end() noexcept { return end_slot(); } + }; + + public: + /** + * @brief Mutable, non-owning view-type that may be used in device code to + * perform singular inserts into the map. + * + * `device_mutable_view` is trivially-copyable and is intended to be passed by + * value. 
+ * + * Example: + * \code{.cpp} + * cuco::static_reduction_map m{100'000, -1, -1}; + * + * // Inserts a sequence of pairs {{0,0}, {1,1}, ... {i,i}} + * thrust::for_each(thrust::make_counting_iterator(0), + * thrust::make_counting_iterator(50'000), + * [map = m.get_mutable_device_view()] + * __device__ (auto i) mutable { + * map.insert(thrust::make_pair(i,i)); + * }); + * \endcode + */ + class device_mutable_view : public device_view_base { + public: + using value_type = typename device_view_base::value_type; + using key_type = typename device_view_base::key_type; + using mapped_type = typename device_view_base::mapped_type; + using iterator = typename device_view_base::iterator; + using const_iterator = typename device_view_base::const_iterator; + /** + * @brief Construct a mutable view of the first `capacity` slots of the + * slots array pointed to by `slots`. + * + * @param slots Pointer to beginning of initialized slots array + * @param capacity The number of slots viewed by this object + * @param empty_key_sentinel The reserved value for keys to represent empty + * slots + * @param empty_value_sentinel The reserved value for mapped values to + * represent empty slots + */ + __host__ __device__ device_mutable_view(pair_atomic_type* slots, + std::size_t capacity, + Key empty_key_sentinel, + ReductionOp reduction_op = {}) noexcept + : device_view_base{slots, capacity, empty_key_sentinel, reduction_op} + { + } + + /** + * @brief Inserts the specified key/value pair into the map. + * + * Returns a pair consisting of an iterator to the inserted element (or to + * the element that prevented the insertion) and a `bool` denoting whether + * the insertion took place. 
+ * + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param insert_pair The pair to insert + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys for + * equality + * @return `true` if the insert was successful, `false` otherwise. + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ Value insert(value_type const& insert_pair, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; + /** + * @brief Inserts the specified key/value pair into the map. + * + * Returns a pair consisting of an iterator to the inserted element (or to + * the element that prevented the insertion) and a `bool` denoting whether + * the insertion took place. Uses the CUDA Cooperative Groups API to + * to leverage multiple threads to perform a single insert. This provides a + * significant boost in throughput compared to the non Cooperative Group + * `insert` at moderate to high load factors. + * + * @tparam Cooperative Group type + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * + * @param g The Cooperative Group that performs the insert + * @param insert_pair The pair to insert + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys for + * equality + * @return `true` if the insert was successful, `false` otherwise. + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ bool insert(CG g, + value_type const& insert_pair, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; + + }; // class device mutable view + + /** + * @brief Non-owning view-type that may be used in device code to + * perform singular find and contains operations for the map. + * + * `device_view` is trivially-copyable and is intended to be passed by + * value. 
+ * + */ + class device_view : public device_view_base { + public: + using value_type = typename device_view_base::value_type; + using key_type = typename device_view_base::key_type; + using mapped_type = typename device_view_base::mapped_type; + using iterator = typename device_view_base::iterator; + using const_iterator = typename device_view_base::const_iterator; + /** + * @brief Construct a view of the first `capacity` slots of the + * slots array pointed to by `slots`. + * + * @param slots Pointer to beginning of initialized slots array + * @param capacity The number of slots viewed by this object + * @param empty_key_sentinel The reserved value for keys to represent empty + * slots + * @param empty_value_sentinel The reserved value for mapped values to + * represent empty slots + */ + __host__ __device__ device_view(pair_atomic_type* slots, + std::size_t capacity, + Key empty_key_sentinel, + ReductionOp reduction_op = {}) noexcept + : device_view_base{slots, capacity, empty_key_sentinel, reduction_op} + { + } + + /** + * @brief Makes a copy of given `device_view` using non-owned memory. + * + * This function is intended to be used to create shared memory copies of small static maps, + * although global memory can be used as well. 
+ * + * Example: + * @code{.cpp} + * template + * __global__ void use_device_view(const typename MapType::device_view device_view, + * map_key_t const* const keys_to_search, + * map_value_t* const values_found, + * const size_t number_of_elements) + * { + * const size_t index = blockIdx.x * blockDim.x + threadIdx.x; + * + * __shared__ typename MapType::pair_atomic_type sm_buffer[CAPACITY]; + * + * auto g = cg::this_thread_block(); + * + * const map_t::device_view sm_static_reduction_map = device_view.make_copy(g, + * sm_buffer); + * + * for (size_t i = g.thread_rank(); i < number_of_elements; i += g.size()) + * { + * values_found[i] = sm_static_reduction_map.find(keys_to_search[i])->second; + * } + * } + * @endcode + * + * @tparam CG The type of the cooperative thread group + * @param g The ooperative thread group used to copy the slots + * @param source_device_view `device_view` to copy from + * @param memory_to_use Array large enough to support `capacity` elements. Object does not take + * the ownership of the memory + * @return Copy of passed `device_view` + */ + template + __device__ static device_view make_copy(CG g, + pair_atomic_type* const memory_to_use, + device_view source_device_view) noexcept + { +#ifndef CUDART_VERSION +#error CUDART_VERSION Undefined! 
+#elif (CUDART_VERSION >= 11000) + __shared__ cuda::barrier barrier; + if (g.thread_rank() == 0) { init(&barrier, g.size()); } + g.sync(); + + cuda::memcpy_async(g, + memory_to_use, + source_device_view.get_slots(), + sizeof(pair_atomic_type) * source_device_view.get_capacity(), + barrier); + + barrier.arrive_and_wait(); +#else + pair_atomic_type const* const slots_ptr = source_device_view.get_slots(); + for (std::size_t i = g.thread_rank(); i < source_device_view.get_capacity(); i += g.size()) { + new (&memory_to_use[i].first) + atomic_key_type{slots_ptr[i].first.load(cuda::memory_order_relaxed)}; + new (&memory_to_use[i].second) + atomic_mapped_type{slots_ptr[i].second.load(cuda::memory_order_relaxed)}; + } + g.sync(); +#endif + + return device_view(memory_to_use, + source_device_view.get_capacity(), + source_device_view.get_empty_key_sentinel(), + source_device_view.get_empty_value_sentinel()); + } + + /** + * @brief Finds the value corresponding to the key `k`. + * + * Returns an iterator to the pair whose key is equivalent to `k`. + * If no such pair exists, returns `end()`. + * + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param k The key to search for + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys + * for equality + * @return An iterator to the position at which the key/value pair + * containing `k` was inserted + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ iterator find(Key const& k, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; + + /** @brief Finds the value corresponding to the key `k`. + * + * Returns a const_iterator to the pair whose key is equivalent to `k`. + * If no such pair exists, returns `end()`. 
+ * + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param k The key to search for + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys + * for equality + * @return An iterator to the position at which the key/value pair + * containing `k` was inserted + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ const_iterator find(Key const& k, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) const noexcept; + + /** + * @brief Finds the value corresponding to the key `k`. + * + * Returns an iterator to the pair whose key is equivalent to `k`. + * If no such pair exists, returns `end()`. Uses the CUDA Cooperative Groups API to + * to leverage multiple threads to perform a single find. This provides a + * significant boost in throughput compared to the non Cooperative Group + * `find` at moderate to high load factors. + * + * @tparam CG Cooperative Group type + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param g The Cooperative Group used to perform the find + * @param k The key to search for + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys + * for equality + * @return An iterator to the position at which the key/value pair + * containing `k` was inserted + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ iterator + find(CG g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; + + /** + * @brief Finds the value corresponding to the key `k`. + * + * Returns a const_iterator to the pair whose key is equivalent to `k`. + * If no such pair exists, returns `end()`. Uses the CUDA Cooperative Groups API to + * to leverage multiple threads to perform a single find. This provides a + * significant boost in throughput compared to the non Cooperative Group + * `find` at moderate to high load factors. 
+ * + * @tparam CG Cooperative Group type + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param g The Cooperative Group used to perform the find + * @param k The key to search for + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys + * for equality + * @return An iterator to the position at which the key/value pair + * containing `k` was inserted + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ const_iterator + find(CG g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) const noexcept; + + /** + * @brief Indicates whether the key `k` was inserted into the map. + * + * If the key `k` was inserted into the map, find returns + * true. Otherwise, it returns false. + * + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param k The key to search for + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys + * for equality + * @return A boolean indicating whether the key/value pair + * containing `k` was inserted + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ bool contains(Key const& k, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; + + /** + * @brief Indicates whether the key `k` was inserted into the map. + * + * If the key `k` was inserted into the map, find returns + * true. Otherwise, it returns false. Uses the CUDA Cooperative Groups API to + * to leverage multiple threads to perform a single contains operation. This provides a + * significant boost in throughput compared to the non Cooperative Group + * `contains` at moderate to high load factors. 
+ * + * @tparam CG Cooperative Group type + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param g The Cooperative Group used to perform the contains operation + * @param k The key to search for + * @param hash The unary callable used to hash the key + * @param key_equal The binary callable used to compare two keys + * for equality + * @return A boolean indicating whether the key/value pair + * containing `k` was inserted + */ + template , + typename KeyEqual = thrust::equal_to> + __device__ bool contains(CG g, + Key const& k, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) noexcept; + }; // class device_view + + /** + * @brief Gets the maximum number of elements the hash map can hold. + * + * @return The maximum number of elements the hash map can hold + */ + std::size_t get_capacity() const noexcept { return capacity_; } + + /** + * @brief Gets the number of elements in the hash map. + * + * @return The number of elements in the map + */ + std::size_t get_size() const noexcept { return size_; } + + /** + * @brief Gets the load factor of the hash map. + * + * @return The load factor of the hash map + */ + float get_load_factor() const noexcept { return static_cast(size_) / capacity_; } + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + Key get_empty_key_sentinel() const noexcept { return empty_key_sentinel_; } + + /** + * @brief Gets the sentinel value used to represent an empty value slot. + * + * @return The sentinel value used to represent an empty value slot + */ + Value get_empty_value_sentinel() const noexcept { return empty_value_sentinel_; } + + /** + * @brief Constructs a device_view object based on the members of the `static_reduction_map` + * object. 
+ * + * @return A device_view object based on the members of the `static_reduction_map` object + */ + device_view get_device_view() const noexcept + { + return device_view(slots_, capacity_, empty_key_sentinel_, op_); + } + + /** + * @brief Constructs a device_mutable_view object based on the members of the + * `static_reduction_map` object + * + * @return A device_mutable_view object based on the members of the `static_reduction_map` object + */ + device_mutable_view get_device_mutable_view() const noexcept + { + return device_mutable_view(slots_, capacity_, empty_key_sentinel_, op_); + } + + private: + /// Unsafe access to the slots stripping away their atomic-ness to allow non-atomic access. This + /// is a temporary solution until we have atomic_ref + value_type* raw_slots_begin() noexcept { return reinterpret_cast(slots_); } + + value_type const* raw_slots_begin() const noexcept + { + return reinterpret_cast(slots_); + } + + value_type* raw_slots_end() noexcept { return raw_slots_begin() + get_capacity(); } + + value_type const* raw_slots_end() const noexcept { return raw_slots_begin() + get_capacity(); } + + pair_atomic_type* slots_{nullptr}; ///< Pointer to flat slots storage + std::size_t capacity_{}; ///< Total number of slots + std::size_t size_{}; ///< Number of keys in map + Key empty_key_sentinel_{}; ///< Key value that represents an empty slot + Value empty_value_sentinel_{}; ///< Initial value of empty slot + atomic_ctr_type* num_successes_{}; ///< Number of successfully inserted keys on insert + ReductionOp op_{}; ///< Binary operation reduction function object + slot_allocator_type slot_allocator_{}; ///< Allocator used to allocate slots +}; +} // namespace cuco + +#include \ No newline at end of file diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9c142ef58..a926d21f3 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -39,7 +39,7 @@ function(ConfigureTest TEST_NAME TEST_SRC) $) # Link in the CatchMain object file 
target_link_libraries(${TEST_NAME} Catch2::Catch2 cuco) set_target_properties(${TEST_NAME} PROPERTIES - CUDA_ARCHITECTURES ${GPU_ARCHS} + CUDA_ARCHITECTURES "${GPU_ARCHS}" RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests") target_compile_options(${TEST_NAME} PRIVATE --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage) catch_discover_tests(${TEST_NAME}) @@ -48,13 +48,9 @@ endfunction(ConfigureTest) ################################################################################################### ### test sources ################################################################################## ################################################################################################### -set(STATIC_MAP_TEST_SRC - "${CMAKE_CURRENT_SOURCE_DIR}/static_map/static_map_test.cu") -ConfigureTest(STATIC_MAP_TEST "${STATIC_MAP_TEST_SRC}") -#################################################################################################### -set(DYNAMIC_MAP_TEST_SRC - "${CMAKE_CURRENT_SOURCE_DIR}/dynamic_map/dynamic_map_test.cu") +ConfigureTest(STATIC_MAP_TEST "${CMAKE_CURRENT_SOURCE_DIR}/static_map/static_map_test.cu") -ConfigureTest(DYNAMIC_MAP_TEST "${DYNAMIC_MAP_TEST_SRC}") -#################################################################################################### \ No newline at end of file +ConfigureTest(STATIC_REDUCTION_MAP_TEST "${CMAKE_CURRENT_SOURCE_DIR}/static_reduction_map/static_reduction_map_test.cu") + +ConfigureTest(DYNAMIC_MAP_TEST "${CMAKE_CURRENT_SOURCE_DIR}/dynamic_map/dynamic_map_test.cu") \ No newline at end of file diff --git a/tests/static_reduction_map/static_reduction_map_test.cu b/tests/static_reduction_map/static_reduction_map_test.cu new file mode 100644 index 000000000..bb57f0847 --- /dev/null +++ b/tests/static_reduction_map/static_reduction_map_test.cu @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +namespace { +// Thrust logical algorithms (any_of/all_of/none_of) don't work with device +// lambdas: See https://github.com/thrust/thrust/issues/1062 +template +bool all_of(Iterator begin, Iterator end, Predicate p) +{ + auto size = thrust::distance(begin, end); + return size == thrust::count_if(begin, end, p); +} + +template +bool any_of(Iterator begin, Iterator end, Predicate p) +{ + return thrust::count_if(begin, end, p) > 0; +} + +template +bool none_of(Iterator begin, Iterator end, Predicate p) +{ + return not all_of(begin, end, p); +} +} // namespace + +TEMPLATE_TEST_CASE_SIG("Insert all identical keys", + "", + ((typename Key, typename Value), Key, Value), + (int32_t, int32_t)) +{ + thrust::device_vector keys(100, 42); + thrust::device_vector values(keys.size(), 1); + + auto const num_slots{keys.size() * 2}; + cuco::static_reduction_map, Key, Value> map{num_slots, -1}; + + auto zip = thrust::make_zip_iterator(thrust::make_tuple(keys.begin(), values.begin())); + auto zip_end = zip + keys.size(); + map.insert(zip, zip_end); + + SECTION("There should only be one key in the map") { REQUIRE(map.get_size() == 1); } + + SECTION("Map should contain the inserted key") + { + thrust::device_vector contained(keys.size()); + map.contains(keys.begin(), keys.end(), contained.begin()); + REQUIRE(all_of(contained.begin(), contained.end(), 
[] __device__(bool c) { return c; })); + } + + SECTION("Found value should equal aggregate of inserted values") + { + thrust::device_vector found(keys.size()); + map.find(keys.begin(), keys.end(), found.begin()); + auto const expected_aggregate = keys.size(); // All keys inserted "1", so the + // sum aggregate should be + // equal to the number of keys inserted + REQUIRE(all_of(found.begin(), found.end(), [expected_aggregate] __device__(Value v) { + return v == expected_aggregate; + })); + } +} + +TEMPLATE_TEST_CASE_SIG("Insert all unique keys", + "", + ((typename Key, typename Value), Key, Value), + (int32_t, int32_t)) +{ + constexpr std::size_t num_keys = 10000; + constexpr std::size_t num_slots{num_keys * 2}; + cuco::static_reduction_map, Key, Value> map{num_slots, -1}; + + auto keys_begin = thrust::make_counting_iterator(0); + auto values_begin = thrust::make_counting_iterator(0); + auto zip = thrust::make_zip_iterator(thrust::make_tuple(keys_begin, values_begin)); + auto zip_end = zip + num_keys; + map.insert(zip, zip_end); + + SECTION("Size of map should equal number of inserted keys") + { + REQUIRE(map.get_size() == num_keys); + } + + SECTION("Map should contain the inserted keys") + { + thrust::device_vector contained(num_keys); + map.contains(keys_begin, keys_begin + num_keys, contained.begin()); + REQUIRE(all_of(contained.begin(), contained.end(), [] __device__(bool c) { return c; })); + } + + SECTION("Found value should equal inserted value") + { + thrust::device_vector found(num_keys); + map.find(keys_begin, keys_begin + num_keys, found.begin()); + REQUIRE(thrust::equal(thrust::device, values_begin, values_begin + num_keys, found.begin())); + } +}