From 976a746edee29434b036bc7b09a1f96ec7001c33 Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@nvidia.com>
Date: Wed, 9 Mar 2022 09:41:17 -0800
Subject: [PATCH 01/36] dynamic map erase, still needs work

---
 benchmarks/hash_table/dynamic_map_bench.cu  |   2 +
 benchmarks/hash_table/static_map_bench.cu   | 119 ++++++++++++++++++++
 include/cuco/detail/dynamic_map.inl         |  48 ++++++++
 include/cuco/detail/dynamic_map_kernels.cuh |  61 ++++++++++
 include/cuco/detail/static_map.inl          |   3 +
 include/cuco/dynamic_map.cuh                |  10 ++
 include/cuco/static_map.cuh                 |   5 +
 tests/CMakeLists.txt                        |   3 +-
 tests/dynamic_map/erase_test.cu             |  90 +++++++++++++++
 tests/static_map/erase_test.cu              |   2 +
 10 files changed, 342 insertions(+), 1 deletion(-)
 create mode 100644 tests/dynamic_map/erase_test.cu
diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu
index 90446ea57..d42aae755 100644
--- a/benchmarks/hash_table/dynamic_map_bench.cu
+++ b/benchmarks/hash_table/dynamic_map_bench.cu
@@ -147,6 +147,7 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIQUE)
   ->Apply(gen_final_size)
   ->UseManualTime();
 
+/*
 BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
@@ -196,3 +197,4 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::GAUSSIAN)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
+*/
\ No newline at end of file
diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu
index e2b15b05e..63c2976d4 100644
--- a/benchmarks/hash_table/static_map_bench.cu
+++ b/benchmarks/hash_table/static_map_bench.cu
@@ -155,6 +155,49 @@ static void BM_static_map_search_all(::benchmark::State& state)
                           int64_t(state.range(0)));
 }
 
+/**
+ * @brief Benchmarks `static_map::find` with query keys shifted away from the inserted keys.
+ * Inserts `state.range(0)` generated keys (each mapped to itself) at `state.range(1)` percent
+ * occupancy, then repeatedly looks up those keys offset by `num_keys` (intended to be misses).
+ */
+template <typename Key, typename Value, dist_type Dist>
+static void BM_static_map_search_none(::benchmark::State& state)
+{
+  using map_type = cuco::static_map<Key, Value>;
+
+  std::size_t num_keys = state.range(0);
+  float occupancy      = state.range(1) / float{100};
+  std::size_t size     = num_keys / occupancy;
+
+  map_type map{size, -1, -1};
+
+  std::vector<Key> h_keys(num_keys);
+  std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);
+
+  generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
+
+  for (std::size_t i = 0; i < num_keys; ++i) {
+    h_pairs[i].first  = h_keys[i];
+    h_pairs[i].second = h_keys[i];
+  }
+
+  // Offset every query key by num_keys relative to the key that was stored in h_pairs.
+  for (std::size_t i = 0; i < num_keys; ++i) { h_keys[i] += num_keys; }
+
+  thrust::device_vector<Key> d_keys(h_keys);
+  thrust::device_vector<Value> d_results(num_keys);
+  thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
+
+  map.insert(d_pairs.begin(), d_pairs.end());
+
+  for (auto _ : state) {
+    map.find(d_keys.begin(), d_keys.end(), d_results.begin());
+  }
+
+  state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
+                          int64_t(state.range(0)));
+}
+
 template <typename Key, typename Value, dist_type Dist>
 static void BM_static_map_erase_all(::benchmark::State& state)
 {
@@ -200,6 +243,82 @@ static void BM_static_map_erase_all(::benchmark::State& state)
                           int64_t(state.range(0)));
 }
 
+/**
+ * @brief Benchmarks `static_map::erase` for keys shifted away from the inserted ones.
+ *
+ * `state.range(0)` is the number of keys and `state.range(1)` the target occupancy in
+ * percent. Each generated key is inserted mapped to itself; the erase queries are the same
+ * keys offset by `num_keys` (intended to be absent from the map). The insert in each
+ * iteration runs with the benchmark timer paused, so the host-side timed region is `erase`.
+ */
+template <typename Key, typename Value, dist_type Dist>
+static void BM_static_map_erase_none(::benchmark::State& state)
+{
+  using map_type = cuco::static_map<Key, Value>;
+
+  std::size_t num_keys = state.range(0);
+  float occupancy      = state.range(1) / float{100};
+  std::size_t size     = num_keys / occupancy;
+
+  map_type map{size, -1, -1};
+
+  std::vector<Key> h_keys(num_keys);
+  std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);
+
+  generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
+
+  for (std::size_t i = 0; i < num_keys; ++i) {
+    h_pairs[i].first  = h_keys[i];
+    h_pairs[i].second = h_keys[i];
+  }
+
+  // Offset every query key by num_keys relative to the key that was stored in h_pairs.
+  for (std::size_t i = 0; i < num_keys; ++i) { h_keys[i] += num_keys; }
+
+  thrust::device_vector<Key> d_keys(h_keys);
+  thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
+
+  for (auto _ : state) {
+    // Refill the map with the timer paused.
+    state.PauseTiming();
+    map.insert(d_pairs.begin(), d_pairs.end());
+    state.ResumeTiming();
+
+    // Timed region: erase the offset keys.
+    map.erase(d_keys.begin(), d_keys.end());
+  }
+
+  state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
+                          int64_t(state.range(0)));
+}
+
+/*
+BENCHMARK_TEMPLATE(BM_static_map_search_none, int32_t, int32_t, dist_type::UNIQUE)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(generate_size_and_occupancy);
+
+BENCHMARK_TEMPLATE(BM_static_map_search_none, int64_t, int64_t, dist_type::UNIQUE)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(generate_size_and_occupancy);
+*/
+
+
+BENCHMARK_TEMPLATE(BM_static_map_erase_none, int64_t, int64_t, dist_type::UNIQUE)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(generate_size_and_occupancy);
+
+BENCHMARK_TEMPLATE(BM_static_map_erase_none, int32_t, int32_t, dist_type::UNIQUE)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(generate_size_and_occupancy);
+
+BENCHMARK_TEMPLATE(BM_static_map_erase_all, int64_t, int64_t, dist_type::UNIQUE)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(generate_size_and_occupancy);
+
+BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIQUE)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(generate_size_and_occupancy);
+/*
 BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(generate_size_and_occupancy)
diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index 0c1d2e377..28857f547 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -38,6 +38,8 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
   submap_views_.push_back(submaps_[0]->get_device_view());
   submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
 
+  submap_num_successes_.push_back(submaps_[0]->get_num_successes());
+
   CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type)));
 }  // namespace cuco
 
@@ -69,6 +71,8 @@ void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n)
         alloc_));
       submap_views_.push_back(submaps_[submap_idx]->get_device_view());
       submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view());
+      
+      submap_num_successes_.push_back(submaps_[submap_idx]->get_num_successes());
 
       capacity_ *= 2;
     }
@@ -128,6 +132,50 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(InputIt first,
   }
 }
 
+template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
+template <typename InputIt, typename Hash, typename KeyEqual>
+void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
+                                                       InputIt last,
+                                                       Hash hash,
+                                                       KeyEqual key_equal)
+{
+  std::size_t num_keys = std::distance(first, last);
+
+  auto const block_size = 128;
+  auto const stride     = 1;
+  auto const tile_size  = 4;
+  auto const grid_size  = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
+
+  *num_successes_ = 0;
+  int device_id;
+  CUCO_CUDA_TRY(cudaGetDevice(&device_id));
+  CUCO_CUDA_TRY(cudaMemPrefetchAsync(num_successes_, sizeof(atomic_ctr_type), device_id));
+  
+  // TODO: hacky, improve this
+  thrust::device_vector<atomic_ctr_type*> d_submap_num_successes(submap_num_successes_);
+      
+  detail::erase<block_size, tile_size, cuco::pair_type<key_type, mapped_type>>
+    <<<grid_size, block_size>>>(first,
+                                first + num_keys,
+                                submap_views_.data().get(),
+                                submap_mutable_views_.data().get(),
+                                num_successes_,
+                                d_submap_num_successes.data().get(),
+                                submaps_.size(),
+                                hash,
+                                key_equal);
+  CUCO_CUDA_TRY(cudaDeviceSynchronize());
+
+  std::size_t h_num_successes = num_successes_->load(cuda::std::memory_order_relaxed);
+  size_ -= h_num_successes;
+  
+  for(int i = 0; i < submaps_.size(); ++i) {
+    //std::size_t h_num_submap_successes = submap_num_successes_[i]->load(cuda::std::memory_order_relaxed);
+    //submaps_[i]->size_ -= h_num_submap_successes;
+  }
+  
+}
+
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
 template <typename InputIt, typename OutputIt, typename Hash, typename KeyEqual>
 void dynamic_map<Key, Value, Scope, Allocator>::find(
diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
index f261b49aa..3bc8a0d8a 100644
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
@@ -186,6 +186,67 @@ __global__ void insert(InputIt first,
   if (threadIdx.x == 0) { *num_successes += block_num_successes; }
 }
 
+template <uint32_t block_size,
+          uint32_t tile_size,
+          typename pair_type,
+          typename InputIt,
+          typename viewT,
+          typename mutableViewT,
+          typename atomicT,
+          typename Hash,
+          typename KeyEqual>
+__global__ void erase(InputIt first,
+                       InputIt last,
+                       viewT* submap_views,
+                       mutableViewT* submap_mutable_views,
+                       atomicT* num_successes,
+                       atomicT** submap_num_successes,
+                       uint32_t num_submaps,
+                       Hash hash,
+                       KeyEqual key_equal)
+{
+  typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  // TODO: hack for up to 4 submaps, make this better
+  __shared__ typename BlockReduce::TempStorage temp_submap_storage[4];
+
+  std::size_t thread_num_successes = 0;
+  std::size_t submap_thread_num_successes[4] = {0, 0, 0, 0};
+
+  auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto tid  = blockDim.x * blockIdx.x + threadIdx.x;
+  auto key_idx              = tid / tile_size;
+  auto it   = first + tid / tile_size;
+
+  while (it < last) {
+    auto key         = *(first + key_idx);
+    auto erased           = false;
+
+    // manually check for duplicates in those submaps we are not inserting into
+    int i;
+    for (i = 0; i < num_submaps; ++i) {
+      erased = submap_mutable_views[i].erase(tile, key, hash, key_equal);
+      if (erased) { break; }
+    }
+    if (erased && tile.thread_rank() == 0) {
+      thread_num_successes++;
+      //submap_thread_num_successes[i]++;
+    }
+
+    it += (gridDim.x * blockDim.x) / tile_size;
+  }
+
+  std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
+  if (threadIdx.x == 0) { *num_successes += block_num_successes; }
+
+  // update submap thread counts
+  for(int i = 0; i < num_submaps; ++i) {
+    //std::size_t submap_block_num_successes = BlockReduce(temp_submap_storage[i]).Sum(submap_thread_num_successes[i]);
+    //if(threadIdx.x == 0) {*submap_num_successes[i] += submap_block_num_successes; }
+  }
+}
+
 /**
  * @brief Finds the values corresponding to all keys in the range `[first, last)`.
  *
diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl
index 09e9d05dd..b451f9089 100644
--- a/include/cuco/detail/static_map.inl
+++ b/include/cuco/detail/static_map.inl
@@ -650,6 +650,9 @@ __device__ bool static_map<Key, Value, Scope, Allocator>::device_mutable_view::e
 
       bool status;
       if (g.thread_rank() == src_lane) {
+        // only fetch value once necessary
+        auto existing_value = current_slot->second.load(cuda::std::memory_order_relaxed);
+        
         if constexpr (cuco::detail::is_packable<value_type>()) {
           auto slot = reinterpret_cast<
             cuda::atomic<typename cuco::detail::pair_converter<value_type>::packed_type>*>(
diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index 866f94819..1e347239b 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -171,6 +171,11 @@ class dynamic_map {
             typename Hash     = cuco::detail::MurmurHash3_32<key_type>,
             typename KeyEqual = thrust::equal_to<key_type>>
   void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{});
+  
+  template <typename InputIt,
+            typename Hash     = cuco::detail::MurmurHash3_32<key_type>,
+            typename KeyEqual = thrust::equal_to<key_type>>
+  void erase(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{});
 
   /**
    * @brief Finds the values corresponding to all keys in the range `[first, last)`.
@@ -251,6 +256,9 @@ class dynamic_map {
  private:
   key_type empty_key_sentinel_{};       ///< Key value that represents an empty slot
   mapped_type empty_value_sentinel_{};  ///< Initial value of empty slot
+
+  // TODO: initialize this
+  key_type erased_key_sentinel_{};
   std::size_t size_{};                  ///< Number of keys in the map
   std::size_t capacity_{};              ///< Maximum number of keys that can be inserted
   float max_load_factor_{};             ///< Max load factor before capacity growth
@@ -263,6 +271,8 @@ class dynamic_map {
   std::size_t min_insert_size_{};   ///< min remaining capacity of submap for insert
   atomic_ctr_type* num_successes_;  ///< number of successfully inserted keys on insert
   Allocator alloc_{};  ///< Allocator passed to submaps to allocate their device storage
+
+  std::vector<atomic_ctr_type*> submap_num_successes_;
 };
 }  // namespace cuco
 
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
index 1daad9965..3ef487b7c 100644
--- a/include/cuco/static_map.cuh
+++ b/include/cuco/static_map.cuh
@@ -1414,6 +1414,11 @@ class static_map {
                                sentinel::erased_key<Key>{erased_key_sentinel_});
   }
 
+  atomic_ctr_type* get_num_successes() const noexcept
+  {
+    return num_successes_;
+  }
+
  private:
   pair_atomic_type* slots_{nullptr};            ///< Pointer to flat slots storage
   std::size_t capacity_{};                      ///< Total number of slots
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 2d1d25526..a7b40300c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -68,7 +68,8 @@ ConfigureTest(STATIC_MAP_TEST
 ###################################################################################################
 # - dynamic_map tests -----------------------------------------------------------------------------
 ConfigureTest(DYNAMIC_MAP_TEST
-    dynamic_map/unique_sequence_test.cu)
+    dynamic_map/unique_sequence_test.cu
+    dynamic_map/erase_test.cu)
 
 ###################################################################################################
 # - static_multimap tests -------------------------------------------------------------------------
diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu
new file mode 100644
index 000000000..c00013961
--- /dev/null
+++ b/tests/dynamic_map/erase_test.cu
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <catch2/catch.hpp>
+#include <thrust/device_vector.h>
+
+#include <cuco/dynamic_map.cuh>
+
+#include <utils.hpp>
+
+
+TEMPLATE_TEST_CASE_SIG(
+  "erase key", "", ((typename T), T), (int32_t))
+{
+  using Key   = T;
+  using Value = T;
+  
+  unsigned long num_keys = 1'000'000;
+  cuco::dynamic_map<Key, Value> map{num_keys * 2, -1, -1};
+
+  thrust::device_vector<Key> d_keys(num_keys);
+  thrust::device_vector<Value> d_values(num_keys);
+  thrust::device_vector<bool> d_keys_exist(num_keys);
+
+  thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1);
+  thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1);
+    
+  auto pairs_begin =
+    thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin()));
+
+  SECTION(
+    "Check basic insert/erase")
+  {
+    map.insert(pairs_begin, pairs_begin + num_keys);
+
+    REQUIRE(map.get_size() == num_keys);
+
+    map.erase(d_keys.begin(), d_keys.end());
+
+    // delete decreases count correctly
+    REQUIRE(map.get_size() == 0);
+
+    map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin());
+
+    // keys were actually deleted
+    REQUIRE(cuco::test::none_of(d_keys_exist.begin(),
+                                d_keys_exist.end(),
+                                [] __device__(const bool key_found) { return key_found; }));
+
+    printf("cow\n");
+
+    // ensures that map is reusing deleted slots    
+    map.insert(pairs_begin, pairs_begin + num_keys);
+
+    REQUIRE(map.get_size() == num_keys);
+
+    printf("cow2\n");
+
+    map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin());
+
+    REQUIRE(cuco::test::all_of(d_keys_exist.begin(),
+                                d_keys_exist.end(),
+                                [] __device__(const bool key_found) { return key_found; }));
+
+    // erase can act selectively
+    map.erase(d_keys.begin(), d_keys.begin() + num_keys/2);
+    map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin());
+    
+    REQUIRE(cuco::test::none_of(d_keys_exist.begin(),
+                                d_keys_exist.begin() + num_keys/2,
+                                [] __device__(const bool key_found) { return key_found; }));
+
+    REQUIRE(cuco::test::all_of(d_keys_exist.begin() + num_keys/2,
+                                d_keys_exist.end(),
+                                [] __device__(const bool key_found) { return key_found; }));
+  }
+}
\ No newline at end of file
diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu
index b5641539c..a4b956305 100644
--- a/tests/static_map/erase_test.cu
+++ b/tests/static_map/erase_test.cu
@@ -56,10 +56,12 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t), (int64_t))
 
     map.erase(d_keys.begin(), d_keys.end());
 
+    // delete decreases count correctly
     REQUIRE(map.get_size() == 0);
 
     map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin());
 
+    // keys were actaully deleted
     REQUIRE(cuco::test::none_of(d_keys_exist.begin(),
                                 d_keys_exist.end(),
                                 [] __device__(const bool key_found) { return key_found; }));

From eead8b8c9d08ae9cc6c1ca0bd2648e46af45b271 Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@nvidia.com>
Date: Wed, 9 Mar 2022 11:43:40 -0800
Subject: [PATCH 02/36] minor clarity changes

---
 benchmarks/hash_table/static_map_bench.cu | 44 ++++-------------------
 1 file changed, 7 insertions(+), 37 deletions(-)

diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu
index 63c2976d4..363899a46 100644
--- a/benchmarks/hash_table/static_map_bench.cu
+++ b/benchmarks/hash_table/static_map_bench.cu
@@ -293,33 +293,28 @@ static void BM_static_map_erase_none(::benchmark::State& state)
 }
 
 /*
-BENCHMARK_TEMPLATE(BM_static_map_search_none, int32_t, int32_t, dist_type::UNIQUE)
+BENCHMARK_TEMPLATE(BM_static_map_erase_none, int64_t, int64_t, dist_type::UNIFORM)
   ->Unit(benchmark::kMillisecond)
   ->Apply(generate_size_and_occupancy);
 
-BENCHMARK_TEMPLATE(BM_static_map_search_none, int64_t, int64_t, dist_type::UNIQUE)
+BENCHMARK_TEMPLATE(BM_static_map_erase_none, int32_t, int32_t, dist_type::UNIFORM)
   ->Unit(benchmark::kMillisecond)
   ->Apply(generate_size_and_occupancy);
-*/
 
-
-BENCHMARK_TEMPLATE(BM_static_map_erase_none, int64_t, int64_t, dist_type::UNIQUE)
+BENCHMARK_TEMPLATE(BM_static_map_erase_all, int64_t, int64_t, dist_type::UNIFORM)
   ->Unit(benchmark::kMillisecond)
   ->Apply(generate_size_and_occupancy);
 
-BENCHMARK_TEMPLATE(BM_static_map_erase_none, int32_t, int32_t, dist_type::UNIQUE)
+BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIFORM)
   ->Unit(benchmark::kMillisecond)
   ->Apply(generate_size_and_occupancy);
 
-BENCHMARK_TEMPLATE(BM_static_map_erase_all, int64_t, int64_t, dist_type::UNIQUE)
+BENCHMARK_TEMPLATE(BM_static_map_search_none, int64_t, int64_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(generate_size_and_occupancy);
+*/
 
-BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIQUE)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(generate_size_and_occupancy);
-/*
-BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIQUE)
+BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::GAUSSIAN)
   ->Unit(benchmark::kMillisecond)
   ->Apply(generate_size_and_occupancy)
   ->UseManualTime();
@@ -351,28 +346,3 @@ BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIQUE)
   ->Apply(generate_size_and_occupancy)
   ->UseManualTime();
 
-BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::UNIQUE)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(generate_size_and_occupancy);
-
-BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIFORM)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(generate_size_and_occupancy)
-  ->UseManualTime();
-
-BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::UNIFORM)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(generate_size_and_occupancy);
-
-BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::GAUSSIAN)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(generate_size_and_occupancy)
-  ->UseManualTime();
-
-BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::GAUSSIAN)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(generate_size_and_occupancy);
-
-BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIQUE)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(generate_size_and_occupancy);

From 20ac7a33c5136f29a4b24826fab0f4fb49404b4a Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@nvidia.com>
Date: Mon, 4 Apr 2022 18:22:55 -0700
Subject: [PATCH 03/36] erase bug fix

---
 include/cuco/detail/dynamic_map.inl         | 47 +++++++++++++++++----
 include/cuco/detail/dynamic_map_kernels.cuh | 10 ++---
 include/cuco/detail/static_map.inl          |  5 +--
 include/cuco/dynamic_map.cuh                |  8 +++-
 tests/CMakeLists.txt                        |  2 +-
 tests/dynamic_map/erase_test.cu             |  8 ++--
 tests/static_map/erase_test.cu              |  2 -
 7 files changed, 56 insertions(+), 26 deletions(-)

diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index 28857f547..f5625bd72 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -17,13 +17,13 @@
 namespace cuco {
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
-  std::size_t initial_capacity,
-  sentinel::empty_key<Key> empty_key_sentinel,
-  sentinel::empty_value<Value> empty_value_sentinel,
-  Allocator const& alloc)
-  : empty_key_sentinel_(empty_key_sentinel.value),
-    empty_value_sentinel_(empty_value_sentinel.value),
+dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(std::size_t initial_capacity,
+                                                       Key empty_key_sentinel,
+                                                       Value empty_value_sentinel,
+                                                       Allocator const& alloc)
+  : empty_key_sentinel_(empty_key_sentinel),
+    empty_value_sentinel_(empty_value_sentinel),
+    erased_key_sentinel_(empty_key_sentinel),  // key-typed fallback: no erased sentinel given
     size_(0),
     capacity_(initial_capacity),
     min_insert_size_(1E4),
@@ -41,7 +41,37 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
   submap_num_successes_.push_back(submaps_[0]->get_num_successes());
 
   CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type)));
-}  // namespace cuco
+}
+
+template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
+dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(std::size_t initial_capacity,
+                                                       Key empty_key_sentinel,
+                                                       Value empty_value_sentinel,
+                                                       Key erased_key_sentinel,
+                                                       Allocator const& alloc)
+  : empty_key_sentinel_(empty_key_sentinel),
+    empty_value_sentinel_(empty_value_sentinel),
+    erased_key_sentinel_(erased_key_sentinel),
+    size_(0),
+    capacity_(initial_capacity),
+    min_insert_size_(1E4),
+    max_load_factor_(0.60),
+    alloc_{alloc}
+{
+  submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>(
+    initial_capacity,
+    sentinel::empty_key<Key>{empty_key_sentinel},
+    sentinel::empty_value<Value>{empty_value_sentinel},
+    sentinel::erased_key<Key>{erased_key_sentinel},
+    alloc));
+  submap_views_.push_back(submaps_[0]->get_device_view());
+  submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
+
+  submap_num_successes_.push_back(submaps_[0]->get_num_successes());
+
+  CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type)));
+}
+
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
 dynamic_map<Key, Value, Scope, Allocator>::~dynamic_map()
@@ -123,6 +153,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(InputIt first,
       CUCO_CUDA_TRY(cudaDeviceSynchronize());
 
       std::size_t h_num_successes = num_successes_->load(cuda::std::memory_order_relaxed);
+
       submaps_[submap_idx]->size_ += h_num_successes;
       size_ += h_num_successes;
       first += n;
diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
index 3bc8a0d8a..46fae21b5 100644
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
@@ -209,24 +209,22 @@ __global__ void erase(InputIt first,
   __shared__ typename BlockReduce::TempStorage temp_storage;
 
   // TODO: hack for up to 4 submaps, make this better
-  __shared__ typename BlockReduce::TempStorage temp_submap_storage[4];
+  //__shared__ typename BlockReduce::TempStorage temp_submap_storage[4];
 
   std::size_t thread_num_successes = 0;
   std::size_t submap_thread_num_successes[4] = {0, 0, 0, 0};
 
   auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
-  auto tid  = blockDim.x * blockIdx.x + threadIdx.x;
-  auto key_idx              = tid / tile_size;
+  auto tid  = block_size * blockIdx.x + threadIdx.x;
   auto it   = first + tid / tile_size;
 
   while (it < last) {
-    auto key         = *(first + key_idx);
-    auto erased           = false;
+    auto erased     = false;
 
     // manually check for duplicates in those submaps we are not inserting into
     int i;
     for (i = 0; i < num_submaps; ++i) {
-      erased = submap_mutable_views[i].erase(tile, key, hash, key_equal);
+      erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal);
       if (erased) { break; }
     }
     if (erased && tile.thread_rank() == 0) {
diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl
index b451f9089..42c163550 100644
--- a/include/cuco/detail/static_map.inl
+++ b/include/cuco/detail/static_map.inl
@@ -650,9 +650,6 @@ __device__ bool static_map<Key, Value, Scope, Allocator>::device_mutable_view::e
 
       bool status;
       if (g.thread_rank() == src_lane) {
-        // only fetch value once necessary
-        auto existing_value = current_slot->second.load(cuda::std::memory_order_relaxed);
-        
         if constexpr (cuco::detail::is_packable<value_type>()) {
           auto slot = reinterpret_cast<
             cuda::atomic<typename cuco::detail::pair_converter<value_type>::packed_type>*>(
@@ -855,4 +852,4 @@ static_map<Key, Value, Scope, Allocator>::device_view::contains(CG const& g,
     current_slot = next_slot(g, current_slot);
   }
 }
-}  // namespace cuco
+}  // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index 1e347239b..649eb3d01 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -136,6 +136,12 @@ class dynamic_map {
               sentinel::empty_key<Key> empty_key_sentinel,
               sentinel::empty_value<Value> empty_value_sentinel,
               Allocator const& alloc = Allocator{});
+  
+  dynamic_map(std::size_t initial_capacity,
+              Key empty_key_sentinel,
+              Value empty_value_sentinel,
+              Key erased_key_sentinel,
+              Allocator const& alloc = Allocator{});
 
   /**
    * @brief Destroy the map and frees its contents
@@ -256,9 +262,9 @@ class dynamic_map {
  private:
   key_type empty_key_sentinel_{};       ///< Key value that represents an empty slot
   mapped_type empty_value_sentinel_{};  ///< Initial value of empty slot
+  key_type erased_key_sentinel_{};
 
   // TODO: initialize this
-  key_type erased_key_sentinel_{};
   std::size_t size_{};                  ///< Number of keys in the map
   std::size_t capacity_{};              ///< Maximum number of keys that can be inserted
   float max_load_factor_{};             ///< Max load factor before capacity growth
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index a7b40300c..ae5dfd5af 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -68,7 +68,7 @@ ConfigureTest(STATIC_MAP_TEST
 ###################################################################################################
 # - dynamic_map tests -----------------------------------------------------------------------------
 ConfigureTest(DYNAMIC_MAP_TEST
-    dynamic_map/unique_sequence_test.cu
+    #dynamic_map/unique_sequence_test.cu
     dynamic_map/erase_test.cu)
 
 ###################################################################################################
diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu
index c00013961..ec2ca44b4 100644
--- a/tests/dynamic_map/erase_test.cu
+++ b/tests/dynamic_map/erase_test.cu
@@ -29,7 +29,7 @@ TEMPLATE_TEST_CASE_SIG(
   using Value = T;
   
   unsigned long num_keys = 1'000'000;
-  cuco::dynamic_map<Key, Value> map{num_keys * 2, -1, -1};
+  cuco::dynamic_map<Key, Value> map{num_keys * 2, -1, -1, -2};
 
   thrust::device_vector<Key> d_keys(num_keys);
   thrust::device_vector<Value> d_values(num_keys);
@@ -48,6 +48,7 @@ TEMPLATE_TEST_CASE_SIG(
 
     REQUIRE(map.get_size() == num_keys);
 
+    
     map.erase(d_keys.begin(), d_keys.end());
 
     // delete decreases count correctly
@@ -60,13 +61,11 @@ TEMPLATE_TEST_CASE_SIG(
                                 d_keys_exist.end(),
                                 [] __device__(const bool key_found) { return key_found; }));
 
-    printf("cow\n");
-
     // ensures that map is reusing deleted slots    
     map.insert(pairs_begin, pairs_begin + num_keys);
 
     REQUIRE(map.get_size() == num_keys);
-
+/*
     printf("cow2\n");
 
     map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin());
@@ -86,5 +85,6 @@ TEMPLATE_TEST_CASE_SIG(
     REQUIRE(cuco::test::all_of(d_keys_exist.begin() + num_keys/2,
                                 d_keys_exist.end(),
                                 [] __device__(const bool key_found) { return key_found; }));
+    */
   }
 }
\ No newline at end of file
diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu
index a4b956305..b5641539c 100644
--- a/tests/static_map/erase_test.cu
+++ b/tests/static_map/erase_test.cu
@@ -56,12 +56,10 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t), (int64_t))
 
     map.erase(d_keys.begin(), d_keys.end());
 
-    // delete decreases count correctly
     REQUIRE(map.get_size() == 0);
 
     map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin());
 
-    // keys were actaully deleted
     REQUIRE(cuco::test::none_of(d_keys_exist.begin(),
                                 d_keys_exist.end(),
                                 [] __device__(const bool key_found) { return key_found; }));

From ede50d68a1e4f14383654248f3a0e64993186273 Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@nvidia.com>
Date: Tue, 5 Apr 2022 11:19:57 -0700
Subject: [PATCH 04/36] dynamic map erase working, only 4 submaps for now

---
 include/cuco/detail/dynamic_map.inl         | 24 ++++++---
 include/cuco/detail/dynamic_map_kernels.cuh |  8 +--
 tests/dynamic_map/erase_test.cu             | 55 +++++++++++++++++++--
 3 files changed, 73 insertions(+), 14 deletions(-)

diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index f5625bd72..1599f90fd 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -23,7 +23,7 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(std::size_t initial_capac
                                                        Allocator const& alloc)
   : empty_key_sentinel_(empty_key_sentinel),
     empty_value_sentinel_(empty_value_sentinel),
-    erased_key_sentinel_(empty_value_sentinel),
+    erased_key_sentinel_(empty_key_sentinel),
     size_(0),
     capacity_(initial_capacity),
     min_insert_size_(1E4),
@@ -60,9 +60,9 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(std::size_t initial_capac
 {
   submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>(
     initial_capacity,
-    sentinel::empty_key<Key>{empty_key_sentinel},
-    sentinel::empty_value<Value>{empty_value_sentinel},
-    sentinel::erased_key<Key>{erased_key_sentinel},
+    sentinel::empty_key<Key>{empty_key_sentinel_},
+    sentinel::empty_value<Value>{empty_value_sentinel_},
+    sentinel::erased_key<Key>{erased_key_sentinel_},
     alloc));
   submap_views_.push_back(submaps_[0]->get_device_view());
   submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
@@ -98,6 +98,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n)
         submap_capacity,
         sentinel::empty_key<Key>{empty_key_sentinel_},
         sentinel::empty_value<Value>{empty_value_sentinel_},
+        sentinel::erased_key<Key>{erased_key_sentinel_},
         alloc_));
       submap_views_.push_back(submaps_[submap_idx]->get_device_view());
       submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view());
@@ -128,6 +129,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(InputIt first,
       max_load_factor_ * submaps_[submap_idx]->get_capacity() - submaps_[submap_idx]->get_size();
     // If we are tying to insert some of the remaining keys into this submap, we can insert
     // only if we meet the minimum insert size.
+
     if (capacity_remaining >= min_insert_size_) {
       *num_successes_ = 0;
       int device_id;
@@ -182,6 +184,11 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
   CUCO_CUDA_TRY(cudaGetDevice(&device_id));
   CUCO_CUDA_TRY(cudaMemPrefetchAsync(num_successes_, sizeof(atomic_ctr_type), device_id));
   
+  static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
+  for(int i = 0; i < submaps_.size(); ++i) {
+    CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type)));
+  }
+  
   // TODO: hacky, improve this
   thrust::device_vector<atomic_ctr_type*> d_submap_num_successes(submap_num_successes_);
       
@@ -201,10 +208,13 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
   size_ -= h_num_successes;
   
   for(int i = 0; i < submaps_.size(); ++i) {
-    //std::size_t h_num_submap_successes = submap_num_successes_[i]->load(cuda::std::memory_order_relaxed);
-    //submaps_[i]->size_ -= h_num_submap_successes;
+    std::size_t h_submap_num_successes;
+    CUCO_CUDA_TRY(cudaMemcpy(
+      &h_submap_num_successes, submap_num_successes_[i], sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost));
+
+    CUCO_CUDA_TRY(cudaDeviceSynchronize());  // stream sync to ensure h_num_successes is updated
+    submaps_[i]->size_ -= h_submap_num_successes;
   }
-  
 }
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
index 46fae21b5..624a6a85a 100644
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
@@ -209,7 +209,7 @@ __global__ void erase(InputIt first,
   __shared__ typename BlockReduce::TempStorage temp_storage;
 
   // TODO: hack for up to 4 submaps, make this better
-  //__shared__ typename BlockReduce::TempStorage temp_submap_storage[4];
+  __shared__ typename BlockReduce::TempStorage temp_submap_storage[4];
 
   std::size_t thread_num_successes = 0;
   std::size_t submap_thread_num_successes[4] = {0, 0, 0, 0};
@@ -229,7 +229,7 @@ __global__ void erase(InputIt first,
     }
     if (erased && tile.thread_rank() == 0) {
       thread_num_successes++;
-      //submap_thread_num_successes[i]++;
+      submap_thread_num_successes[i]++;
     }
 
     it += (gridDim.x * blockDim.x) / tile_size;
@@ -240,8 +240,8 @@ __global__ void erase(InputIt first,
 
   // update submap thread counts
   for(int i = 0; i < num_submaps; ++i) {
-    //std::size_t submap_block_num_successes = BlockReduce(temp_submap_storage[i]).Sum(submap_thread_num_successes[i]);
-    //if(threadIdx.x == 0) {*submap_num_successes[i] += submap_block_num_successes; }
+    std::size_t submap_block_num_successes = BlockReduce(temp_submap_storage[i]).Sum(submap_thread_num_successes[i]);
+    if(threadIdx.x == 0) {*submap_num_successes[i] += submap_block_num_successes; }
   }
 }
 
diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu
index ec2ca44b4..e84c6f35e 100644
--- a/tests/dynamic_map/erase_test.cu
+++ b/tests/dynamic_map/erase_test.cu
@@ -44,6 +44,10 @@ TEMPLATE_TEST_CASE_SIG(
   SECTION(
     "Check basic insert/erase")
   {
+    // *****************************************
+    // first, check single submap works properly
+    // *****************************************
+
     map.insert(pairs_begin, pairs_begin + num_keys);
 
     REQUIRE(map.get_size() == num_keys);
@@ -65,8 +69,6 @@ TEMPLATE_TEST_CASE_SIG(
     map.insert(pairs_begin, pairs_begin + num_keys);
 
     REQUIRE(map.get_size() == num_keys);
-/*
-    printf("cow2\n");
 
     map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin());
 
@@ -85,6 +87,53 @@ TEMPLATE_TEST_CASE_SIG(
     REQUIRE(cuco::test::all_of(d_keys_exist.begin() + num_keys/2,
                                 d_keys_exist.end(),
                                 [] __device__(const bool key_found) { return key_found; }));
-    */
+    
+    // clear map
+    map.erase(d_keys.begin()+num_keys/2, d_keys.end());
+    
+    // *************************************************
+    // second, check multiple submaps case works properly
+    // *************************************************
+    
+    thrust::device_vector<Key> d_keys2(4 * num_keys);
+    thrust::device_vector<Value> d_values2(4 * num_keys);
+    thrust::device_vector<bool> d_keys_exist2(4 * num_keys);
+  
+    thrust::sequence(thrust::device, d_keys2.begin(), d_keys2.end(), 1);
+    thrust::sequence(thrust::device, d_values2.begin(), d_values2.end(), 1);
+      
+    auto pairs_begin2 =
+      thrust::make_zip_iterator(thrust::make_tuple(d_keys2.begin(), d_values2.begin()));
+
+    map.insert(pairs_begin2, pairs_begin2 + 4*num_keys);
+    
+    // map should resize twice if the erased slots are successfully reused
+    REQUIRE(map.get_capacity() == 8*num_keys);
+
+    // check that keys can be successfully deleted from only the first and second submaps
+    map.erase(d_keys2.begin(), d_keys2.begin() + 2*num_keys);
+
+    map.contains(d_keys2.begin(), d_keys2.end(), d_keys_exist2.begin());
+    
+    REQUIRE(cuco::test::none_of(d_keys_exist2.begin(),
+                                d_keys_exist2.begin() + 2*num_keys,
+                                [] __device__(const bool key_found) { return key_found; }));
+
+    REQUIRE(cuco::test::all_of(d_keys_exist2.begin() + 2*num_keys,
+                                d_keys_exist2.end(),
+                                [] __device__(const bool key_found) { return key_found; }));
+
+    REQUIRE(map.get_size() == 2*num_keys);
+
+    // check that keys can be successfully deleted from all submaps (some will be unsuccessful erases)
+    map.erase(d_keys2.begin(), d_keys2.end());
+    
+    map.contains(d_keys2.begin(), d_keys2.end(), d_keys_exist2.begin());
+    
+    REQUIRE(cuco::test::none_of(d_keys_exist2.begin(),
+                                d_keys_exist2.end(),
+                                [] __device__(const bool key_found) { return key_found; }));
+
+    REQUIRE(map.get_size() == 0);
   }
 }
\ No newline at end of file

From 1d8fbd0d54c24c248cbdb340e99506d61179c545 Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@nvidia.com>
Date: Tue, 5 Apr 2022 11:57:06 -0700
Subject: [PATCH 05/36] type wrappers added

---
 include/cuco/detail/dynamic_map.inl       | 44 +++++++++++++----------
 include/cuco/dynamic_map.cuh              |  6 ++--
 tests/CMakeLists.txt                      |  2 +-
 tests/dynamic_map/erase_test.cu           |  5 ++-
 tests/dynamic_map/unique_sequence_test.cu |  6 ++--
 5 files changed, 37 insertions(+), 26 deletions(-)

diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index 1599f90fd..ecccf82dc 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -18,12 +18,12 @@ namespace cuco {
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
 dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(std::size_t initial_capacity,
-                                                       Key empty_key_sentinel,
-                                                       Value empty_value_sentinel,
+                                                       sentinel::empty_key<Key> empty_key_sentinel,
+                                                       sentinel::empty_value<Value> empty_value_sentinel,
                                                        Allocator const& alloc)
-  : empty_key_sentinel_(empty_key_sentinel),
-    empty_value_sentinel_(empty_value_sentinel),
-    erased_key_sentinel_(empty_key_sentinel),
+  : empty_key_sentinel_(empty_key_sentinel.value),
+    empty_value_sentinel_(empty_value_sentinel.value),
+    erased_key_sentinel_(empty_key_sentinel.value),
     size_(0),
     capacity_(initial_capacity),
     min_insert_size_(1E4),
@@ -45,13 +45,13 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(std::size_t initial_capac
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
 dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(std::size_t initial_capacity,
-                                                       Key empty_key_sentinel,
-                                                       Value empty_value_sentinel,
-                                                       Key erased_key_sentinel,
+                                                       sentinel::empty_key<Key> empty_key_sentinel,
+                                                       sentinel::empty_value<Value> empty_value_sentinel,
+                                                       sentinel::erased_key<Key> erased_key_sentinel,
                                                        Allocator const& alloc)
-  : empty_key_sentinel_(empty_key_sentinel),
-    empty_value_sentinel_(empty_value_sentinel),
-    erased_key_sentinel_(erased_key_sentinel),
+  : empty_key_sentinel_(empty_key_sentinel.value),
+    empty_value_sentinel_(empty_value_sentinel.value),
+    erased_key_sentinel_(erased_key_sentinel.value),
     size_(0),
     capacity_(initial_capacity),
     min_insert_size_(1E4),
@@ -66,7 +66,6 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(std::size_t initial_capac
     alloc));
   submap_views_.push_back(submaps_[0]->get_device_view());
   submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
-
   submap_num_successes_.push_back(submaps_[0]->get_num_successes());
 
   CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type)));
@@ -94,15 +93,22 @@ void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n)
     // if the submap does not exist yet, create it
     else {
       submap_capacity = capacity_;
-      submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>(
-        submap_capacity,
-        sentinel::empty_key<Key>{empty_key_sentinel_},
-        sentinel::empty_value<Value>{empty_value_sentinel_},
-        sentinel::erased_key<Key>{erased_key_sentinel_},
-        alloc_));
+      if(erased_key_sentinel_ != empty_key_sentinel_) {
+        submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>(
+          submap_capacity,
+          sentinel::empty_key<Key>{empty_key_sentinel_},
+          sentinel::empty_value<Value>{empty_value_sentinel_},
+          sentinel::erased_key<Key>{erased_key_sentinel_},
+          alloc_));
+      } else {
+        submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>(
+          submap_capacity,
+          sentinel::empty_key<Key>{empty_key_sentinel_},
+          sentinel::empty_value<Value>{empty_value_sentinel_},
+          alloc_));
+      }
       submap_views_.push_back(submaps_[submap_idx]->get_device_view());
       submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view());
-      
       submap_num_successes_.push_back(submaps_[submap_idx]->get_num_successes());
 
       capacity_ *= 2;
diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index 649eb3d01..bbe8c664b 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -138,9 +138,9 @@ class dynamic_map {
               Allocator const& alloc = Allocator{});
   
   dynamic_map(std::size_t initial_capacity,
-              Key empty_key_sentinel,
-              Value empty_value_sentinel,
-              Key erased_key_sentinel,
+              sentinel::empty_key<Key> empty_key_sentinel,
+              sentinel::empty_value<Value> empty_value_sentinel,
+              sentinel::erased_key<Key> erased_key_sentinel,
               Allocator const& alloc = Allocator{});
 
   /**
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index ae5dfd5af..a7b40300c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -68,7 +68,7 @@ ConfigureTest(STATIC_MAP_TEST
 ###################################################################################################
 # - dynamic_map tests -----------------------------------------------------------------------------
 ConfigureTest(DYNAMIC_MAP_TEST
-    #dynamic_map/unique_sequence_test.cu
+    dynamic_map/unique_sequence_test.cu
     dynamic_map/erase_test.cu)
 
 ###################################################################################################
diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu
index e84c6f35e..385b2e426 100644
--- a/tests/dynamic_map/erase_test.cu
+++ b/tests/dynamic_map/erase_test.cu
@@ -29,7 +29,10 @@ TEMPLATE_TEST_CASE_SIG(
   using Value = T;
   
   unsigned long num_keys = 1'000'000;
-  cuco::dynamic_map<Key, Value> map{num_keys * 2, -1, -1, -2};
+  cuco::dynamic_map<Key, Value> map{num_keys * 2, 
+    cuco::sentinel::empty_key<Key>{-1}, 
+    cuco::sentinel::empty_value<Value>{-1}, 
+    cuco::sentinel::erased_key<Key>{-2}};
 
   thrust::device_vector<Key> d_keys(num_keys);
   thrust::device_vector<Value> d_values(num_keys);
diff --git a/tests/dynamic_map/unique_sequence_test.cu b/tests/dynamic_map/unique_sequence_test.cu
index de26bb3dc..24a2041aa 100644
--- a/tests/dynamic_map/unique_sequence_test.cu
+++ b/tests/dynamic_map/unique_sequence_test.cu
@@ -38,8 +38,10 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys",
                        (int64_t, int64_t))
 {
   constexpr std::size_t num_keys{50'000'000};
-  cuco::dynamic_map<Key, Value> map{
-    30'000'000, cuco::sentinel::empty_key<Key>{-1}, cuco::sentinel::empty_value<Value>{-1}};
+
+  cuco::dynamic_map<Key, Value> map{30'000'000, 
+    cuco::sentinel::empty_key<Key>{-1}, 
+    cuco::sentinel::empty_value<Value>{-1}};
 
   thrust::device_vector<Key> d_keys(num_keys);
   thrust::device_vector<Value> d_values(num_keys);

From 63dd4eb07fbb4b145f7075f47b2fa9d64c3b3538 Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@nvidia.com>
Date: Tue, 5 Apr 2022 12:55:31 -0700
Subject: [PATCH 06/36] prevent implicit type conversion of sentinels during
 construction

---
 benchmarks/hash_table/dynamic_map_bench.cu | 9 +++++++--
 include/cuco/dynamic_map.cuh               | 9 +++++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu
index d42aae755..8545a47c6 100644
--- a/benchmarks/hash_table/dynamic_map_bench.cu
+++ b/benchmarks/hash_table/dynamic_map_bench.cu
@@ -86,8 +86,13 @@ static void BM_dynamic_insert(::benchmark::State& state)
 
   std::size_t batch_size = 1E6;
   for (auto _ : state) {
+<<<<<<< HEAD
     map_type map{
       initial_size, cuco::sentinel::empty_key<Key>{-1}, cuco::sentinel::empty_value<Value>{-1}};
+=======
+    map_type map{initial_size, 
+      cuco::sentinel::empty_key<Key>{-1}, cuco::sentinel::empty_value<Value>{-1}};
+>>>>>>> prevent implicit type conversion of sentinels during construction
     {
       cuda_event_timer raii{state};
       for (std::size_t i = 0; i < num_keys; i += batch_size) {
@@ -124,8 +129,8 @@ static void BM_dynamic_search_all(::benchmark::State& state)
   thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
   thrust::device_vector<Value> d_results(num_keys);
 
-  map_type map{
-    initial_size, cuco::sentinel::empty_key<Key>{-1}, cuco::sentinel::empty_value<Value>{-1}};
+  map_type map{initial_size, 
+    cuco::sentinel::empty_key<Key>{-1}, cuco::sentinel::empty_value<Value>{-1}};
   map.insert(d_pairs.begin(), d_pairs.end());
 
   for (auto _ : state) {
diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index bbe8c664b..267910b43 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -109,6 +109,15 @@ class dynamic_map {
 
   dynamic_map(dynamic_map const&) = delete;
   dynamic_map(dynamic_map&&)      = delete;
+
+  template<typename T1, typename T2>
+  dynamic_map(std::size_t, T1, T2,
+              Allocator const& = Allocator{}) = delete;
+  
+  template<typename T1, typename T2, typename T3>
+  dynamic_map(std::size_t, T1, T2, T3,
+              Allocator const& = Allocator{}) = delete;
+  
   dynamic_map& operator=(dynamic_map const&) = delete;
   dynamic_map& operator=(dynamic_map&&) = delete;
 

From 52d83f6cb0d6bd4575d43621cd15baee4baa0870 Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@nvidia.com>
Date: Tue, 5 Apr 2022 16:34:16 -0700
Subject: [PATCH 07/36] erase benchmark added

---
 benchmarks/hash_table/dynamic_map_bench.cu  | 53 ++++++++++++++++++++-
 include/cuco/detail/dynamic_map.inl         | 23 +++++----
 include/cuco/detail/dynamic_map_kernels.cuh |  4 +-
 3 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu
index 8545a47c6..afdd3644b 100644
--- a/benchmarks/hash_table/dynamic_map_bench.cu
+++ b/benchmarks/hash_table/dynamic_map_bench.cu
@@ -57,7 +57,7 @@ static void generate_keys(OutputIt output_begin, OutputIt output_end)
 
 static void gen_final_size(benchmark::internal::Benchmark* b)
 {
-  for (auto size = 10'000'000; size <= 150'000'000; size += 20'000'000) {
+  for (auto size = 10'000'000; size <= 310'000'000; size += 20'000'000) {
     b->Args({size});
   }
 }
@@ -142,15 +142,64 @@ static void BM_dynamic_search_all(::benchmark::State& state)
                           int64_t(state.range(0)));
 }
 
+template <typename Key, typename Value, dist_type Dist>
+static void BM_dynamic_erase_all(::benchmark::State& state)
+{
+  using map_type = cuco::dynamic_map<Key, Value>;
+
+  std::size_t num_keys     = state.range(0);
+  std::size_t initial_size = 1 << 27;
+
+  std::vector<Key> h_keys(num_keys);
+  std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);
+
+  generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
+
+  for (auto i = 0; i < num_keys; ++i) {
+    Key key           = h_keys[i];
+    Value val         = h_keys[i];
+    h_pairs[i].first  = key;
+    h_pairs[i].second = val;
+  }
+
+  thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
+  thrust::device_vector<Key> d_keys(h_keys);
+
+  std::size_t batch_size = 1E6;
+  for (auto _ : state) {
+    map_type map{initial_size, 
+      cuco::sentinel::empty_key<Key>{-1}, 
+      cuco::sentinel::empty_value<Value>{-1},
+      cuco::sentinel::erased_key<Key>{-2}};
+    for (auto i = 0; i < num_keys; i += batch_size) {
+      map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size);
+    }
+    {
+      cuda_event_timer raii{state};
+      for (auto i = 0; i < num_keys; i += batch_size) {
+        map.erase(d_keys.begin() + i, d_keys.begin() + i + batch_size);
+      }
+    }
+  }
+
+  state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
+                          int64_t(state.range(0)));
+}
+
 BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
-
+/*
 BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
+*/
+BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIQUE)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
 
 /*
 BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM)
diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index ecccf82dc..ba9a7bf8b 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -197,17 +197,22 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
   
   // TODO: hacky, improve this
   thrust::device_vector<atomic_ctr_type*> d_submap_num_successes(submap_num_successes_);
+
+  // TODO: hack (how to get size on host?)
+  constexpr size_t temp_storage_size_one_block = 48;
+  auto const temp_storage_size = submaps_.size() * temp_storage_size_one_block;
       
   detail::erase<block_size, tile_size, cuco::pair_type<key_type, mapped_type>>
-    <<<grid_size, block_size>>>(first,
-                                first + num_keys,
-                                submap_views_.data().get(),
-                                submap_mutable_views_.data().get(),
-                                num_successes_,
-                                d_submap_num_successes.data().get(),
-                                submaps_.size(),
-                                hash,
-                                key_equal);
+    <<<grid_size, block_size, temp_storage_size>>>(
+      first,
+      first + num_keys,
+      submap_views_.data().get(),
+      submap_mutable_views_.data().get(),
+      num_successes_,
+      d_submap_num_successes.data().get(),
+      submaps_.size(),
+      hash,
+      key_equal);
   CUCO_CUDA_TRY(cudaDeviceSynchronize());
 
   std::size_t h_num_successes = num_successes_->load(cuda::std::memory_order_relaxed);
diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
index 624a6a85a..599d1d68b 100644
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
@@ -206,11 +206,9 @@ __global__ void erase(InputIt first,
                        KeyEqual key_equal)
 {
   typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
+  extern __shared__ typename BlockReduce::TempStorage temp_submap_storage[];
   __shared__ typename BlockReduce::TempStorage temp_storage;
 
-  // TODO: hack for up to 4 submaps, make this better
-  __shared__ typename BlockReduce::TempStorage temp_submap_storage[4];
-
   std::size_t thread_num_successes = 0;
   std::size_t submap_thread_num_successes[4] = {0, 0, 0, 0};
 

From 0878216aed25ad34fcae870b17138859ca7dee3d Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@nvidia.com>
Date: Tue, 5 Apr 2022 23:29:16 -0700
Subject: [PATCH 08/36] num_successes managed pointer updated

---
 benchmarks/hash_table/dynamic_map_bench.cu  |    2 +-
 include/cuco/detail/dynamic_map.inl         |   50 +-
 include/cuco/detail/dynamic_map_kernels.cuh |    2 +-
 include/cuco/detail/nvtx3.hpp               | 2045 +++++++++++++++++++
 include/cuco/dynamic_map.cuh                |   18 +-
 5 files changed, 2085 insertions(+), 32 deletions(-)
 create mode 100644 include/cuco/detail/nvtx3.hpp

diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu
index afdd3644b..c0306f901 100644
--- a/benchmarks/hash_table/dynamic_map_bench.cu
+++ b/benchmarks/hash_table/dynamic_map_bench.cu
@@ -57,7 +57,7 @@ static void generate_keys(OutputIt output_begin, OutputIt output_end)
 
 static void gen_final_size(benchmark::internal::Benchmark* b)
 {
-  for (auto size = 10'000'000; size <= 310'000'000; size += 20'000'000) {
+  for (auto size = 10'000'000; size <= 10'000'000; size += 20'000'000) {
     b->Args({size});
   }
 }
diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index ba9a7bf8b..2bb1459bc 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+//#include "nvtx3.hpp"
+
 namespace cuco {
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -28,7 +30,8 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(std::size_t initial_capac
     capacity_(initial_capacity),
     min_insert_size_(1E4),
     max_load_factor_(0.60),
-    alloc_{alloc}
+    alloc_{alloc},
+    counter_allocator_{alloc}
 {
   submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>(
     initial_capacity,
@@ -39,8 +42,8 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(std::size_t initial_capac
   submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
 
   submap_num_successes_.push_back(submaps_[0]->get_num_successes());
-
-  CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type)));
+  
+  num_successes_ = std::allocator_traits<counter_allocator_type>::allocate(counter_allocator_, 1);
 }
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -56,7 +59,8 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(std::size_t initial_capac
     capacity_(initial_capacity),
     min_insert_size_(1E4),
     max_load_factor_(0.60),
-    alloc_{alloc}
+    alloc_{alloc},
+    counter_allocator_{alloc}
 {
   submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>(
     initial_capacity,
@@ -68,14 +72,14 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(std::size_t initial_capac
   submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
   submap_num_successes_.push_back(submaps_[0]->get_num_successes());
 
-  CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type)));
+  num_successes_ = std::allocator_traits<counter_allocator_type>::allocate(counter_allocator_, 1);
 }
 
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
 dynamic_map<Key, Value, Scope, Allocator>::~dynamic_map()
 {
-  CUCO_ASSERT_CUDA_SUCCESS(cudaFree(num_successes_));
+  std::allocator_traits<counter_allocator_type>::deallocate(counter_allocator_, num_successes_, 1);
 }
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -126,7 +130,10 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(InputIt first,
                                                        Hash hash,
                                                        KeyEqual key_equal)
 {
+  //nvtx3::thread_range r{"insert"};
+
   std::size_t num_to_insert = std::distance(first, last);
+
   reserve(size_ + num_to_insert);
 
   uint32_t submap_idx = 0;
@@ -137,11 +144,10 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(InputIt first,
     // only if we meet the minimum insert size.
 
     if (capacity_remaining >= min_insert_size_) {
-      *num_successes_ = 0;
-      int device_id;
-      CUCO_CUDA_TRY(cudaGetDevice(&device_id));
-      CUCO_CUDA_TRY(cudaMemPrefetchAsync(num_successes_, sizeof(atomic_ctr_type), device_id));
-
+      // TODO: memset an atomic variable is unsafe
+      static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
+      CUCO_CUDA_TRY(cudaMemset(num_successes_, 0, sizeof(atomic_ctr_type)));
+      
       auto n                = std::min(capacity_remaining, num_to_insert);
       auto const block_size = 128;
       auto const stride     = 1;
@@ -158,9 +164,10 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(InputIt first,
                                     submaps_.size(),
                                     hash,
                                     key_equal);
-      CUCO_CUDA_TRY(cudaDeviceSynchronize());
 
-      std::size_t h_num_successes = num_successes_->load(cuda::std::memory_order_relaxed);
+      std::size_t h_num_successes;
+      CUCO_CUDA_TRY(cudaMemcpy(
+        &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost));
 
       submaps_[submap_idx]->size_ += h_num_successes;
       size_ += h_num_successes;
@@ -178,6 +185,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
                                                        Hash hash,
                                                        KeyEqual key_equal)
 {
+  //nvtx3::thread_range r{"erase"};
   std::size_t num_keys = std::distance(first, last);
 
   auto const block_size = 128;
@@ -185,10 +193,9 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
   auto const tile_size  = 4;
   auto const grid_size  = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
 
-  *num_successes_ = 0;
-  int device_id;
-  CUCO_CUDA_TRY(cudaGetDevice(&device_id));
-  CUCO_CUDA_TRY(cudaMemPrefetchAsync(num_successes_, sizeof(atomic_ctr_type), device_id));
+  // TODO: memset an atomic variable is unsafe
+  static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
+  CUCO_CUDA_TRY(cudaMemset(num_successes_, 0, sizeof(atomic_ctr_type)));
   
   static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
   for(int i = 0; i < submaps_.size(); ++i) {
@@ -213,17 +220,16 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
       submaps_.size(),
       hash,
       key_equal);
-  CUCO_CUDA_TRY(cudaDeviceSynchronize());
-
-  std::size_t h_num_successes = num_successes_->load(cuda::std::memory_order_relaxed);
+      
+  std::size_t h_num_successes;
+  CUCO_CUDA_TRY(cudaMemcpy(
+    &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost));
   size_ -= h_num_successes;
   
   for(int i = 0; i < submaps_.size(); ++i) {
     std::size_t h_submap_num_successes;
     CUCO_CUDA_TRY(cudaMemcpy(
       &h_submap_num_successes, submap_num_successes_[i], sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost));
-
-    CUCO_CUDA_TRY(cudaDeviceSynchronize());  // stream sync to ensure h_num_successes is updated
     submaps_[i]->size_ -= h_submap_num_successes;
   }
 }
diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
index 599d1d68b..c5605d463 100644
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
@@ -201,7 +201,7 @@ __global__ void erase(InputIt first,
                        mutableViewT* submap_mutable_views,
                        atomicT* num_successes,
                        atomicT** submap_num_successes,
-                       uint32_t num_submaps,
+                       const uint32_t num_submaps,
                        Hash hash,
                        KeyEqual key_equal)
 {
diff --git a/include/cuco/detail/nvtx3.hpp b/include/cuco/detail/nvtx3.hpp
new file mode 100644
index 000000000..08a02153b
--- /dev/null
+++ b/include/cuco/detail/nvtx3.hpp
@@ -0,0 +1,2045 @@
+/*
+ *  Copyright (c) 2020, NVIDIA CORPORATION.
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+#pragma once
+
+#if defined(NVTX3_MINOR_VERSION) and NVTX3_MINOR_VERSION < 0
+#error \
+    "Trying to #include NVTX version 3 in a source file where an older NVTX version has already been included.  If you are not directly using NVTX (the NVIDIA Tools Extension library), you are getting this error because libraries you are using have included different versions of NVTX.  Suggested solutions are: (1) reorder #includes so the newest NVTX version is included first, (2) avoid using the conflicting libraries in the same .c/.cpp file, or (3) update the library using the older NVTX version to use the newer version instead."
+#endif
+
+/**
+ * @brief Semantic minor version number.
+ *
+ * Major version number is hardcoded into the "nvtx3" namespace/prefix.
+ *
+ * If this value is incremented, the above version include guard needs to be
+ * updated.
+ *
+ */
+#define NVTX3_MINOR_VERSION 0
+
+#include <nvtx3/nvToolsExt.h>
+#include <cstdint>
+#include <string>
+#include <type_traits>
+/**
+ * @file nvtx3.hpp
+ *
+ * @brief Provides C++ constructs making the NVTX library safer and easier to
+ * use with zero overhead.
+ */
+
+/**
+ * \mainpage
+ * \tableofcontents
+ *
+ * \section QUICK_START Quick Start
+ *
+ * To add NVTX ranges to your code, use the `nvtx3::thread_range` RAII object. A
+ * range begins when the object is created, and ends when the object is
+ * destroyed.
+ *
+ * \code{.cpp}
+ * #include "nvtx3.hpp"
+ * void some_function(){
+ *    // Begins a NVTX range with the message "some_function"
+ *    // The range ends when some_function() returns and `r` is destroyed
+ *    nvtx3::thread_range r{"some_function"};
+ *
+ *    for(int i = 0; i < 6; ++i){
+ *       nvtx3::thread_range loop{"loop range"};
+ *       std::this_thread::sleep_for(std::chrono::seconds{1});
+ *    }
+ * } // Range ends when `r` is destroyed
+ * \endcode
+ *
+ * The example code above generates the following timeline view in Nsight
+ * Systems:
+ *
+ * \image html
+ * https://raw.githubusercontent.com/jrhemstad/nvtx_wrappers/master/docs/example_range.png
+ *
+ * Alternatively, use the \ref MACROS like `NVTX3_FUNC_RANGE()` to add
+ * ranges to your code that automatically use the name of the enclosing function
+ * as the range's message.
+ *
+ * \code{.cpp}
+ * #include "nvtx3.hpp"
+ * void some_function(){
+ *    // Creates a range with the message "some_function".
+ *    // The range ends when the enclosing function
+ *    // returns.
+ *    NVTX3_FUNC_RANGE();
+ *    ...
+ * }
+ * \endcode
+ *
+ *
+ * \section Overview
+ *
+ * The NVTX library provides a set of functions for users to annotate their code
+ * to aid in performance profiling and optimization. These annotations provide
+ * information to tools like Nsight Systems to improve visualization of
+ * application timelines.
+ *
+ * \ref RANGES are one of the most commonly used NVTX constructs for annotating
+ * a span of time. For example, imagine a user wanted to see every time a
+ * function, `my_function`, is called and how long it takes to execute. This can
+ * be accomplished with an NVTX range created on the entry to the function and
+ * terminated on return from `my_function` using the push/pop C APIs:
+ *
+ * ```
+ * void my_function(...){
+ *    nvtxRangePushA("my_function"); // Begins NVTX range
+ *    // do work
+ *    nvtxRangePop(); // Ends NVTX range
+ * }
+ * ```
+ *
+ * One of the challenges with using the NVTX C API is that it requires manually
+ * terminating the end of the range with `nvtxRangePop`. This can be challenging
+ * if `my_function()` has multiple returns or can throw exceptions as it
+ * requires calling `nvtxRangePop()` before all possible return points.
+ *
+ * NVTX++ solves this inconvenience through the "RAII" technique by providing a
+ * `nvtx3::thread_range` class that begins a range at construction and ends the
+ * range on destruction. The above example then becomes:
+ *
+ * ```
+ * void my_function(...){
+ *    nvtx3::thread_range r{"my_function"}; // Begins NVTX range
+ *    // do work
+ * } // Range ends on exit from `my_function` when `r` is destroyed
+ * ```
+ *
+ * The range object `r` is deterministically destroyed whenever `my_function`
+ * returns---ending the NVTX range without manual intervention. For more
+ * information, see \ref RANGES and `nvtx3::domain_thread_range`.
+ *
+ * Another inconvenience of the NVTX C APIs are the several constructs where the
+ * user is expected to initialize an object at the beginning of an application
+ * and reuse that object throughout the lifetime of the application. For example
+ * Domains, Categories, and Registered messages.
+ *
+ * Example:
+ * ```
+ * nvtxDomainHandle_t D = nvtxDomainCreateA("my domain");
+ * // Reuse `D` throughout the rest of the application
+ * ```
+ *
+ * This can be problematic if the user application or library does not have an
+ * explicit initialization function called before all other functions to
+ * ensure that these long-lived objects are initialized before being used.
+ *
+ * NVTX++ makes use of the "construct on first use" technique to alleviate this
+ * inconvenience. In short, a function local static object is constructed upon
+ * the first invocation of a function and returns a reference to that object on
+ * all future invocations. See the documentation for
+ * `nvtx3::registered_message`, `nvtx3::domain`, `nvtx3::named_category`,  and
+ * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use for more
+ * information.
+ *
+ * Using construct on first use, the above example becomes:
+ * ```
+ * struct my_domain{ static constexpr char const* name{"my domain"}; };
+ *
+ * // The first invocation of `domain::get` for the type `my_domain` will
+ * // construct a `nvtx3::domain` object and return a reference to it. Future
+ * // invocations simply return a reference.
+ * nvtx3::domain const& D = nvtx3::domain::get<my_domain>();
+ * ```
+ * For more information about NVTX and how it can be used, see
+ * https://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvtx and
+ * https://devblogs.nvidia.com/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/
+ * for more information.
+ *
+ * \section RANGES Ranges
+ *
+ * Ranges are used to describe a span of time during the execution of an
+ * application. Common examples are using ranges to annotate the time it takes
+ * to execute a function or an iteration of a loop.
+ *
+ * NVTX++ uses RAII to automate the generation of ranges that are tied to the
+ * lifetime of objects. Similar to `std::lock_guard` in the C++ Standard
+ * Template Library.
+ *
+ * \subsection THREAD_RANGE Thread Range
+ *
+ * `nvtx3::domain_thread_range` is a class that begins a range upon construction
+ * and ends the range at destruction. This is one of the most commonly used
+ * constructs in NVTX++ and is useful for annotating spans of time on a
+ * particular thread. These ranges can be nested to arbitrary depths.
+ *
+ * `nvtx3::thread_range` is an alias for a `nvtx3::domain_thread_range` in the
+ * global NVTX domain. For more information about Domains, see \ref DOMAINS.
+ *
+ * Various attributes of a range can be configured constructing a
+ * `nvtx3::domain_thread_range` with a `nvtx3::event_attributes` object. For
+ * more information, see \ref ATTRIBUTES.
+ *
+ * Example:
+ *
+ * \code{.cpp}
+ * void some_function(){
+ *    // Creates a range for the duration of `some_function`
+ *    nvtx3::thread_range r{};
+ *
+ *    while(true){
+ *       // Creates a range for every loop iteration
+ *       // `loop_range` is nested inside `r`
+ *       nvtx3::thread_range loop_range{};
+ *    }
+ * }
+ * \endcode
+ *
+ * \subsection PROCESS_RANGE Process Range
+ *
+ * `nvtx3::domain_process_range` is identical to `nvtx3::domain_thread_range`
+ * with the exception that a `domain_process_range` can be created and destroyed
+ * on different threads. This is useful to annotate spans of time that can
+ * bridge multiple threads.
+ *
+ * `nvtx3::domain_thread_range`s should be preferred unless one needs the
+ * ability to begin and end a range on different threads.
+ *
+ * \section MARKS Marks
+ *
+ * `nvtx3::mark` allows annotating an instantaneous event in an application's
+ * timeline. For example, indicating when a mutex is locked or unlocked.
+ * 
+ * \code{.cpp}
+ * std::mutex global_lock;
+ * void lock_mutex(){
+ *    global_lock.lock();
+ *    // Marks an event immediately after the mutex is locked
+ *    nvtx3::mark<my_domain>("lock_mutex");
+ * }
+ * \endcode
+ *
+ * \section DOMAINS Domains
+ *
+ * Similar to C++ namespaces, Domains allow for scoping NVTX events. By default,
+ * all NVTX events belong to the "global" domain. Libraries and applications
+ * should scope their events to use a custom domain to differentiate where the
+ * events originate from.
+ *
+ * It is common for a library or application to have only a single domain and
+ * for the name of that domain to be known at compile time. Therefore, Domains
+ * in NVTX++ are represented by _tag types_.
+ *
+ * For example, to define a custom  domain, simply define a new concrete type
+ * (a `class` or `struct`) with a `static` member called `name` that contains
+ * the desired name of the domain.
+ *
+ * ```
+ * struct my_domain{ static constexpr char const* name{"my domain"}; };
+ * ```
+ *
+ * For any NVTX++ construct that can be scoped to a domain, the type `my_domain`
+ * can be passed as an explicit template argument to scope it to the custom
+ * domain.
+ *
+ * The tag type `nvtx3::domain::global` represents the global NVTX domain.
+ *
+ * \code{.cpp}
+ * // By default, `domain_thread_range` belongs to the global domain
+ * nvtx3::domain_thread_range<> r0{};
+ *
+ * // Alias for a `domain_thread_range` in the global domain
+ * nvtx3::thread_range r1{};
+ *
+ * // `r` belongs to the custom domain
+ * nvtx3::domain_thread_range<my_domain> r{};
+ * \endcode
+ *
+ * When using a custom domain, it is recommended to define type aliases for NVTX
+ * constructs in the custom domain.
+ * ```
+ * using my_thread_range = nvtx3::domain_thread_range<my_domain>;
+ * using my_registered_message = nvtx3::registered_message<my_domain>;
+ * using my_named_category = nvtx3::named_category<my_domain>;
+ * ```
+ *
+ * See `nvtx3::domain` for more information.
+ *
+ * \section ATTRIBUTES Event Attributes
+ *
+ * NVTX events can be customized with various attributes to provide additional
+ * information (such as a custom message) or to control visualization of the
+ * event (such as the color used). These attributes can be specified per-event
+ * via arguments to a `nvtx3::event_attributes` object.
+ *
+ * NVTX events can be customized via four "attributes":
+ * - \ref COLOR : color used to visualize the event in tools.
+ * - \ref MESSAGES :  Custom message string.
+ * - \ref PAYLOAD :  User-defined numerical value.
+ * - \ref CATEGORY : Intra-domain grouping.
+ *
+ * It is possible to construct a `nvtx3::event_attributes` from any number of
+ * attribute objects (nvtx3::color, nvtx3::message, nvtx3::payload,
+ * nvtx3::category) in any order. If an attribute is not specified, a tool
+ * specific default value is used. See `nvtx3::event_attributes` for more
+ * information.
+ *
+ * \code{.cpp}
+ * // Custom color, message
+ * event_attributes attr{nvtx3::rgb{127, 255, 0},
+ *                      "message"};
+ *
+ * // Custom color, message, payload, category
+ * event_attributes attr{nvtx3::rgb{127, 255, 0},
+ *                      nvtx3::payload{42},
+ *                      "message",
+ *                      nvtx3::category{1}};
+ *
+ * // Arguments can be in any order
+ * event_attributes attr{nvtx3::payload{42},
+ *                      nvtx3::category{1},
+ *                      "message",
+ *                      nvtx3::rgb{127, 255, 0}};
+ *
+ * // "First wins" with multiple arguments of the same type
+ * event_attributes attr{ nvtx3::payload{42}, nvtx3::payload{7} }; // payload is 42
+ * \endcode
+ *
+ * \subsection MESSAGES message
+ *
+ * A `nvtx3::message` allows associating a custom message string with an NVTX
+ * event.
+ *
+ * Example:
+ * \code{.cpp}
+ * // Create an `event_attributes` with the custom message "my message"
+ * nvtx3::event_attributes attr{nvtx3::message{"my message"}};
+ *
+ * // strings and string literals implicitly assumed to be a `nvtx3::message`
+ * nvtx3::event_attributes attr{"my message"};
+ * \endcode
+ *
+ * \subsubsection REGISTERED_MESSAGE Registered Messages
+ *
+ * Associating a `nvtx3::message` with an event requires copying the contents of
+ * the message every time the message is used, i.e., copying the entire message
+ * string. This may cause non-trivial overhead in performance sensitive code.
+ *
+ * To eliminate this overhead, NVTX allows registering a message string,
+ * yielding a "handle" that is inexpensive to copy that may be used in place of
+ * a message string. When visualizing the events, tools such as Nsight Systems
+ * will take care of mapping the message handle to its string.
+ *
+ * A message should be registered once and the handle reused throughout the rest
+ * of the application. This can be done by either explicitly creating static
+ * `nvtx3::registered_message` objects, or using the
+ * `nvtx3::registered_message::get` construct on first use helper (recommended).
+ *
+ * Similar to \ref DOMAINS, `nvtx3::registered_message::get` requires defining a
+ * custom tag type with a static `message` member whose value will be the
+ * contents of the registered string.
+ *
+ * Example:
+ * \code{.cpp}
+ * // Explicitly constructed, static `registered_message`
+ * static registered_message<my_domain> static_message{"my message"};
+ *
+ * // Or use construct on first use:
+ * // Define a tag type with a `message` member string to register
+ * struct my_message{ static constexpr char const* message{ "my message" }; };
+ *
+ * // Uses construct on first use to register the contents of
+ * // `my_message::message`
+ * nvtx3::registered_message<my_domain> const& msg =
+ * nvtx3::registered_message<my_domain>::get<my_message>(); \endcode
+ *
+ * \subsection COLOR color
+ *
+ * Associating a `nvtx3::color` with an event allows controlling how the event
+ * is visualized in a tool such as Nsight Systems. This is a convenient way to
+ * visually differentiate among different events.
+ *
+ * \code{.cpp}
+ * // Define a color via rgb color values
+ * nvtx3::color c{nvtx3::rgb{127, 255, 0}};
+ * nvtx3::event_attributes attr{c};
+ *
+ * // rgb color values can be passed directly to an `event_attributes`
+ * nvtx3::event_attributes attr1{nvtx3::rgb{127,255,0}};
+ * \endcode
+ *
+ * \subsection CATEGORY category
+ *
+ * A `nvtx3::category` is simply an integer id that allows for fine-grain
+ * grouping of NVTX events. For example, one might use separate categories for
+ * IO, memory allocation, compute, etc.
+ *
+ * \code{.cpp}
+ * nvtx3::event_attributes{nvtx3::category{1}};
+ * \endcode
+ *
+ * \subsubsection NAMED_CATEGORIES Named Categories
+ *
+ * Associates a `name` string with a category `id` to help differentiate among
+ * categories.
+ *
+ * For any given category id `Id`, a `named_category{Id, "name"}` should only
+ * be constructed once and reused throughout an application. This can be done by
+ * either explicitly creating static `nvtx3::named_category` objects, or using
+ * the `nvtx3::named_category::get` construct on first use helper (recommended).
+ *
+ * Similar to \ref DOMAINS, `nvtx3::named_category::get` requires defining a
+ * custom tag type with static `name` and `id` members.
+ *
+ * \code{.cpp}
+ * // Explicitly constructed, static `named_category`
+ * static nvtx3::named_category static_category{42, "my category"};
+ *
+ * // OR use construct on first use:
+ * // Define a tag type with `name` and `id` members
+ * struct my_category{
+ *    static constexpr char const* name{"my category"}; // category name
+ *    static constexpr category::id_type id{42}; // category id
+ * };
+ *
+ * // Use construct on first use to name the category id `42`
+ * // with name "my category"
+ * nvtx3::named_category const& my_category =
+ * named_category<my_domain>::get<my_category>();
+ *
+ * // Range `r` associated with category id `42`
+ * nvtx3::event_attributes attr{my_category};
+ * \endcode
+ *
+ * \subsection PAYLOAD payload
+ *
+ * Allows associating a user-defined numerical value with an event.
+ *
+ * ```
+ * // Constructs a payload from the `int32_t` value 42 and uses it
+ * // to initialize an `event_attributes`
+ * nvtx3::event_attributes attr{nvtx3::payload{42}};
+ * ```
+ *
+ *
+ * \section EXAMPLE Example
+ *
+ * Putting it all together:
+ * \code{.cpp}
+ * // Define a custom domain tag type
+ * struct my_domain{ static constexpr char const* name{"my domain"}; };
+ *
+ * // Define a named category tag type
+ * struct my_category{
+ *    static constexpr char const* name{"my category"};
+ *    static constexpr uint32_t id{42};
+ * };
+ *
+ * // Define a registered message tag type
+ * struct my_message{ static constexpr char const* message{"my message"}; };
+ *
+ * // For convenience, use aliases for domain scoped objects
+ * using my_thread_range = nvtx3::domain_thread_range<my_domain>;
+ * using my_registered_message = nvtx3::registered_message<my_domain>;
+ * using my_named_category = nvtx3::named_category<my_domain>;
+ *
+ * // Default values for all attributes
+ * nvtx3::event_attributes attr{};
+ * my_thread_range r0{attr};
+ *
+ * // Custom (unregistered) message, and unnamed category
+ * nvtx3::event_attributes attr1{"message", nvtx3::category{2}};
+ * my_thread_range r1{attr1};
+ *
+ * // Alternatively, pass arguments of `event_attributes` ctor directly to
+ * // `my_thread_range`
+ * my_thread_range r2{"message", nvtx3::category{2}};
+ *
+ * // construct on first use a registered message
+ * auto msg = my_registered_message::get<my_message>();
+ *
+ * // construct on first use a named category
+ * auto category = my_named_category::get<my_category>();
+ *
+ * // Use registered message and named category
+ * my_thread_range r3{msg, category, nvtx3::rgb{127, 255, 0},
+ *                    nvtx3::payload{42}};
+ *
+ * // Any number of arguments in any order
+ * my_thread_range r{nvtx3::rgb{127, 255,0}, msg};
+ *
+ * \endcode
+ * \section MACROS Convenience Macros
+ *
+ * Oftentimes users want to quickly and easily add NVTX ranges to their library
+ * or application to aid in profiling and optimization.
+ *
+ * A convenient way to do this is to use the \ref NVTX3_FUNC_RANGE and
+ * \ref NVTX3_FUNC_RANGE_IN macros. These macros take care of constructing an
+ * `nvtx3::domain_thread_range` with the name of the enclosing function as the
+ * range's message.
+ *
+ * \code{.cpp}
+ * void some_function(){
+ *    // Automatically generates an NVTX range for the duration of the function
+ *    // using "some_function" as the event's message.
+ *    NVTX3_FUNC_RANGE();
+ * }
+ * \endcode
+ *
+ */
+
+/**
+ * @brief Enables the use of constexpr when support for C++14 relaxed constexpr
+ * is present.
+ *
+ * Initializing a legacy-C (i.e., no constructor) union member requires
+ * initializing in the constructor body. Non-empty constexpr constructors
+ * require C++14 relaxed constexpr.
+ *
+ */
+#if __cpp_constexpr >= 201304L
+#define NVTX3_RELAXED_CONSTEXPR constexpr
+#else
+#define NVTX3_RELAXED_CONSTEXPR
+#endif
+
+namespace nvtx3 {
+namespace detail {
+
+/**
+ * @brief Verifies if a type `T` contains a member `T::name` of type `const
+ * char*` or `const wchar_t*`.
+ *
+ * @tparam T The type to verify
+ * @return True if `T` contains a member `T::name` of type `const char*` or
+ * `const wchar_t*`.
+ */
+template <typename T>
+constexpr auto has_name_member() noexcept -> decltype(T::name, bool()) {
+  // Decay so cv-qualified or array `name` members compare as plain pointers.
+  using name_type = typename std::decay<decltype(T::name)>::type;
+  return std::is_same<name_type, char const*>::value ||
+         std::is_same<name_type, wchar_t const*>::value;
+}
+}  // namespace detail
+
+/**
+ * @brief `domain`s allow for grouping NVTX events into a single scope to
+ * differentiate them from events in other `domain`s.
+ *
+ * By default, all NVTX constructs are placed in the "global" NVTX domain.
+ *
+ * A custom `domain` may be used in order to differentiate a library's or
+ * application's NVTX events from other events.
+ *
+ * `domain`s are expected to be long-lived and unique to a library or
+ * application. As such, it is assumed a domain's name is known at compile
+ * time. Therefore, all NVTX constructs that can be associated with a domain
+ * require the domain to be specified via a *type* `DomainName` passed as an
+ * explicit template parameter.
+ *
+ * The type `domain::global` may be used to indicate that the global NVTX
+ * domain should be used.
+ *
+ * None of the C++ NVTX constructs require the user to manually construct a
+ * `domain` object. Instead, if a custom domain is desired, the user is
+ * expected to define a type `DomainName` that contains a member
+ * `DomainName::name` which resolves to either a `char const*` or `wchar_t
+ * const*`. The value of `DomainName::name` is used to name and uniquely
+ * identify the custom domain.
+ *
+ * Upon the first use of an NVTX construct associated with the type
+ * `DomainName`, the "construct on first use" pattern is used to construct a
+ * function local static `domain` object. All future NVTX constructs
+ * associated with `DomainName` will use a reference to the previously
+ * constructed `domain` object. See `domain::get`.
+ *
+ * Example:
+ * ```
+ * // The type `my_domain` defines a `name` member
+ * // used to name and identify the
+ * // `domain` object identified by `my_domain`.
+ * struct my_domain{ static constexpr char const* name{"my_domain"}; };
+ *
+ * // The NVTX range `r` will be grouped with all other NVTX constructs
+ * // associated with  `my_domain`.
+ * nvtx3::domain_thread_range<my_domain> r{};
+ *
+ * // An alias can be created for a `domain_thread_range` in the custom domain
+ * using my_thread_range = nvtx3::domain_thread_range<my_domain>;
+ * my_thread_range my_range{};
+ *
+ * // `domain::global` indicates that the global NVTX domain is used
+ * nvtx3::domain_thread_range<domain::global> r2{};
+ *
+ * // For convenience, `nvtx3::thread_range` is an alias for a range in the
+ * // global domain
+ * nvtx3::thread_range r3{};
+ * ```
+ */
+class domain {
+ public:
+  domain(domain const&) = delete;
+  domain& operator=(domain const&) = delete;
+  domain(domain&&) = delete;
+  domain& operator=(domain&&) = delete;
+
+  /**
+   * @brief Returns reference to an instance of a function local static
+   * `domain` object.
+   *
+   * Uses the "construct on first use" idiom to safely ensure the `domain`
+   * object is initialized exactly once upon first invocation of
+   * `domain::get<DomainName>()`. All following invocations will return a
+   * reference to the previously constructed `domain` object. See
+   * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use
+   *
+   * None of the constructs in this header require the user to directly invoke
+   * `domain::get`. It is automatically invoked when constructing objects like
+   * a `domain_thread_range` or `category`. Advanced users may wish to use
+   * `domain::get` for the convenience of the "construct on first use" idiom
+   * when using domains with their own use of the NVTX C API.
+   *
+   * This function is threadsafe as of C++11. If two or more threads call
+   * `domain::get<DomainName>` concurrently, exactly one of them is guaranteed
+   * to construct the `domain` object and the other(s) will receive a
+   * reference to the object after it is fully constructed.
+   *
+   * The domain's name is specified via the type `DomainName` passed as an
+   * explicit template parameter. `DomainName` is required to contain a
+   * member `DomainName::name` that resolves to either a `char const*` or
+   * `wchar_t const*`. The value of `DomainName::name` is used to name and
+   * uniquely identify the `domain`.
+   *
+   * Example:
+   * ```
+   * // The type `my_domain` defines a `name` member used to name and identify
+   * // the `domain` object identified by `my_domain`.
+   * struct my_domain{ static constexpr char const* name{"my domain"}; };
+   *
+   * auto D = domain::get<my_domain>(); // First invocation constructs a
+   *                                    // `domain` with the name "my domain"
+   *
+   * auto D1 = domain::get<my_domain>(); // Simply returns reference to
+   *                                     // previously constructed `domain`.
+   * ```
+   *
+   * @tparam DomainName Type that contains a `DomainName::name` member used to
+   * name the `domain` object.
+   * @return Reference to the `domain` corresponding to the type `DomainName`.
+   */
+  template <typename DomainName>
+  static domain const& get() {
+    static_assert(detail::has_name_member<DomainName>(),
+                  "Type used to identify a domain must contain a name member of"
+                  " type const char* or const wchar_t*");
+    static domain const d{DomainName::name};
+    return d;
+  }
+
+  /**
+   * @brief Conversion operator to `nvtxDomainHandle_t`.
+   *
+   * Allows transparently passing a domain object into an API expecting a
+   * native `nvtxDomainHandle_t` object.
+   */
+  operator nvtxDomainHandle_t() const noexcept { return _domain; }
+
+  /**
+   * @brief Tag type for the "global" NVTX domain.
+   *
+   * This type may be passed as a template argument to any function/class
+   * expecting a type to identify a domain to indicate that the global domain
+   * should be used.
+   *
+   * All NVTX events in the global domain across all libraries and
+   * applications will be grouped together.
+   *
+   */
+  struct global {};
+
+ private:
+  /**
+   * @brief Construct a new domain with the specified `name`.
+   *
+   * This constructor is private as it is intended that `domain` objects only
+   * be created through the `domain::get` function.
+   *
+   * @param name A unique name identifying the domain
+   */
+  explicit domain(char const* name) noexcept
+      : _domain{nvtxDomainCreateA(name)} {}
+
+  /**
+   * @brief Construct a new domain with the specified `name`.
+   *
+   * This constructor is private as it is intended that `domain` objects only
+   * be created through the `domain::get` function.
+   *
+   * @param name A unique name identifying the domain
+   */
+  explicit domain(wchar_t const* name) noexcept
+      : _domain{nvtxDomainCreateW(name)} {}
+
+  /**
+   * @brief Construct a new domain with the specified `name`.
+   *
+   * This constructor is private as it is intended that `domain` objects only
+   * be created through the `domain::get` function.
+   *
+   * @param name A unique name identifying the domain
+   */
+  explicit domain(std::string const& name) noexcept : domain{name.c_str()} {}
+
+  /**
+   * @brief Construct a new domain with the specified `name`.
+   *
+   * This constructor is private as it is intended that `domain` objects only
+   * be created through the `domain::get` function.
+   *
+   * @param name A unique name identifying the domain
+   */
+  explicit domain(std::wstring const& name) noexcept : domain{name.c_str()} {}
+
+  /**
+   * @brief Default constructor creates a `domain` representing the
+   * "global" NVTX domain.
+   *
+   * All events not associated with a custom `domain` are grouped in the
+   * "global" NVTX domain.
+   *
+   */
+  domain() = default;
+
+  /**
+   * @brief Destroy the domain object, unregistering and freeing all domain
+   * specific resources.
+   */
+  ~domain() noexcept { nvtxDomainDestroy(_domain); }
+
+ private:
+  nvtxDomainHandle_t const _domain{};  ///< The `domain`s NVTX handle
+};
+
+/**
+ * @brief Returns reference to the `domain` object that represents the global
+ * NVTX domain.
+ *
+ * This specialization for `domain::global` returns a default constructed,
+ * `domain` object for use when the "global" domain is desired.
+ *
+ * All NVTX events in the global domain across all libraries and applications
+ * will be grouped together.
+ *
+ * @return Reference to the `domain` corresponding to the global NVTX domain.
+ *
+ */
+template <>
+inline domain const& domain::get<domain::global>() {
+  static domain const global_domain{};  // value-initialized handle
+  return global_domain;
+}
+
+/**
+ * @brief Indicates the values of the red, green, blue color channels for
+ * a rgb color code.
+ *
+ */
+struct rgb {
+  /// Unsigned 8-bit type holding a single channel's intensity
+  typedef uint8_t component_type;
+
+  /**
+   * @brief Builds an `rgb` triple from three individual channel intensities.
+   *
+   * Valid channel values lie in the range `[0,255]`.
+   *
+   * @param red_ Intensity of the red channel
+   * @param green_ Intensity of the green channel
+   * @param blue_ Intensity of the blue channel
+   */
+  constexpr rgb(component_type red_,
+                component_type green_,
+                component_type blue_) noexcept
+      : red(red_), green(green_), blue(blue_) {}
+
+  const component_type red{};    ///< Red channel intensity
+  const component_type green{};  ///< Green channel intensity
+  const component_type blue{};   ///< Blue channel intensity
+};
+
+/**
+ * @brief Indicates the value of the alpha, red, green, and blue color
+ * channels for an argb color code.
+ *
+ */
+struct argb final : rgb {
+  /**
+   * @brief Builds an argb color code: the `rgb` base receives `red_`,
+   * `green_`, and `blue_`, while `alpha_` is stored in `alpha`.
+   *
+   * Every channel accepts any value in the range `[0,255]`.
+   *
+   * @param alpha_ Intensity of the alpha channel (opacity)
+   * @param red_ Intensity of the red channel
+   * @param green_ Intensity of the green channel
+   * @param blue_ Intensity of the blue channel
+   */
+  constexpr argb(component_type alpha_,
+                 component_type red_, component_type green_,
+                 component_type blue_) noexcept
+      : rgb(red_, green_, blue_), alpha(alpha_) {}
+
+  component_type const alpha{};  ///< Intensity of the alpha channel
+};
+
+/**
+ * @brief Represents a custom color that can be associated with an NVTX event
+ * via its `event_attributes`.
+ *
+ * Specifying colors for NVTX events is a convenient way to visually
+ * differentiate among different events in a visualization tool such as Nsight
+ * Systems.
+ *
+ */
+class color {
+ public:
+  /// Type used for the color's value
+  using value_type = uint32_t;
+
+  /**
+   * @brief Constructs a `color` using the value provided by `hex_code`.
+   *
+   * `hex_code` is expected to be a 4 byte argb hex code.
+   *
+   * The most significant byte indicates the value of the alpha channel
+   * (opacity) (0-255)
+   *
+   * The next byte indicates the value of the red channel (0-255)
+   *
+   * The next byte indicates the value of the green channel (0-255)
+   *
+   * The least significant byte indicates the value of the blue channel
+   * (0-255)
+   *
+   * @param hex_code The hex code used to construct the `color`
+   */
+  constexpr explicit color(value_type hex_code) noexcept : _value{hex_code} {}
+
+  /**
+   * @brief Construct a `color` using the alpha, red, green, blue components
+   * in `argb`.
+   *
+   * @param argb The alpha, red, green, blue components of the desired `color`
+   */
+  constexpr color(argb argb) noexcept
+      : color{from_bytes_msb_to_lsb(argb.alpha, argb.red, argb.green,
+                                    argb.blue)} {}
+
+  /**
+   * @brief Construct a `color` using the red, green, blue components in
+   * `rgb`.
+   *
+   * Uses maximum value for the alpha channel (opacity) of the `color`.
+   *
+   * @param rgb The red, green, blue components of the desired `color`
+   */
+  constexpr color(rgb rgb) noexcept
+      : color{from_bytes_msb_to_lsb(0xFF, rgb.red, rgb.green, rgb.blue)} {}
+
+  /**
+   * @brief Returns the `color`s argb hex code
+   * @return The stored 4 byte argb value
+   */
+  constexpr value_type get_value() const noexcept { return _value; }
+
+  /**
+   * @brief Return the NVTX color type of the color.
+   * @return The stored color type (initialized to `NVTX_COLOR_ARGB`)
+   */
+  constexpr nvtxColorType_t get_type() const noexcept { return _type; }
+
+  color() = delete;
+  ~color() = default;
+  color(color const&) = default;
+  color& operator=(color const&) = default;
+  color(color&&) = default;
+  color& operator=(color&&) = default;
+
+ private:
+  /**
+   * @brief Constructs an unsigned, 4B integer from the component bytes in
+   * most to least significant byte order.
+   */
+  constexpr static value_type from_bytes_msb_to_lsb(uint8_t byte3,
+                                                    uint8_t byte2,
+                                                    uint8_t byte1,
+                                                    uint8_t byte0) noexcept {
+    return (uint32_t{byte3} << 24) | (uint32_t{byte2} << 16) |
+           (uint32_t{byte1} << 8) | uint32_t{byte0};
+  }
+
+  // Non-const so that the defaulted copy/move assignment operators are usable.
+  value_type _value{};                     ///< color's argb color code
+  nvtxColorType_t _type{NVTX_COLOR_ARGB};  ///< NVTX color type code
+};
+
+/**
+ * @brief Object for intra-domain grouping of NVTX events.
+ *
+ * A `category` is simply an integer id that allows for fine-grain grouping of
+ * NVTX events. For example, one might use separate categories for IO, memory
+ * allocation, compute, etc.
+ *
+ * Example:
+ * \code{.cpp}
+ * nvtx3::category cat1{1};
+ *
+ * // Range `r1` belongs to the category identified by the value `1`.
+ * nvtx3::thread_range r1{cat1};
+ *
+ * // Range `r2` belongs to the same category as `r1`
+ * nvtx3::thread_range r2{nvtx3::category{1}};
+ * \endcode
+ *
+ * To associate a name string with a category id, see `named_category`.
+ *
+ */
+class category {
+ public:
+  /// Type used for `category`s integer id.
+  using id_type = uint32_t;
+
+  /**
+   * @brief Construct a `category` with the specified `id`.
+   *
+   * The `category` will be unnamed and identified only by its `id` value.
+   *
+   * All `category` objects sharing the same `id` are equivalent.
+   *
+   * @param[in] id The `category`'s identifying value
+   */
+  constexpr explicit category(id_type id) noexcept : id_{id} {}
+
+  /**
+   * @brief Returns the id of the category.
+   * @return The id this `category` currently holds
+   */
+  constexpr id_type get_id() const noexcept { return id_; }
+
+  category() = delete;
+  ~category() = default;
+  category(category const&) = default;
+  category& operator=(category const&) = default;
+  category(category&&) = default;
+  category& operator=(category&&) = default;
+
+ private:
+  id_type id_{};  ///< Unique identifier; non-const so assignment works
+};
+
+/**
+ * @brief A `category` with an associated name string.
+ *
+ * Associates a `name` string with a category `id` to help differentiate among
+ * categories.
+ *
+ * For any given category id `Id`, a `named_category(Id, "name")` should only
+ * be constructed once and reused throughout an application. This can be done
+ * by either explicitly creating static `named_category` objects, or using the
+ * `named_category::get` construct on first use helper (recommended).
+ *
+ * Creating two or more `named_category` objects with the same value for `id`
+ * in the same domain results in undefined behavior.
+ *
+ * Similarly, behavior is undefined when a `named_category` and `category`
+ * share the same value of `id`.
+ *
+ * Example:
+ * \code{.cpp}
+ * // Explicitly constructed, static `named_category`
+ * static nvtx3::named_category static_category{42, "my category"};
+ *
+ * // Range `r` associated with category id `42`
+ * nvtx3::thread_range r{static_category};
+ *
+ * // OR use construct on first use:
+ *
+ * // Define a type with `name` and `id` members
+ * struct my_category{
+ *    static constexpr char const* name{"my category"}; // category name
+ *    static constexpr category::id_type id{42}; // category id
+ * };
+ *
+ * // Use construct on first use to name the category id `42`
+ * // with name "my category"
+ * auto cat = named_category<my_domain>::get<my_category>();
+ *
+ * // Range `r` associated with category id `42`
+ * nvtx3::thread_range r{cat};
+ * \endcode
+ *
+ * `named_category`'s association of a name to a category id is local to the
+ * domain specified by the type `D`. An id may have a different name in
+ * another domain.
+ *
+ * @tparam D Type containing `name` member used to identify the `domain` to
+ * which the `named_category` belongs. Else, `domain::global` to  indicate
+ * that the global NVTX domain should be used.
+ */
+template <typename D = domain::global>
+class named_category final : public category {
+ public:
+  /**
+   * @brief Returns a global instance of a `named_category` as a
+   * function-local static.
+   *
+   * Creates a `named_category` with name and id specified by the contents of
+   * a type `C`. `C::name` determines the name and `C::id` determines the
+   * category id.
+   *
+   * This function is useful for constructing a named `category` exactly once
+   * and reusing the same instance throughout an application.
+   *
+   * Example:
+   * \code{.cpp}
+   * // Define a type with `name` and `id` members
+   * struct my_category{
+   *    static constexpr char const* name{"my category"}; // category name
+   *    static constexpr uint32_t id{42}; // category id
+   * };
+   *
+   * // Use construct on first use to name the category id `42`
+   * // with name "my category"
+   * auto cat = named_category<my_domain>::get<my_category>();
+   *
+   * // Range `r` associated with category id `42`
+   * nvtx3::thread_range r{cat};
+   * \endcode
+   *
+   * Uses the "construct on first use" idiom to safely ensure the `category`
+   * object is initialized exactly once. See
+   * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use
+   *
+   * @tparam C Type containing a member `C::name` that resolves to either a
+   * `char const*` or `wchar_t const*` and `C::id`.
+   */
+  template <typename C>
+  static named_category<D> const& get() noexcept {
+    static_assert(detail::has_name_member<C>(),
+                  "Type used to name a category must contain a name member.");
+    static named_category<D> const category{C::id, C::name};
+    return category;
+  }
+  /**
+   * @brief Construct a `category` with the specified `id` and `name`.
+   *
+   * The name `name` will be registered with `id`.
+   *
+   * Every unique value of `id` should only be named once.
+   *
+   * @param[in] id The category id to name
+   * @param[in] name The name to associate with `id`
+   */
+  named_category(id_type id, char const* name) noexcept : category{id} {
+    nvtxDomainNameCategoryA(domain::get<D>(), get_id(), name);
+  }
+
+  /**
+   * @brief Construct a `category` with the specified `id` and `name`.
+   *
+   * The name `name` will be registered with `id`.
+   *
+   * Every unique value of `id` should only be named once.
+   *
+   * @param[in] id The category id to name
+   * @param[in] name The name to associate with `id`
+   */
+  named_category(id_type id, wchar_t const* name) noexcept : category{id} {
+    nvtxDomainNameCategoryW(domain::get<D>(), get_id(), name);
+  }
+};
+
+/**
+ * @brief A message registered with NVTX.
+ *
+ * Normally, associating a `message` with an NVTX event requires copying the
+ * contents of the message string. This may cause non-trivial overhead in
+ * highly performance sensitive regions of code.
+ *
+ * message registration is an optimization to lower the overhead of
+ * associating a message with an NVTX event. Registering a message yields a
+ * handle that is inexpensive to copy that may be used in place of a message
+ * string.
+ *
+ * A particular message should only be registered once and the handle
+ * reused throughout the rest of the application. This can be done by either
+ * explicitly creating static `registered_message` objects, or using the
+ * `registered_message::get` construct on first use helper (recommended).
+ *
+ * Example:
+ * \code{.cpp}
+ * // Explicitly constructed, static `registered_message`
+ * static registered_message<my_domain> static_message{"message"};
+ *
+ * // "message" is associated with the range `r`
+ * nvtx3::thread_range r{static_message};
+ *
+ * // Or use construct on first use:
+ *
+ * // Define a type with a `message` member that defines the contents of the
+ * // registered message
+ * struct my_message{ static constexpr char const* message{ "my message" }; };
+ *
+ * // Uses construct on first use to register the contents of
+ * // `my_message::message`
+ * auto msg = registered_message<my_domain>::get<my_message>();
+ *
+ * // "my message" is associated with the range `r`
+ * nvtx3::thread_range r{msg};
+ * \endcode
+ *
+ * `registered_message`s are local to a particular domain specified via
+ * the type `D`.
+ *
+ * @tparam D Type containing `name` member used to identify the `domain` to
+ * which the `registered_message` belongs. Else, `domain::global` to  indicate
+ * that the global NVTX domain should be used.
+ */
+template <typename D = domain::global>
+class registered_message {
+ public:
+  /**
+   * @brief Returns a global instance of a `registered_message` as a function
+   * local static.
+   *
+   * Provides a convenient way to register a message with NVTX without having
+   * to explicitly register the message.
+   *
+   * Upon first invocation, constructs a `registered_message` whose contents
+   * are specified by `M::message`.
+   *
+   * All future invocations will return a reference to the object constructed
+   * in the first invocation.
+   *
+   * Example:
+   * \code{.cpp}
+   * // Define a type with a `message` member that defines the contents of the
+   * // registered message
+   * struct my_message{ static constexpr char const* message{ "my message" };
+   * };
+   *
+   * // Uses construct on first use to register the contents of
+   * // `my_message::message`
+   * auto msg = registered_message<my_domain>::get<my_message>();
+   *
+   * // "my message" is associated with the range `r`
+   * nvtx3::thread_range r{msg};
+   * \endcode
+   *
+   * @tparam M Type required to contain a member `M::message` that
+   * resolves to either a `char const*` or `wchar_t const*` used as the
+   * registered message's contents.
+   * @return Reference to a `registered_message` associated with the type `M`.
+   */
+  template <typename M>
+  static registered_message<D> const& get() noexcept {
+    static registered_message<D> const registered_message{M::message};
+    return registered_message;
+  }
+
+  /**
+   * @brief Constructs a `registered_message` from the specified `msg` string.
+   *
+   * Registers `msg` with NVTX and associates a handle with the registered
+   * message.
+   *
+   * A particular message should only be registered once and the handle
+   * reused throughout the rest of the application.
+   *
+   * @param msg The contents of the message
+   */
+  explicit registered_message(char const* msg) noexcept
+      : handle_{nvtxDomainRegisterStringA(domain::get<D>(), msg)} {}
+
+  /**
+   * @brief Constructs a `registered_message` from the specified `msg` string.
+   *
+   * Registers `msg` with NVTX and associates a handle with the registered
+   * message.
+   *
+   * A particular message should only be registered once and the handle
+   * reused throughout the rest of the application.
+   *
+   * @param msg The contents of the message
+   */
+  explicit registered_message(std::string const& msg) noexcept
+      : registered_message{msg.c_str()} {}
+
+  /**
+   * @brief Constructs a `registered_message` from the specified `msg` string.
+   *
+   * Registers `msg` with NVTX and associates a handle with the registered
+   * message.
+   *
+   * A particular message should only be registered once and the handle
+   * reused throughout the rest of the application.
+   *
+   * @param msg The contents of the message
+   */
+  explicit registered_message(wchar_t const* msg) noexcept
+      : handle_{nvtxDomainRegisterStringW(domain::get<D>(), msg)} {}
+
+  /**
+   * @brief Constructs a `registered_message` from the specified `msg` string.
+   *
+   * Registers `msg` with NVTX and associates a handle with the registered
+   * message.
+   *
+   * A particular message should only be registered once and the handle
+   * reused throughout the rest of the application.
+   *
+   * @param msg The contents of the message
+   */
+  explicit registered_message(std::wstring const& msg) noexcept
+      : registered_message{msg.c_str()} {}
+
+  /**
+   * @brief Returns the registered message's handle
+   * @return The handle stored when the message was registered
+   */
+  nvtxStringHandle_t get_handle() const noexcept { return handle_; }
+
+  registered_message() = delete;
+  ~registered_message() = default;
+  registered_message(registered_message const&) = default;
+  registered_message& operator=(registered_message const&) = default;
+  registered_message(registered_message&&) = default;
+  registered_message& operator=(registered_message&&) = default;
+
+ private:
+  // Non-const so that the defaulted copy/move assignment operators are usable.
+  nvtxStringHandle_t handle_{};  ///< Handle from registering with NVTX
+};
+
+/**
+ * @brief Allows associating a message string with an NVTX event via
+ * its `event_attributes`.
+ *
+ * Associating a `message` with an NVTX event through its `event_attributes`
+ * allows for naming events to easily differentiate them from other events.
+ *
+ * Every time an NVTX event is created with an associated `message`, the
+ * contents of the message string must be copied.  This may cause non-trivial
+ * overhead in highly performance sensitive sections of code. Use of a
+ * `nvtx3::registered_message` is recommended in these situations.
+ *
+ * Example:
+ * \code{.cpp}
+ * // Creates an `event_attributes` with message "message 0"
+ * nvtx3::event_attributes attr0{nvtx3::message{"message 0"}};
+ *
+ * // `range0` contains message "message 0"
+ * nvtx3::thread_range range0{attr0};
+ *
+ * // `std::string` and string literals are implicitly assumed to be
+ * // the contents of an `nvtx3::message`
+ * // Creates an `event_attributes` with message "message 1"
+ * nvtx3::event_attributes attr1{"message 1"};
+ *
+ * // `range1` contains message "message 1"
+ * nvtx3::thread_range range1{attr1};
+ *
+ * // `range2` contains message "message 2"
+ * nvtx3::thread_range range2{nvtx3::message{"message 2"}};
+ *
+ * // `std::string` and string literals are implicitly assumed to be
+ * // the contents of an `nvtx3::message`
+ * // `range3` contains message "message 3"
+ * nvtx3::thread_range range3{"message 3"};
+ * \endcode
+ */
+class message {
+ public:
+  /// NVTX union type that stores the message contents
+  using value_type = nvtxMessageValue_t;
+
+  /**
+   * @brief Creates an ASCII `message` that refers to the characters at `msg`.
+   *
+   * @param msg Pointer stored as the message contents; it is not copied
+   */
+  NVTX3_RELAXED_CONSTEXPR message(char const* msg) noexcept
+      : type_(NVTX_MESSAGE_TYPE_ASCII) {
+    value_.ascii = msg;
+  }
+
+  /**
+   * @brief Creates an ASCII `message` that refers to the buffer of `msg`.
+   *
+   * @param msg String whose `c_str()` pointer becomes the message contents
+   */
+  message(std::string const& msg) noexcept : message(msg.c_str()) {}
+
+  /**
+   * @brief Deleted: a `message` may not be built from a temporary
+   * `std::string`.
+   *
+   * A `message` only stores a pointer and never owns the characters, so
+   * binding to an r-value would leave that pointer dangling.
+   */
+  message(std::string&&) = delete;
+
+  /**
+   * @brief Creates a unicode `message` that refers to the characters at `msg`.
+   *
+   * @param msg Pointer stored as the message contents; it is not copied
+   */
+  NVTX3_RELAXED_CONSTEXPR message(wchar_t const* msg) noexcept
+      : type_(NVTX_MESSAGE_TYPE_UNICODE) {
+    value_.unicode = msg;
+  }
+
+  /**
+   * @brief Creates a unicode `message` that refers to the buffer of `msg`.
+   *
+   * @param msg String whose `c_str()` pointer becomes the message contents
+   */
+  message(std::wstring const& msg) noexcept : message(msg.c_str()) {}
+
+  /**
+   * @brief Deleted: a `message` may not be built from a temporary
+   * `std::wstring`.
+   *
+   * A `message` only stores a pointer and never owns the characters, so
+   * binding to an r-value would leave that pointer dangling.
+   */
+  message(std::wstring&&) = delete;
+
+  /**
+   * @brief Creates a `message` that refers to an already registered string.
+   *
+   * @tparam D Type containing `name` member used to identify the `domain`
+   * to which the `registered_message` belongs. Else, `domain::global` to
+   * indicate that the global NVTX domain should be used.
+   * @param msg Registered message whose handle becomes the contents
+   */
+  template <typename D>
+  NVTX3_RELAXED_CONSTEXPR message(registered_message<D> const& msg) noexcept
+      : type_(NVTX_MESSAGE_TYPE_REGISTERED) {
+    value_.registered = msg.get_handle();
+  }
+
+  /**
+   * @brief Provides the union that stores the message contents.
+   *
+   * @return Copy of the stored `value_type` union
+   */
+  NVTX3_RELAXED_CONSTEXPR value_type get_value() const noexcept {
+    return value_;
+  }
+
+  /**
+   * @brief Provides the tag identifying which union member is in use.
+   *
+   * @return The stored message type
+   */
+  NVTX3_RELAXED_CONSTEXPR nvtxMessageType_t get_type() const noexcept {
+    return type_;
+  }
+
+ private:
+  nvtxMessageType_t const type_{};  ///< Tag describing the stored contents
+  value_type value_{};              ///< Union holding the contents
+};
+
+/**
+ * @brief A numerical value that can be associated with an NVTX event via
+ * its `event_attributes`.
+ *
+ * Example:
+ * ```
+ * // Constructs an `event_attributes` whose payload is the `int32_t`
+ * // value 42
+ * nvtx3::event_attributes attr{nvtx3::payload{42}};
+ *
+ * // `range0` will have an int32_t payload of 42
+ * nvtx3::thread_range range0{attr};
+ *
+ * // range1 has double payload of 3.14
+ * nvtx3::thread_range range1{ nvtx3::payload{3.14} };
+ * ```
+ */
+class payload {
+ public:
+  /// NVTX union type that stores the payload value
+  using value_type = typename nvtxEventAttributes_v2::payload_t;
+
+  /**
+   * @brief Creates a `payload` holding a signed, 8 byte integer.
+   *
+   * @param value Number stored in the union's `llValue` member
+   */
+  NVTX3_RELAXED_CONSTEXPR explicit payload(int64_t value) noexcept
+      : type_(NVTX_PAYLOAD_TYPE_INT64), value_{} {
+    value_.llValue = value;
+  }
+
+  /**
+   * @brief Creates a `payload` holding a signed, 4 byte integer.
+   *
+   * @param value Number stored in the union's `iValue` member
+   */
+  NVTX3_RELAXED_CONSTEXPR explicit payload(int32_t value) noexcept
+      : type_(NVTX_PAYLOAD_TYPE_INT32), value_{} {
+    value_.iValue = value;
+  }
+
+  /**
+   * @brief Creates a `payload` holding an unsigned, 8 byte integer.
+   *
+   * @param value Number stored in the union's `ullValue` member
+   */
+  NVTX3_RELAXED_CONSTEXPR explicit payload(uint64_t value) noexcept
+      : type_(NVTX_PAYLOAD_TYPE_UNSIGNED_INT64), value_{} {
+    value_.ullValue = value;
+  }
+
+  /**
+   * @brief Creates a `payload` holding an unsigned, 4 byte integer.
+   *
+   * @param value Number stored in the union's `uiValue` member
+   */
+  NVTX3_RELAXED_CONSTEXPR explicit payload(uint32_t value) noexcept
+      : type_(NVTX_PAYLOAD_TYPE_UNSIGNED_INT32), value_{} {
+    value_.uiValue = value;
+  }
+
+  /**
+   * @brief Creates a `payload` holding a single-precision floating point.
+   *
+   * @param value Number stored in the union's `fValue` member
+   */
+  NVTX3_RELAXED_CONSTEXPR explicit payload(float value) noexcept
+      : type_(NVTX_PAYLOAD_TYPE_FLOAT), value_{} {
+    value_.fValue = value;
+  }
+
+  /**
+   * @brief Creates a `payload` holding a double-precision floating point
+   * number.
+   *
+   * @param value Number stored in the union's `dValue` member
+   */
+  NVTX3_RELAXED_CONSTEXPR explicit payload(double value) noexcept
+      : type_(NVTX_PAYLOAD_TYPE_DOUBLE), value_{} {
+    value_.dValue = value;
+  }
+
+  /**
+   * @brief Provides the union that stores the payload value.
+   * @return Copy of the stored `value_type` union
+   */
+  NVTX3_RELAXED_CONSTEXPR value_type get_value() const noexcept {
+    return value_;
+  }
+
+  /**
+   * @brief Provides the tag identifying which union member is in use.
+   * @return The stored payload type
+   */
+  NVTX3_RELAXED_CONSTEXPR nvtxPayloadType_t get_type() const noexcept {
+    return type_;
+  }
+
+ private:
+  nvtxPayloadType_t const type_;  ///< Tag describing the stored value
+  value_type value_;              ///< Union holding the stored value
+};
+
+/**
+ * @brief Describes the attributes of a NVTX event.
+ *
+ * NVTX events can be customized via four "attributes":
+ *
+ * - color:    color used to visualize the event in tools such as Nsight
+ *             Systems. See `color`.
+ * - message:  Custom message string. See `message`.
+ * - payload:  User-defined numerical value. See `payload`.
+ * - category: Intra-domain grouping. See `category`.
+ *
+ * These component attributes are specified via an `event_attributes` object.
+ * See `nvtx3::color`, `nvtx3::message`, `nvtx3::payload`, and
+ * `nvtx3::category` for how these individual attributes are constructed.
+ *
+ * While it is possible to specify all four attributes, it is common to want
+ * to only specify a subset of attributes and use default values for the
+ * others. For convenience, `event_attributes` can be constructed from any
+ * number of attribute components in any order.
+ *
+ * Example:
+ * \code{.cpp}
+ * event_attributes attr{}; // No arguments, use defaults for all attributes
+ *
+ * event_attributes attr{"message"}; // Custom message, rest defaulted
+ *
+ * // Custom color & message
+ * event_attributes attr{"message", nvtx3::rgb{127, 255, 0}};
+ *
+ * /// Custom color & message, can use any order of arguments
+ * event_attributes attr{nvtx3::rgb{127, 255, 0}, "message"};
+ *
+ *
+ * // Custom color, message, payload, category
+ * event_attributes attr{nvtx3::rgb{127, 255, 0},
+ *                      "message",
+ *                      nvtx3::payload{42},
+ *                      nvtx3::category{1}};
+ *
+ * // Custom color, message, payload, category, can use any order of arguments
+ * event_attributes attr{nvtx3::payload{42},
+ *                      nvtx3::category{1},
+ *                      "message",
+ *                      nvtx3::rgb{127, 255, 0}};
+ *
+ * // Multiple arguments of the same type are allowed, but only the first is
+ * // used. All others are ignored
+ * // In this example the payload is 42.
+ * event_attributes attr{ nvtx3::payload{42}, nvtx3::payload{7} };
+ *
+ * // Range `r` will be customized according the attributes in `attr`
+ * nvtx3::thread_range r{attr};
+ *
+ * // For convenience, the arguments that can be passed to the
+ * // `event_attributes` constructor may be passed to the
+ * // `domain_thread_range` constructor, where they will be forwarded to the
+ * // `event_attributes` constructor
+ * nvtx3::thread_range r{nvtx3::payload{42}, nvtx3::category{1}, "message"};
+ * \endcode
+ *
+ */
+class event_attributes {
+ public:
+  using value_type = nvtxEventAttributes_t;
+
+  /**
+   * @brief Default constructor creates an `event_attributes` with no
+   * category, color, payload, nor message.
+   *
+   * The initializers below are positional and must list every field of
+   * `nvtxEventAttributes_t` in declaration order, including `reserved0`.
+   */
+  constexpr event_attributes() noexcept
+      : attributes_{
+            NVTX_VERSION,                   // version
+            sizeof(nvtxEventAttributes_t),  // size
+            0,                              // category
+            NVTX_COLOR_UNKNOWN,             // color type
+            0,                              // color value
+            NVTX_PAYLOAD_UNKNOWN,           // payload type
+            0,                              // reserved 4B
+            0,                              // payload value (union)
+            NVTX_MESSAGE_UNKNOWN,           // message type
+            0                               // message value (union)
+        } {}
+
+  /**
+   * @brief Variadic constructor where the first argument is a `category`.
+   *
+   * Sets the value of the `event_attributes` category based on `c` and
+   * forwards the remaining variadic parameter pack to the next constructor.
+   */
+  template <typename... Args>
+  NVTX3_RELAXED_CONSTEXPR explicit event_attributes(
+      category const& c, Args const&... args) noexcept
+      : event_attributes(args...) {
+    attributes_.category = c.get_id();
+  }
+
+  /**
+   * @brief Variadic constructor where the first argument is a `color`.
+   *
+   * Sets the value of the `event_attributes` color based on `c` and forwards
+   * the remaining variadic parameter pack to the next constructor.
+   */
+  template <typename... Args>
+  NVTX3_RELAXED_CONSTEXPR explicit event_attributes(
+      color const& c, Args const&... args) noexcept
+      : event_attributes(args...) {
+    attributes_.color = c.get_value();
+    attributes_.colorType = c.get_type();
+  }
+
+  /**
+   * @brief Variadic constructor where the first argument is a `payload`.
+   *
+   * Sets the value of the `event_attributes` payload based on `p` and
+   * forwards the remaining variadic parameter pack to the next constructor.
+   */
+  template <typename... Args>
+  NVTX3_RELAXED_CONSTEXPR explicit event_attributes(
+      payload const& p, Args const&... args) noexcept
+      : event_attributes(args...) {
+    attributes_.payload = p.get_value();
+    attributes_.payloadType = p.get_type();
+  }
+
+  /**
+   * @brief Variadic constructor where the first argument is a `message`.
+   *
+   * Sets the value of the `event_attributes` message based on `m` and
+   * forwards the remaining variadic parameter pack to the next constructor.
+   */
+  template <typename... Args>
+  NVTX3_RELAXED_CONSTEXPR explicit event_attributes(
+      message const& m, Args const&... args) noexcept
+      : event_attributes(args...) {
+    attributes_.message = m.get_value();
+    attributes_.messageType = m.get_type();
+  }
+
+  ~event_attributes() = default;
+  event_attributes(event_attributes const&) = default;
+  event_attributes& operator=(event_attributes const&) = default;
+  event_attributes(event_attributes&&) = default;
+  event_attributes& operator=(event_attributes&&) = default;
+
+  /**
+   * @brief Get raw pointer to underlying NVTX attributes object.
+   * @return Address of the `value_type` structure stored in this object
+   */
+  constexpr value_type const* get() const noexcept { return &attributes_; }
+
+ private:
+  value_type attributes_{};  ///< The NVTX attributes structure
+};
+
+/**
+ * @brief A RAII object for creating a NVTX range local to a thread within a
+ * domain.
+ *
+ * When constructed, begins a nested NVTX range on the calling thread in the
+ * specified domain. Upon destruction, ends the NVTX range.
+ *
+ * Behavior is undefined if a `domain_thread_range` object is
+ * created/destroyed on different threads.
+ *
+ * `domain_thread_range` is neither moveable nor copyable.
+ *
+ * `domain_thread_range`s may be nested within other ranges.
+ *
+ * The domain of the range is specified by the template type parameter `D`.
+ * By default, the `domain::global` is used, which scopes the range to the
+ * global NVTX domain. The convenience alias `thread_range` is provided for
+ * ranges scoped to the global domain.
+ *
+ * A custom domain can be defined by creating a type, `D`, with a static
+ * member `D::name` whose value is used to name the domain associated with
+ * `D`. `D::name` must resolve to either `char const*` or `wchar_t const*`
+ *
+ * Example:
+ * ```
+ * // Define a type `my_domain` with a member `name` used to name the domain
+ * // associated with the type `my_domain`.
+ * struct my_domain{
+ *    static constexpr const char * name{"my domain"};
+ * };
+ * ```
+ *
+ * Usage:
+ * ```
+ * nvtx3::domain_thread_range<> r0{"range 0"}; // Range in global domain
+ *
+ * nvtx3::thread_range r1{"range 1"}; // Alias for range in global domain
+ *
+ * // Range in custom domain
+ * nvtx3::domain_thread_range<my_domain> r2{"range 2"};
+ *
+ * // specify an alias to a range that uses a custom domain
+ * using my_thread_range = nvtx3::domain_thread_range<my_domain>;
+ *
+ * my_thread_range r3{"range 3"}; // Alias for range in custom domain
+ * ```
+ */
+template <class D = domain::global>
+class domain_thread_range {
+ public:
+  /**
+   * @brief Construct a `domain_thread_range` with the specified
+   * `event_attributes`
+   *
+   * Example:
+   * ```
+   * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}};
+   * nvtx3::domain_thread_range<> range{attr}; // Creates a range with message
+   * contents
+   *                                    // "msg" and green color
+   * ```
+   *
+   * @param[in] attr `event_attributes` that describes the desired attributes
+   * of the range.
+   */
+  explicit domain_thread_range(event_attributes const& attr) noexcept {
+    nvtxDomainRangePushEx(domain::get<D>(), attr.get());
+  }
+
+  /**
+   * @brief Constructs a `domain_thread_range` from the constructor arguments
+   * of an `event_attributes`.
+   *
+   * Forwards the arguments `first, args...` to construct an
+   * `event_attributes` object. The `event_attributes` object is then
+   * associated with the `domain_thread_range`.
+   *
+   * For more detail, see `event_attributes` documentation.
+   *
+   * Example:
+   * ```
+   * // Creates a range with message "message" and green color
+   * nvtx3::domain_thread_range<> r{"message", nvtx3::rgb{127,255,0}};
+   * ```
+   *
+   * @note To prevent making needless copies of `event_attributes` objects,
+   * this constructor is disabled when the first argument is an
+   * `event_attributes` object, instead preferring the explicit
+   * `domain_thread_range(event_attributes const&)` constructor.
+   *
+   * @param[in] first First argument to forward to the `event_attributes`
+   * constructor.
+   * @param[in] args Variadic parameter pack of additional arguments to
+   * forward.
+   *
+   */
+  template <typename First, typename... Args,
+            typename = typename std::enable_if<not std::is_same<
+                event_attributes, typename std::decay<First>::type>::value>::type>
+  explicit domain_thread_range(First const& first, Args const&... args) noexcept
+      : domain_thread_range{event_attributes{first, args...}} {}
+
+  /**
+   * @brief Default constructor creates a `domain_thread_range` with no
+   * message, color, payload, nor category.
+   *
+   */
+  domain_thread_range() : domain_thread_range{event_attributes{}} {}
+
+  domain_thread_range(domain_thread_range const&) = delete;
+  domain_thread_range& operator=(domain_thread_range const&) = delete;
+  domain_thread_range(domain_thread_range&&) = delete;
+  domain_thread_range& operator=(domain_thread_range&&) = delete;
+
+  /**
+   * @brief Destroy the domain_thread_range, ending the NVTX range event.
+   */
+  ~domain_thread_range() noexcept { nvtxDomainRangePop(domain::get<D>()); }
+};
+
+/**
+ * @brief Alias for a `domain_thread_range` in the global NVTX domain.
+ *
+ */
+using thread_range = domain_thread_range<>;
+
+/**
+ * @brief Handle used for correlating explicit range start and end events.
+ *
+ */
+struct range_handle {
+  /// Type used for the handle's value
+  using value_type = nvtxRangeId_t;
+
+  /**
+   * @brief Construct a `range_handle` from the given id.
+   *
+   */
+  constexpr range_handle(value_type id) noexcept : _range_id{id} {}
+
+  /**
+   * @brief Returns the `range_handle`'s value
+   *
+   * @return value_type The handle's value
+   */
+  constexpr value_type get_value() const noexcept { return _range_id; }
+
+private:
+  value_type _range_id{}; ///< The underlying NVTX range id
+};
+
+/**
+ * @brief Manually begin an NVTX range.
+ *
+ * Explicitly begins an NVTX range and returns a unique handle. To end the
+ * range, pass the handle to `end_range()`.
+ *
+ * `start_range/end_range` are the most explicit and lowest level APIs provided
+ * for creating ranges.  Use of `nvtx3::domain_process_range` should be
+ * preferred unless one is unable to tie the range to the lifetime of an object.
+ *
+ * Example:
+ * ```
+ * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}};
+ * nvtx3::range_handle h = nvtx3::start_range(attr); // Manually begins a range
+ * ...
+ * nvtx3::end_range(h); // Ends the range
+ * ```
+ *
+ * @tparam D Type containing `name` member used to identify the `domain`
+ * to which the range belongs. Else, `domain::global` to indicate that the
+ * global NVTX domain should be used.
+ * @param[in] attr `event_attributes` that describes the desired attributes
+ * of the range.
+ * @return Unique handle to be passed to `end_range` to end the range.
+ */
+template <typename D = domain::global>
+range_handle start_range(event_attributes const &attr) noexcept {
+  return range_handle{nvtxDomainRangeStartEx(domain::get<D>(), attr.get())};
+}
+
+/**
+ * @brief Manually begin an NVTX range.
+ *
+ * Explicitly begins an NVTX range and returns a unique handle. To end the
+ * range, pass the handle to `end_range()`.
+ *
+ * Forwards the arguments `first, args...` to construct an  `event_attributes`
+ * object. The `event_attributes` object is then  associated with the range.
+ *
+ * For more detail, see `event_attributes` documentation.
+ *
+ * Example:
+ * ```
+ * // Begin range
+ * nvtx3::range_handle h = nvtx3::start_range("msg", nvtx3::rgb{127,255,0});
+ * ...
+ * nvtx3::end_range(h); // Ends the range
+ * ```
+ *
+ * `start_range/end_range` are the most explicit and lowest level APIs provided
+ * for creating ranges.  Use of `nvtx3::domain_process_range` should be
+ * preferred unless one is unable to tie the range to the lifetime of an object.
+ *
+ * @param[in] first First argument to pass to an `event_attributes`
+ * @param[in] args Variadic parameter pack of the rest of the arguments for an
+ * `event_attributes`.
+ * @return Unique handle to be passed to `end_range` to end the range.
+ */
+template <typename First, typename... Args,
+          typename = typename std::enable_if<not std::is_same<
+              event_attributes, typename std::decay<First>::type>::value>::type>
+range_handle start_range(First const &first, Args const &... args) noexcept {
+  return start_range(event_attributes{first, args...});  // `D` defaults to domain::global
+}
+
+/**
+ * @brief Manually end the range associated with the handle `r`.
+ *
+ * Ends the NVTX range started by a prior call to `start_range`. The range may
+ * end on a different thread from where it began.
+ *
+ * @tparam D Domain the range was started in (must match `start_range<D>`).
+ * @param r Handle to a range started by a prior call to `start_range`.
+ */
+template <typename D = domain::global>
+inline void end_range(range_handle r) noexcept {
+  nvtxDomainRangeEnd(domain::get<D>(), r.get_value());
+}
+
+/**
+ * @brief A RAII object for creating a NVTX range within a domain that can
+ * be created and destroyed on different threads.
+ *
+ * When constructed, begins a NVTX range in the specified domain. Upon
+ * destruction, ends the NVTX range.
+ *
+ * Similar to `nvtx3::domain_thread_range`, the only difference being that
+ * `domain_process_range` can start and end on different threads.
+ *
+ * Use of `nvtx3::domain_thread_range` should be preferred unless one needs
+ * the ability to start and end a range on different threads.
+ *
+ * `domain_process_range` is moveable, but not copyable.
+ *
+ * @tparam D Type containing `name` member used to identify the `domain`
+ * to which the `domain_process_range` belongs. Else, `domain::global` to
+ * indicate that the global NVTX domain should be used.
+ */
+template <typename D = domain::global> class domain_process_range {
+ public:
+  /**
+   * @brief Begins an NVTX range in domain `D` described by `attr`.
+   *
+   * @param[in] attr `event_attributes` describing the range.
+   */
+  explicit domain_process_range(event_attributes const &attr) noexcept
+      : handle_{start_range<D>(attr)} {}
+
+  /**
+   * @brief Begins an NVTX range from the constructor arguments of an
+   * `event_attributes`; disabled when `First` is `event_attributes` itself.
+   * @param[in] first First argument forwarded to `event_attributes`.
+   * @param[in] args Remaining arguments forwarded to `event_attributes`.
+   */
+  template <typename First, typename... Args,
+            typename = typename std::enable_if<not std::is_same<
+                event_attributes, typename std::decay<First>::type>::value>::type>
+  explicit domain_process_range(First const &first,
+                                Args const &... args) noexcept
+      : domain_process_range{event_attributes{first, args...}} {}
+
+  /**
+   * @brief Begins an NVTX range with default-constructed `event_attributes`.
+   * Not `constexpr`: it delegates to a constructor that starts a range.
+   */
+  domain_process_range() noexcept
+      : domain_process_range{event_attributes{}} {}
+
+  /**
+   * @brief Ends the range in domain `D`, unless this object no longer owns
+   * one because it was moved from.
+   */
+  ~domain_process_range() noexcept {
+    if (not moved_from_) { nvtxDomainRangeEnd(domain::get<D>(), handle_.get_value()); }
+  }
+
+  /**
+   * @brief Move constructor takes ownership of the NVTX range from another
+   * `domain_process_range`, which will then no longer end it.
+   *
+   * @param other Range to take ownership from.
+   */
+  domain_process_range(domain_process_range &&other) noexcept
+      : handle_{other.handle_}, moved_from_{other.moved_from_} {
+    other.moved_from_ = true;
+  }
+
+  /**
+   * @brief Move assignment ends the range currently owned by `*this` (if any)
+   * and then takes ownership of the range owned by `other`.
+   *
+   * @param other Range to take ownership from.
+   * @return Reference to `*this`.
+   */
+  domain_process_range &operator=(domain_process_range &&other) noexcept {
+    if (this != &other) {
+      if (not moved_from_) { nvtxDomainRangeEnd(domain::get<D>(), handle_.get_value()); }
+      handle_           = other.handle_;
+      moved_from_       = other.moved_from_;
+      other.moved_from_ = true;
+    }
+    return *this;
+  }
+
+  /// Copy construction is not allowed to prevent multiple objects from owning
+  /// the same range handle
+  domain_process_range(domain_process_range const &) = delete;
+
+  /// Copy assignment is not allowed to prevent multiple objects from owning the
+  /// same range handle
+  domain_process_range &operator=(domain_process_range const &) = delete;
+
+ private:
+  range_handle handle_;     ///< Handle correlating the start/end of the range
+  bool moved_from_{false};  ///< True once ownership of the range has been moved
+                            ///< away, so this object must not end the range.
+};
+
+/**
+ * @brief Alias for a `domain_process_range` in the global NVTX domain.
+ *
+ */
+using process_range = domain_process_range<>;
+
+/**
+ * @brief Annotates an instantaneous point in time with the attributes specified
+ * by `attr`.
+ *
+ * Unlike a "range", a mark is an instantaneous event in an application, e.g.,
+ * locking/unlocking a mutex.
+ *
+ * \code{.cpp}
+ * std::mutex global_lock;
+ * void lock_mutex(){
+ *    global_lock.lock();
+ *    nvtx3::mark("lock_mutex");
+ * }
+ * \endcode
+ *
+ * @tparam D Type containing `name` member used to identify the `domain`
+ * to which the mark belongs. Else, `domain::global` to
+ * indicate that the global NVTX domain should be used.
+ * @param[in] attr `event_attributes` that describes the desired attributes
+ * of the mark.
+ */
+template <typename D = nvtx3::domain::global>
+inline void mark(event_attributes const& attr) noexcept {
+  nvtxDomainMarkEx(domain::get<D>(), attr.get());
+}
+
+}  // namespace nvtx3
+
+/**
+ * @brief Convenience macro for generating a range in the specified `domain`
+ * from the lifetime of a function
+ *
+ * This macro is useful for generating an NVTX range in `domain` from
+ * the entry point of a function to its exit. It is intended to be the first
+ * line of the function.
+ *
+ * Constructs a static `registered_message` using the name of the immediately
+ * enclosing function returned by `__func__` and constructs a
+ * `nvtx3::domain_thread_range<D>` using the registered function name as the range's
+ * message.
+ *
+ * Example:
+ * ```
+ * struct my_domain{static constexpr char const* name{"my_domain"};};
+ *
+ * void foo(...){
+ *    NVTX3_FUNC_RANGE_IN(my_domain); // Range begins on entry to foo()
+ *    // do stuff
+ *    ...
+ * } // Range ends on return from foo()
+ * ```
+ *
+ * @param[in] D Type containing `name` member used to identify the
+ * `domain` to which the `registered_message` belongs. Else,
+ * `domain::global` to  indicate that the global NVTX domain should be used.
+ */
+#define NVTX3_FUNC_RANGE_IN(D)                                                 \
+  static ::nvtx3::registered_message<D> const nvtx3_func_name__{__func__};     \
+  static ::nvtx3::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \
+  ::nvtx3::domain_thread_range<D> const nvtx3_range__{nvtx3_func_attr__};
+
+/**
+ * @brief Convenience macro for generating a range in the global domain from the
+ * lifetime of a function.
+ *
+ * This macro is useful for generating an NVTX range in the global domain from
+ * the entry point of a function to its exit. It is intended to be the first
+ * line of the function.
+ *
+ * Constructs a static `registered_message` using the name of the immediately
+ * enclosing function returned by `__func__` and constructs a
+ * `nvtx3::thread_range` using the registered function name as the range's
+ * message.
+ *
+ * Example:
+ * ```
+ * void foo(...){
+ *    NVTX3_FUNC_RANGE(); // Range begins on entry to foo()
+ *    // do stuff
+ *    ...
+ * } // Range ends on return from foo()
+ * ```
+ */
+#define NVTX3_FUNC_RANGE() NVTX3_FUNC_RANGE_IN(::nvtx3::domain::global)
\ No newline at end of file
diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index 267910b43..01399a610 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -99,14 +99,15 @@ class dynamic_map {
   static_assert(std::is_arithmetic<Key>::value, "Unsupported, non-arithmetic key type.");
 
  public:
-  using value_type      = cuco::pair_type<Key, Value>;       ///< Type of key/value pairs
-  using key_type        = Key;                               ///< Key type
-  using mapped_type     = Value;                             ///< Type of mapped values
-  using atomic_ctr_type = cuda::atomic<std::size_t, Scope>;  ///< Type of atomic counters
-  using view_type = typename static_map<Key, Value, Scope>::device_view;  ///< Device view type
-  using mutable_view_type = typename static_map<Key, Value, Scope>::device_mutable_view;
-  ///< Device mutable view type
-
+  using value_type                = cuco::pair_type<Key, Value>;
+  using key_type                  = Key;
+  using mapped_type               = Value;
+  using atomic_ctr_type           = cuda::atomic<std::size_t, Scope>;
+  using view_type                 = typename static_map<Key, Value, Scope>::device_view;
+  using mutable_view_type         = typename static_map<Key, Value, Scope>::device_mutable_view;
+  using counter_allocator_type =
+    typename std::allocator_traits<Allocator>::rebind_alloc<atomic_ctr_type>;
+  
   dynamic_map(dynamic_map const&) = delete;
   dynamic_map(dynamic_map&&)      = delete;
 
@@ -286,6 +287,7 @@ class dynamic_map {
   std::size_t min_insert_size_{};   ///< min remaining capacity of submap for insert
   atomic_ctr_type* num_successes_;  ///< number of successfully inserted keys on insert
   Allocator alloc_{};  ///< Allocator passed to submaps to allocate their device storage
+  counter_allocator_type counter_allocator_{};  ///< Allocator used to allocate `num_successes_`
 
   std::vector<atomic_ctr_type*> submap_num_successes_;
 };

From 7eac9d1ffe79ccf5256aebfe60523db1c223e702 Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@nvidia.com>
Date: Wed, 6 Apr 2022 10:46:25 -0700
Subject: [PATCH 09/36] more efficient block reduce

---
 benchmarks/hash_table/dynamic_map_bench.cu  | 111 +++++++++++++++++++-
 include/cuco/detail/dynamic_map.inl         |   2 +-
 include/cuco/detail/dynamic_map_kernels.cuh |   8 +-
 include/cuco/dynamic_map.cuh                |   5 +-
 4 files changed, 117 insertions(+), 9 deletions(-)

diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu
index c0306f901..e6c29ede6 100644
--- a/benchmarks/hash_table/dynamic_map_bench.cu
+++ b/benchmarks/hash_table/dynamic_map_bench.cu
@@ -57,7 +57,7 @@ static void generate_keys(OutputIt output_begin, OutputIt output_end)
 
 static void gen_final_size(benchmark::internal::Benchmark* b)
 {
-  for (auto size = 10'000'000; size <= 10'000'000; size += 20'000'000) {
+  for (auto size = 10'000'000; size <= 310'000'000; size += 20'000'000) {
     b->Args({size});
   }
 }
@@ -142,6 +142,43 @@ static void BM_dynamic_search_all(::benchmark::State& state)
                           int64_t(state.range(0)));
 }
 
+template <typename Key, typename Value, dist_type Dist>
+static void BM_dynamic_search_none(::benchmark::State& state)
+{
+  using map_type = cuco::dynamic_map<Key, Value>;
+
+  std::size_t num_keys     = state.range(0);
+  std::size_t initial_size = 1 << 27;
+
+  std::vector<Key> h_keys(num_keys);
+  std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);
+
+  generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
+
+  for (auto i = 0; i < num_keys; ++i) {
+    Key key           = h_keys[i] + num_keys;
+    Value val         = h_keys[i] + num_keys;
+    h_pairs[i].first  = key;
+    h_pairs[i].second = val;
+  }
+
+  thrust::device_vector<Key> d_keys(h_keys);
+  thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
+  thrust::device_vector<Value> d_results(num_keys);
+
+  map_type map{initial_size, 
+    cuco::sentinel::empty_key<Key>{-1}, cuco::sentinel::empty_value<Value>{-1}};
+  map.insert(d_pairs.begin(), d_pairs.end());
+
+  for (auto _ : state) {
+    cuda_event_timer raii{state};
+    map.find(d_keys.begin(), d_keys.end(), d_results.begin());
+  }
+
+  state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
+                          int64_t(state.range(0)));
+}
+
 template <typename Key, typename Value, dist_type Dist>
 static void BM_dynamic_erase_all(::benchmark::State& state)
 {
@@ -186,21 +223,66 @@ static void BM_dynamic_erase_all(::benchmark::State& state)
                           int64_t(state.range(0)));
 }
 
+template <typename Key, typename Value, dist_type Dist>
+static void BM_dynamic_erase_none(::benchmark::State& state)
+{
+  using map_type = cuco::dynamic_map<Key, Value>;
+
+  std::size_t num_keys     = state.range(0);
+  std::size_t initial_size = 1 << 27;
+
+  std::vector<Key> h_keys(num_keys);
+  std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);
+
+  generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
+
+  for (auto i = 0; i < num_keys; ++i) {
+    Key key           = h_keys[i] + num_keys;
+    Value val         = h_keys[i] + num_keys;
+    h_pairs[i].first  = key;
+    h_pairs[i].second = val;
+  }
+
+  thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
+  thrust::device_vector<Key> d_keys(h_keys);
+
+  std::size_t batch_size = 1E6;
+  for (auto _ : state) {
+    map_type map{initial_size, 
+      cuco::sentinel::empty_key<Key>{-1}, 
+      cuco::sentinel::empty_value<Value>{-1},
+      cuco::sentinel::erased_key<Key>{-2}};
+    for (auto i = 0; i < num_keys; i += batch_size) {
+      map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size);
+    }
+    {
+      cuda_event_timer raii{state};
+      map.erase(d_keys.begin(), d_keys.end());
+      //for (auto i = 0; i < num_keys; i += batch_size) {
+      //  map.erase(d_keys.begin() + i, d_keys.begin() + i + batch_size);
+      //}
+    }
+  }
+
+  state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
+                          int64_t(state.range(0)));
+}
+/*
 BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
-/*
+
 BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
 */
+
 BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
-
 /*
 BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM)
   ->Unit(benchmark::kMillisecond)
@@ -221,7 +303,12 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::GAUSSIAN)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
-
+*/
+BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::GAUSSIAN)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+/*
 BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
@@ -231,6 +318,22 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
+*/
+
+BENCHMARK_TEMPLATE(BM_dynamic_erase_none, int32_t, int32_t, dist_type::UNIQUE)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+/*
+BENCHMARK_TEMPLATE(BM_dynamic_search_none, int32_t, int32_t, dist_type::UNIQUE)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+/*
+BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIQUE)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
 
 BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIFORM)
   ->Unit(benchmark::kMillisecond)
diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index 2bb1459bc..7b3ac6ec1 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -220,7 +220,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
       submaps_.size(),
       hash,
       key_equal);
-      
+
   std::size_t h_num_successes;
   CUCO_CUDA_TRY(cudaMemcpy(
     &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost));
diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
index c5605d463..fcfa8c921 100644
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
@@ -234,12 +234,16 @@ __global__ void erase(InputIt first,
   }
 
   std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
-  if (threadIdx.x == 0) { *num_successes += block_num_successes; }
+  if (threadIdx.x == 0) {
+    num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed);
+  }
 
   // update submap thread counts
   for(int i = 0; i < num_submaps; ++i) {
     std::size_t submap_block_num_successes = BlockReduce(temp_submap_storage[i]).Sum(submap_thread_num_successes[i]);
-    if(threadIdx.x == 0) {*submap_num_successes[i] += submap_block_num_successes; }
+    if(threadIdx.x == 0) {
+      submap_num_successes[i]->fetch_add(submap_block_num_successes, cuda::std::memory_order_relaxed);
+    }
   }
 }
 
diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index 01399a610..7ac06d61a 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -16,11 +16,12 @@
 
 #pragma once
 
-#include <cuco/detail/dynamic_map_kernels.cuh>
+
 #include <cuco/detail/error.hpp>
 #include <cuco/sentinel.cuh>
 #include <cuco/static_map.cuh>
-
+#include <cuda/std/atomic>
+#include <cuco/detail/dynamic_map_kernels.cuh>
 #include <thrust/device_vector.h>
 #include <thrust/functional.h>
 

From 4d10631e2d2cc14136d8a497b7f197b911a0f32d Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@nvidia.com>
Date: Wed, 6 Apr 2022 11:46:44 -0700
Subject: [PATCH 10/36] doc changes

---
 benchmarks/hash_table/dynamic_map_bench.cu  | 20 ++++++++++----------
 include/cuco/detail/dynamic_map.inl         | 12 ++++++------
 include/cuco/detail/dynamic_map_kernels.cuh |  3 +++
 include/cuco/dynamic_map.cuh                |  3 +--
 4 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu
index e6c29ede6..46eb0aa18 100644
--- a/benchmarks/hash_table/dynamic_map_bench.cu
+++ b/benchmarks/hash_table/dynamic_map_bench.cu
@@ -258,9 +258,6 @@ static void BM_dynamic_erase_none(::benchmark::State& state)
     {
       cuda_event_timer raii{state};
       map.erase(d_keys.begin(), d_keys.end());
-      //for (auto i = 0; i < num_keys; i += batch_size) {
-      //  map.erase(d_keys.begin() + i, d_keys.begin() + i + batch_size);
-      //}
     }
   }
 
@@ -277,13 +274,12 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
-*/
 
 BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
-/*
+
 BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
@@ -303,12 +299,12 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::GAUSSIAN)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
-*/
+
 BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::GAUSSIAN)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
-/*
+
 BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
@@ -318,14 +314,18 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
-*/
 
 BENCHMARK_TEMPLATE(BM_dynamic_erase_none, int32_t, int32_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
-/*
-BENCHMARK_TEMPLATE(BM_dynamic_search_none, int32_t, int32_t, dist_type::UNIQUE)
+
+BENCHMARK_TEMPLATE(BM_dynamic_erase_none, int32_t, int32_t, dist_type::GAUSSIAN)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+*/
+BENCHMARK_TEMPLATE(BM_dynamic_search_none, int32_t, int32_t, dist_type::GAUSSIAN)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index 7b3ac6ec1..d0d4f0ba7 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -14,8 +14,6 @@
  * limitations under the License.
  */
 
-//#include "nvtx3.hpp"
-
 namespace cuco {
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -40,7 +38,6 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(std::size_t initial_capac
     alloc));
   submap_views_.push_back(submaps_[0]->get_device_view());
   submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
-
   submap_num_successes_.push_back(submaps_[0]->get_num_successes());
   
   num_successes_ = std::allocator_traits<counter_allocator_type>::allocate(counter_allocator_, 1);
@@ -130,8 +127,6 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(InputIt first,
                                                        Hash hash,
                                                        KeyEqual key_equal)
 {
-  //nvtx3::thread_range r{"insert"};
-
   std::size_t num_to_insert = std::distance(first, last);
 
   reserve(size_ + num_to_insert);
@@ -185,7 +180,6 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
                                                        Hash hash,
                                                        KeyEqual key_equal)
 {
-  //nvtx3::thread_range r{"erase"};
   std::size_t num_keys = std::distance(first, last);
 
   auto const block_size = 128;
@@ -197,15 +191,18 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
   static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
   CUCO_CUDA_TRY(cudaMemset(num_successes_, 0, sizeof(atomic_ctr_type)));
   
+  // zero out submap success counters
   static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
   for(int i = 0; i < submaps_.size(); ++i) {
     CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type)));
   }
   
   // TODO: hacky, improve this
+  // provide device-accessible vector for each submap num_successes variable
   thrust::device_vector<atomic_ctr_type*> d_submap_num_successes(submap_num_successes_);
 
   // TODO: hack (how to get size on host?)
+  // use dynamic shared memory to hold block reduce space for each submap's erases
   constexpr size_t temp_storage_size_one_block = 48;
   auto const temp_storage_size = submaps_.size() * temp_storage_size_one_block;
       
@@ -221,11 +218,14 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
       hash,
       key_equal);
 
+  // update total dynamic map size
   std::size_t h_num_successes;
   CUCO_CUDA_TRY(cudaMemcpy(
     &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost));
   size_ -= h_num_successes;
   
+  // TODO: if only one submap, skip this step
+  // update each submap's size
   for(int i = 0; i < submaps_.size(); ++i) {
     std::size_t h_submap_num_successes;
     CUCO_CUDA_TRY(cudaMemcpy(
diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
index fcfa8c921..e54b51586 100644
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
@@ -210,6 +210,8 @@ __global__ void erase(InputIt first,
   __shared__ typename BlockReduce::TempStorage temp_storage;
 
   std::size_t thread_num_successes = 0;
+
+  // TODO: find permanent solution (only works for four submaps)
   std::size_t submap_thread_num_successes[4] = {0, 0, 0, 0};
 
   auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
@@ -238,6 +240,7 @@ __global__ void erase(InputIt first,
     num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed);
   }
 
+  // TODO: if there's only one submap, skip this step
   // update submap thread counts
   for(int i = 0; i < num_submaps; ++i) {
     std::size_t submap_block_num_successes = BlockReduce(temp_submap_storage[i]).Sum(submap_thread_num_successes[i]);
diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index 7ac06d61a..977b00de0 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -287,10 +287,9 @@ class dynamic_map {
     submap_mutable_views_;          ///< vector of mutable device views for each submap
   std::size_t min_insert_size_{};   ///< min remaining capacity of submap for insert
   atomic_ctr_type* num_successes_;  ///< number of successfully inserted keys on insert
+  std::vector<atomic_ctr_type*> submap_num_successes_; ///< number of succesfully erased keys for each submap
   Allocator alloc_{};  ///< Allocator passed to submaps to allocate their device storage
   counter_allocator_type counter_allocator_{};  ///< Allocator used to allocate `num_successes_`
-
-  std::vector<atomic_ctr_type*> submap_num_successes_;
 };
 }  // namespace cuco
 

From b59a16b83b84497262546e7ba0215c042b703c1f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 6 Apr 2022 18:52:40 +0000
Subject: [PATCH 11/36] [pre-commit.ci] auto code formatting

---
 benchmarks/hash_table/dynamic_map_bench.cu  |  29 +-
 benchmarks/hash_table/static_map_bench.cu   |  15 +-
 include/cuco/detail/dynamic_map.inl         |  76 ++---
 include/cuco/detail/dynamic_map_kernels.cuh |  28 +-
 include/cuco/detail/nvtx3.hpp               | 294 +++++++++++---------
 include/cuco/dynamic_map.cuh                |  45 +--
 include/cuco/static_map.cuh                 |   5 +-
 tests/dynamic_map/erase_test.cu             |  77 +++--
 8 files changed, 295 insertions(+), 274 deletions(-)

diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu
index 46eb0aa18..222699abb 100644
--- a/benchmarks/hash_table/dynamic_map_bench.cu
+++ b/benchmarks/hash_table/dynamic_map_bench.cu
@@ -86,13 +86,8 @@ static void BM_dynamic_insert(::benchmark::State& state)
 
   std::size_t batch_size = 1E6;
   for (auto _ : state) {
-<<<<<<< HEAD
     map_type map{
       initial_size, cuco::sentinel::empty_key<Key>{-1}, cuco::sentinel::empty_value<Value>{-1}};
-=======
-    map_type map{initial_size, 
-      cuco::sentinel::empty_key<Key>{-1}, cuco::sentinel::empty_value<Value>{-1}};
->>>>>>> prevent implicit type conversion of sentinels during construction
     {
       cuda_event_timer raii{state};
       for (std::size_t i = 0; i < num_keys; i += batch_size) {
@@ -129,8 +124,8 @@ static void BM_dynamic_search_all(::benchmark::State& state)
   thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
   thrust::device_vector<Value> d_results(num_keys);
 
-  map_type map{initial_size, 
-    cuco::sentinel::empty_key<Key>{-1}, cuco::sentinel::empty_value<Value>{-1}};
+  map_type map{
+    initial_size, cuco::sentinel::empty_key<Key>{-1}, cuco::sentinel::empty_value<Value>{-1}};
   map.insert(d_pairs.begin(), d_pairs.end());
 
   for (auto _ : state) {
@@ -166,8 +161,8 @@ static void BM_dynamic_search_none(::benchmark::State& state)
   thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
   thrust::device_vector<Value> d_results(num_keys);
 
-  map_type map{initial_size, 
-    cuco::sentinel::empty_key<Key>{-1}, cuco::sentinel::empty_value<Value>{-1}};
+  map_type map{
+    initial_size, cuco::sentinel::empty_key<Key>{-1}, cuco::sentinel::empty_value<Value>{-1}};
   map.insert(d_pairs.begin(), d_pairs.end());
 
   for (auto _ : state) {
@@ -204,10 +199,10 @@ static void BM_dynamic_erase_all(::benchmark::State& state)
 
   std::size_t batch_size = 1E6;
   for (auto _ : state) {
-    map_type map{initial_size, 
-      cuco::sentinel::empty_key<Key>{-1}, 
-      cuco::sentinel::empty_value<Value>{-1},
-      cuco::sentinel::erased_key<Key>{-2}};
+    map_type map{initial_size,
+                 cuco::sentinel::empty_key<Key>{-1},
+                 cuco::sentinel::empty_value<Value>{-1},
+                 cuco::sentinel::erased_key<Key>{-2}};
     for (auto i = 0; i < num_keys; i += batch_size) {
       map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size);
     }
@@ -248,10 +243,10 @@ static void BM_dynamic_erase_none(::benchmark::State& state)
 
   std::size_t batch_size = 1E6;
   for (auto _ : state) {
-    map_type map{initial_size, 
-      cuco::sentinel::empty_key<Key>{-1}, 
-      cuco::sentinel::empty_value<Value>{-1},
-      cuco::sentinel::erased_key<Key>{-2}};
+    map_type map{initial_size,
+                 cuco::sentinel::empty_key<Key>{-1},
+                 cuco::sentinel::empty_value<Value>{-1},
+                 cuco::sentinel::erased_key<Key>{-2}};
     for (auto i = 0; i < num_keys; i += batch_size) {
       map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size);
     }
diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu
index 363899a46..1e69c0c4e 100644
--- a/benchmarks/hash_table/static_map_bench.cu
+++ b/benchmarks/hash_table/static_map_bench.cu
@@ -180,9 +180,10 @@ static void BM_static_map_search_none(::benchmark::State& state)
     h_pairs[i].first  = key;
     h_pairs[i].second = val;
   }
-  
+
   // diff keys
-  for(int i = 0; i < num_keys; ++i) h_keys[i] += num_keys;
+  for (int i = 0; i < num_keys; ++i)
+    h_keys[i] += num_keys;
 
   thrust::device_vector<Key> d_keys(h_keys);
   thrust::device_vector<Value> d_results(num_keys);
@@ -269,23 +270,23 @@ static void BM_static_map_erase_none(::benchmark::State& state)
     h_pairs[i].second = val;
   }
 
-
   // diff keys
-  for(int i = 0; i < num_keys; ++i) h_keys[i] += num_keys;
+  for (int i = 0; i < num_keys; ++i)
+    h_keys[i] += num_keys;
 
   thrust::device_vector<Key> d_keys(h_keys);
   thrust::device_vector<bool> d_results(num_keys);
   thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
 
   for (auto _ : state) {
-    //state.ResumeTiming();
+    // state.ResumeTiming();
     state.PauseTiming();
     map.insert(d_pairs.begin(), d_pairs.end());
     state.ResumeTiming();
 
     map.erase(d_keys.begin(), d_keys.end());
-    
-    //state.PauseTiming();
+
+    // state.PauseTiming();
   }
 
   state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index d0d4f0ba7..8be714c3d 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -17,10 +17,11 @@
 namespace cuco {
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(std::size_t initial_capacity,
-                                                       sentinel::empty_key<Key> empty_key_sentinel,
-                                                       sentinel::empty_value<Value> empty_value_sentinel,
-                                                       Allocator const& alloc)
+dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
+  std::size_t initial_capacity,
+  sentinel::empty_key<Key> empty_key_sentinel,
+  sentinel::empty_value<Value> empty_value_sentinel,
+  Allocator const& alloc)
   : empty_key_sentinel_(empty_key_sentinel.value),
     empty_value_sentinel_(empty_value_sentinel.value),
     erased_key_sentinel_(empty_key_sentinel.value),
@@ -39,16 +40,17 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(std::size_t initial_capac
   submap_views_.push_back(submaps_[0]->get_device_view());
   submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
   submap_num_successes_.push_back(submaps_[0]->get_num_successes());
-  
+
   num_successes_ = std::allocator_traits<counter_allocator_type>::allocate(counter_allocator_, 1);
 }
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(std::size_t initial_capacity,
-                                                       sentinel::empty_key<Key> empty_key_sentinel,
-                                                       sentinel::empty_value<Value> empty_value_sentinel,
-                                                       sentinel::erased_key<Key> erased_key_sentinel,
-                                                       Allocator const& alloc)
+dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
+  std::size_t initial_capacity,
+  sentinel::empty_key<Key> empty_key_sentinel,
+  sentinel::empty_value<Value> empty_value_sentinel,
+  sentinel::erased_key<Key> erased_key_sentinel,
+  Allocator const& alloc)
   : empty_key_sentinel_(empty_key_sentinel.value),
     empty_value_sentinel_(empty_value_sentinel.value),
     erased_key_sentinel_(erased_key_sentinel.value),
@@ -72,7 +74,6 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(std::size_t initial_capac
   num_successes_ = std::allocator_traits<counter_allocator_type>::allocate(counter_allocator_, 1);
 }
 
-
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
 dynamic_map<Key, Value, Scope, Allocator>::~dynamic_map()
 {
@@ -94,7 +95,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n)
     // if the submap does not exist yet, create it
     else {
       submap_capacity = capacity_;
-      if(erased_key_sentinel_ != empty_key_sentinel_) {
+      if (erased_key_sentinel_ != empty_key_sentinel_) {
         submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>(
           submap_capacity,
           sentinel::empty_key<Key>{empty_key_sentinel_},
@@ -142,7 +143,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(InputIt first,
       // TODO: memset an atomic variable is unsafe
       static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
       CUCO_CUDA_TRY(cudaMemset(num_successes_, 0, sizeof(atomic_ctr_type)));
-      
+
       auto n                = std::min(capacity_remaining, num_to_insert);
       auto const block_size = 128;
       auto const stride     = 1;
@@ -176,9 +177,9 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(InputIt first,
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
 template <typename InputIt, typename Hash, typename KeyEqual>
 void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
-                                                       InputIt last,
-                                                       Hash hash,
-                                                       KeyEqual key_equal)
+                                                      InputIt last,
+                                                      Hash hash,
+                                                      KeyEqual key_equal)
 {
   std::size_t num_keys = std::distance(first, last);
 
@@ -190,13 +191,13 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
   // TODO: memset an atomic variable is unsafe
   static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
   CUCO_CUDA_TRY(cudaMemset(num_successes_, 0, sizeof(atomic_ctr_type)));
-  
+
   // zero out submap success counters
   static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
-  for(int i = 0; i < submaps_.size(); ++i) {
+  for (int i = 0; i < submaps_.size(); ++i) {
     CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type)));
   }
-  
+
   // TODO: hacky, improve this
   // provide device-accessible vector for each submap num_successes variable
   thrust::device_vector<atomic_ctr_type*> d_submap_num_successes(submap_num_successes_);
@@ -204,32 +205,33 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
   // TODO: hack (how to get size on host?)
   // use dynamic shared memory to hold block reduce space for each submap's erases
   constexpr size_t temp_storage_size_one_block = 48;
-  auto const temp_storage_size = submaps_.size() * temp_storage_size_one_block;
-      
+  auto const temp_storage_size                 = submaps_.size() * temp_storage_size_one_block;
+
   detail::erase<block_size, tile_size, cuco::pair_type<key_type, mapped_type>>
-    <<<grid_size, block_size, temp_storage_size>>>(
-      first,
-      first + num_keys,
-      submap_views_.data().get(),
-      submap_mutable_views_.data().get(),
-      num_successes_,
-      d_submap_num_successes.data().get(),
-      submaps_.size(),
-      hash,
-      key_equal);
+    <<<grid_size, block_size, temp_storage_size>>>(first,
+                                                   first + num_keys,
+                                                   submap_views_.data().get(),
+                                                   submap_mutable_views_.data().get(),
+                                                   num_successes_,
+                                                   d_submap_num_successes.data().get(),
+                                                   submaps_.size(),
+                                                   hash,
+                                                   key_equal);
 
   // update total dynamic map size
   std::size_t h_num_successes;
-  CUCO_CUDA_TRY(cudaMemcpy(
-    &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost));
+  CUCO_CUDA_TRY(
+    cudaMemcpy(&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost));
   size_ -= h_num_successes;
-  
+
   // TODO: if only one submap, skip this step
   // update each submap's size
-  for(int i = 0; i < submaps_.size(); ++i) {
+  for (int i = 0; i < submaps_.size(); ++i) {
     std::size_t h_submap_num_successes;
-    CUCO_CUDA_TRY(cudaMemcpy(
-      &h_submap_num_successes, submap_num_successes_[i], sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost));
+    CUCO_CUDA_TRY(cudaMemcpy(&h_submap_num_successes,
+                             submap_num_successes_[i],
+                             sizeof(atomic_ctr_type),
+                             cudaMemcpyDeviceToHost));
     submaps_[i]->size_ -= h_submap_num_successes;
   }
 }
diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
index e54b51586..61f32bda7 100644
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
@@ -196,14 +196,14 @@ template <uint32_t block_size,
           typename Hash,
           typename KeyEqual>
 __global__ void erase(InputIt first,
-                       InputIt last,
-                       viewT* submap_views,
-                       mutableViewT* submap_mutable_views,
-                       atomicT* num_successes,
-                       atomicT** submap_num_successes,
-                       const uint32_t num_submaps,
-                       Hash hash,
-                       KeyEqual key_equal)
+                      InputIt last,
+                      viewT* submap_views,
+                      mutableViewT* submap_mutable_views,
+                      atomicT* num_successes,
+                      atomicT** submap_num_successes,
+                      const uint32_t num_submaps,
+                      Hash hash,
+                      KeyEqual key_equal)
 {
   typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
   extern __shared__ typename BlockReduce::TempStorage temp_submap_storage[];
@@ -219,7 +219,7 @@ __global__ void erase(InputIt first,
   auto it   = first + tid / tile_size;
 
   while (it < last) {
-    auto erased     = false;
+    auto erased = false;
 
     // manually check for duplicates in those submaps we are not inserting into
     int i;
@@ -242,10 +242,12 @@ __global__ void erase(InputIt first,
 
   // TODO: if there's only one submap, skip this step
   // update submap thread counts
-  for(int i = 0; i < num_submaps; ++i) {
-    std::size_t submap_block_num_successes = BlockReduce(temp_submap_storage[i]).Sum(submap_thread_num_successes[i]);
-    if(threadIdx.x == 0) {
-      submap_num_successes[i]->fetch_add(submap_block_num_successes, cuda::std::memory_order_relaxed);
+  for (int i = 0; i < num_submaps; ++i) {
+    std::size_t submap_block_num_successes =
+      BlockReduce(temp_submap_storage[i]).Sum(submap_thread_num_successes[i]);
+    if (threadIdx.x == 0) {
+      submap_num_successes[i]->fetch_add(submap_block_num_successes,
+                                         cuda::std::memory_order_relaxed);
     }
   }
 }
diff --git a/include/cuco/detail/nvtx3.hpp b/include/cuco/detail/nvtx3.hpp
index 08a02153b..075c6e5d4 100644
--- a/include/cuco/detail/nvtx3.hpp
+++ b/include/cuco/detail/nvtx3.hpp
@@ -17,7 +17,7 @@
 
 #if defined(NVTX3_MINOR_VERSION) and NVTX3_MINOR_VERSION < 0
 #error \
-    "Trying to #include NVTX version 3 in a source file where an older NVTX version has already been included.  If you are not directly using NVTX (the NVIDIA Tools Extension library), you are getting this error because libraries you are using have included different versions of NVTX.  Suggested solutions are: (1) reorder #includes so the newest NVTX version is included first, (2) avoid using the conflicting libraries in the same .c/.cpp file, or (3) update the library using the older NVTX version to use the newer version instead."
+  "Trying to #include NVTX version 3 in a source file where an older NVTX version has already been included.  If you are not directly using NVTX (the NVIDIA Tools Extension library), you are getting this error because libraries you are using have included different versions of NVTX.  Suggested solutions are: (1) reorder #includes so the newest NVTX version is included first, (2) avoid using the conflicting libraries in the same .c/.cpp file, or (3) update the library using the older NVTX version to use the newer version instead."
 #endif
 
 /**
@@ -219,7 +219,7 @@
  *
  * `nvtx3::mark` allows annotating an instantaneous event in an application's
  * timeline. For example, indicating when a mutex is locked or unlocked.
- * 
+ *
  * \code{.cpp}
  * std::mutex global_lock;
  * void lock_mutex(){
@@ -526,11 +526,10 @@ namespace detail {
  * `const wchar_t*`.
  */
 template <typename T>
-constexpr auto has_name_member() noexcept -> decltype(T::name, bool()) {
-  return (std::is_same<char const*,
-                       typename std::decay<decltype(T::name)>::type>::value or
-          std::is_same<wchar_t const*,
-                       typename std::decay<decltype(T::name)>::type>::value);
+constexpr auto has_name_member() noexcept -> decltype(T::name, bool())
+{
+  return (std::is_same<char const*, typename std::decay<decltype(T::name)>::type>::value or
+          std::is_same<wchar_t const*, typename std::decay<decltype(T::name)>::type>::value);
 }
 }  // namespace detail
 
@@ -592,7 +591,7 @@ class domain {
  public:
   domain(domain const&) = delete;
   domain& operator=(domain const&) = delete;
-  domain(domain&&) = delete;
+  domain(domain&&)                 = delete;
   domain& operator=(domain&&) = delete;
 
   /**
@@ -640,7 +639,8 @@ class domain {
    * @return Reference to the `domain` corresponding to the type `DomainName`.
    */
   template <typename DomainName>
-  static domain const& get() {
+  static domain const& get()
+  {
     static_assert(detail::has_name_member<DomainName>(),
                   "Type used to identify a domain must contain a name member of"
                   "type const char* or const wchar_t*");
@@ -667,7 +667,8 @@ class domain {
    * applications will be grouped together.
    *
    */
-  struct global {};
+  struct global {
+  };
 
  private:
   /**
@@ -678,8 +679,7 @@ class domain {
    *
    * @param name A unique name identifying the domain
    */
-  explicit domain(char const* name) noexcept
-      : _domain{nvtxDomainCreateA(name)} {}
+  explicit domain(char const* name) noexcept : _domain{nvtxDomainCreateA(name)} {}
 
   /**
    * @brief Construct a new domain with the specified `name`.
@@ -689,8 +689,7 @@ class domain {
    *
    * @param name A unique name identifying the domain
    */
-  explicit domain(wchar_t const* name) noexcept
-      : _domain{nvtxDomainCreateW(name)} {}
+  explicit domain(wchar_t const* name) noexcept : _domain{nvtxDomainCreateW(name)} {}
 
   /**
    * @brief Construct a new domain with the specified `name`.
@@ -746,7 +745,8 @@ class domain {
  *
  */
 template <>
-inline domain const& domain::get<domain::global>() {
+inline domain const& domain::get<domain::global>()
+{
   static domain const d{};
   return d;
 }
@@ -770,9 +770,10 @@ struct rgb {
    * @param green_ Value of the green channel
    * @param blue_ Value of the blue channel
    */
-  constexpr rgb(component_type red_, component_type green_,
-                component_type blue_) noexcept
-      : red{red_}, green{green_}, blue{blue_} {}
+  constexpr rgb(component_type red_, component_type green_, component_type blue_) noexcept
+    : red{red_}, green{green_}, blue{blue_}
+  {
+  }
 
   component_type const red{};    ///< Red channel value
   component_type const green{};  ///< Green channel value
@@ -797,9 +798,13 @@ struct argb final : rgb {
    * @param blue_  Value of the blue channel
    *
    */
-  constexpr argb(component_type alpha_, component_type red_,
-                 component_type green_, component_type blue_) noexcept
-      : rgb{red_, green_, blue_}, alpha{alpha_} {}
+  constexpr argb(component_type alpha_,
+                 component_type red_,
+                 component_type green_,
+                 component_type blue_) noexcept
+    : rgb{red_, green_, blue_}, alpha{alpha_}
+  {
+  }
 
   component_type const alpha{};  ///< Alpha channel value
 };
@@ -844,8 +849,9 @@ class color {
    * @param argb The alpha, red, green, blue components of the desired `color`
    */
   constexpr color(argb argb) noexcept
-      : color{from_bytes_msb_to_lsb(argb.alpha, argb.red, argb.green,
-                                    argb.blue)} {}
+    : color{from_bytes_msb_to_lsb(argb.alpha, argb.red, argb.green, argb.blue)}
+  {
+  }
 
   /**
    * @brief Construct a `color` using the red, green, blue components in
@@ -856,7 +862,9 @@ class color {
    * @param rgb The red, green, blue components of the desired `color`
    */
   constexpr color(rgb rgb) noexcept
-      : color{from_bytes_msb_to_lsb(0xFF, rgb.red, rgb.green, rgb.blue)} {}
+    : color{from_bytes_msb_to_lsb(0xFF, rgb.red, rgb.green, rgb.blue)}
+  {
+  }
 
   /**
    * @brief Returns the `color`s argb hex code
@@ -870,11 +878,11 @@ class color {
    */
   constexpr nvtxColorType_t get_type() const noexcept { return _type; }
 
-  color() = delete;
-  ~color() = default;
+  color()             = delete;
+  ~color()            = default;
   color(color const&) = default;
   color& operator=(color const&) = default;
-  color(color&&) = default;
+  color(color&&)                 = default;
   color& operator=(color&&) = default;
 
  private:
@@ -886,9 +894,9 @@ class color {
   constexpr static value_type from_bytes_msb_to_lsb(uint8_t byte3,
                                                     uint8_t byte2,
                                                     uint8_t byte1,
-                                                    uint8_t byte0) noexcept {
-    return uint32_t{byte3} << 24 | uint32_t{byte2} << 16 |
-           uint32_t{byte1} << 8 | uint32_t{byte0};
+                                                    uint8_t byte0) noexcept
+  {
+    return uint32_t{byte3} << 24 | uint32_t{byte2} << 16 | uint32_t{byte1} << 8 | uint32_t{byte0};
   }
 
   value_type const _value{};                     ///< color's argb color code
@@ -938,11 +946,11 @@ class category {
    */
   constexpr id_type get_id() const noexcept { return id_; }
 
-  category() = delete;
-  ~category() = default;
+  category()                = delete;
+  ~category()               = default;
   category(category const&) = default;
   category& operator=(category const&) = default;
-  category(category&&) = default;
+  category(category&&)                 = default;
   category& operator=(category&&) = default;
 
  private:
@@ -1036,7 +1044,8 @@ class named_category final : public category {
    * `char const*` or `wchar_t const*` and `C::id`.
    */
   template <typename C>
-  static named_category<D> const& get() noexcept {
+  static named_category<D> const& get() noexcept
+  {
     static_assert(detail::has_name_member<C>(),
                   "Type used to name a category must contain a name member.");
     static named_category<D> const category{C::id, C::name};
@@ -1052,7 +1061,8 @@ class named_category final : public category {
    * @param[in] id The category id to name
    * @param[in] name The name to associated with `id`
    */
-  named_category(id_type id, char const* name) noexcept : category{id} {
+  named_category(id_type id, char const* name) noexcept : category{id}
+  {
     nvtxDomainNameCategoryA(domain::get<D>(), get_id(), name);
   };
 
@@ -1066,7 +1076,8 @@ class named_category final : public category {
    * @param[in] id The category id to name
    * @param[in] name The name to associated with `id`
    */
-  named_category(id_type id, wchar_t const* name) noexcept : category{id} {
+  named_category(id_type id, wchar_t const* name) noexcept : category{id}
+  {
     nvtxDomainNameCategoryW(domain::get<D>(), get_id(), name);
   };
 };
@@ -1154,7 +1165,8 @@ class registered_message {
    * @return Reference to a `registered_message` associated with the type `M`.
    */
   template <typename M>
-  static registered_message<D> const& get() noexcept {
+  static registered_message<D> const& get() noexcept
+  {
     static registered_message<D> const registered_message{M::message};
     return registered_message;
   }
@@ -1171,7 +1183,9 @@ class registered_message {
    * @param msg The contents of the message
    */
   explicit registered_message(char const* msg) noexcept
-      : handle_{nvtxDomainRegisterStringA(domain::get<D>(), msg)} {}
+    : handle_{nvtxDomainRegisterStringA(domain::get<D>(), msg)}
+  {
+  }
 
   /**
    * @brief Constructs a `registered_message` from the specified `msg` string.
@@ -1184,8 +1198,7 @@ class registered_message {
    *
    * @param msg The contents of the message
    */
-  explicit registered_message(std::string const& msg) noexcept
-      : registered_message{msg.c_str()} {}
+  explicit registered_message(std::string const& msg) noexcept : registered_message{msg.c_str()} {}
 
   /**
    * @brief Constructs a `registered_message` from the specified `msg` string.
@@ -1199,7 +1212,9 @@ class registered_message {
    * @param msg The contents of the message
    */
   explicit registered_message(wchar_t const* msg) noexcept
-      : handle_{nvtxDomainRegisterStringW(domain::get<D>(), msg)} {}
+    : handle_{nvtxDomainRegisterStringW(domain::get<D>(), msg)}
+  {
+  }
 
   /**
    * @brief Constructs a `registered_message` from the specified `msg` string.
@@ -1212,8 +1227,7 @@ class registered_message {
    *
    * @param msg The contents of the message
    */
-  explicit registered_message(std::wstring const& msg) noexcept
-      : registered_message{msg.c_str()} {}
+  explicit registered_message(std::wstring const& msg) noexcept : registered_message{msg.c_str()} {}
 
   /**
    * @brief Returns the registered message's handle
@@ -1221,11 +1235,11 @@ class registered_message {
    */
   nvtxStringHandle_t get_handle() const noexcept { return handle_; }
 
-  registered_message() = delete;
-  ~registered_message() = default;
+  registered_message()                          = delete;
+  ~registered_message()                         = default;
   registered_message(registered_message const&) = default;
   registered_message& operator=(registered_message const&) = default;
-  registered_message(registered_message&&) = default;
+  registered_message(registered_message&&)                 = default;
   registered_message& operator=(registered_message&&) = default;
 
  private:
@@ -1279,8 +1293,8 @@ class message {
    *
    * @param msg The contents of the message
    */
-  NVTX3_RELAXED_CONSTEXPR message(char const* msg) noexcept
-      : type_{NVTX_MESSAGE_TYPE_ASCII} {
+  NVTX3_RELAXED_CONSTEXPR message(char const* msg) noexcept : type_{NVTX_MESSAGE_TYPE_ASCII}
+  {
     value_.ascii = msg;
   }
 
@@ -1306,8 +1320,8 @@ class message {
    *
    * @param msg The contents of the message
    */
-  NVTX3_RELAXED_CONSTEXPR message(wchar_t const* msg) noexcept
-      : type_{NVTX_MESSAGE_TYPE_UNICODE} {
+  NVTX3_RELAXED_CONSTEXPR message(wchar_t const* msg) noexcept : type_{NVTX_MESSAGE_TYPE_UNICODE}
+  {
     value_.unicode = msg;
   }
 
@@ -1338,7 +1352,8 @@ class message {
    */
   template <typename D>
   NVTX3_RELAXED_CONSTEXPR message(registered_message<D> const& msg) noexcept
-      : type_{NVTX_MESSAGE_TYPE_REGISTERED} {
+    : type_{NVTX_MESSAGE_TYPE_REGISTERED}
+  {
     value_.registered = msg.get_handle();
   }
 
@@ -1346,17 +1361,13 @@ class message {
    * @brief Return the union holding the value of the message.
    *
    */
-  NVTX3_RELAXED_CONSTEXPR value_type get_value() const noexcept {
-    return value_;
-  }
+  NVTX3_RELAXED_CONSTEXPR value_type get_value() const noexcept { return value_; }
 
   /**
    * @brief Return the type information about the value the union holds.
    *
    */
-  NVTX3_RELAXED_CONSTEXPR nvtxMessageType_t get_type() const noexcept {
-    return type_;
-  }
+  NVTX3_RELAXED_CONSTEXPR nvtxMessageType_t get_type() const noexcept { return type_; }
 
  private:
   nvtxMessageType_t const type_{};  ///< message type
@@ -1390,7 +1401,8 @@ class payload {
    * @param value Value to use as contents of the payload
    */
   NVTX3_RELAXED_CONSTEXPR explicit payload(int64_t value) noexcept
-      : type_{NVTX_PAYLOAD_TYPE_INT64}, value_{} {
+    : type_{NVTX_PAYLOAD_TYPE_INT64}, value_{}
+  {
     value_.llValue = value;
   }
 
@@ -1400,7 +1412,8 @@ class payload {
    * @param value Value to use as contents of the payload
    */
   NVTX3_RELAXED_CONSTEXPR explicit payload(int32_t value) noexcept
-      : type_{NVTX_PAYLOAD_TYPE_INT32}, value_{} {
+    : type_{NVTX_PAYLOAD_TYPE_INT32}, value_{}
+  {
     value_.iValue = value;
   }
 
@@ -1410,7 +1423,8 @@ class payload {
    * @param value Value to use as contents of the payload
    */
   NVTX3_RELAXED_CONSTEXPR explicit payload(uint64_t value) noexcept
-      : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT64}, value_{} {
+    : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT64}, value_{}
+  {
     value_.ullValue = value;
   }
 
@@ -1420,7 +1434,8 @@ class payload {
    * @param value Value to use as contents of the payload
    */
   NVTX3_RELAXED_CONSTEXPR explicit payload(uint32_t value) noexcept
-      : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT32}, value_{} {
+    : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT32}, value_{}
+  {
     value_.uiValue = value;
   }
 
@@ -1431,7 +1446,8 @@ class payload {
    * @param value Value to use as contents of the payload
    */
   NVTX3_RELAXED_CONSTEXPR explicit payload(float value) noexcept
-      : type_{NVTX_PAYLOAD_TYPE_FLOAT}, value_{} {
+    : type_{NVTX_PAYLOAD_TYPE_FLOAT}, value_{}
+  {
     value_.fValue = value;
   }
 
@@ -1442,7 +1458,8 @@ class payload {
    * @param value Value to use as contents of the payload
    */
   NVTX3_RELAXED_CONSTEXPR explicit payload(double value) noexcept
-      : type_{NVTX_PAYLOAD_TYPE_DOUBLE}, value_{} {
+    : type_{NVTX_PAYLOAD_TYPE_DOUBLE}, value_{}
+  {
     value_.dValue = value;
   }
 
@@ -1450,17 +1467,13 @@ class payload {
    * @brief Return the union holding the value of the payload
    *
    */
-  NVTX3_RELAXED_CONSTEXPR value_type get_value() const noexcept {
-    return value_;
-  }
+  NVTX3_RELAXED_CONSTEXPR value_type get_value() const noexcept { return value_; }
 
   /**
    * @brief Return the information about the type the union holds.
    *
    */
-  NVTX3_RELAXED_CONSTEXPR nvtxPayloadType_t get_type() const noexcept {
-    return type_;
-  }
+  NVTX3_RELAXED_CONSTEXPR nvtxPayloadType_t get_type() const noexcept { return type_; }
 
  private:
   nvtxPayloadType_t const type_;  ///< Type of the payload value
@@ -1537,17 +1550,19 @@ class event_attributes {
    * category, color, payload, nor message.
    */
   constexpr event_attributes() noexcept
-      : attributes_{
-            NVTX_VERSION,                  // version
-            sizeof(nvtxEventAttributes_t), // size
-            0,                             // category
-            NVTX_COLOR_UNKNOWN,            // color type
-            0,                             // color value
-            NVTX_PAYLOAD_UNKNOWN,          // payload type
-            0,                             // payload value (union)
-            NVTX_MESSAGE_UNKNOWN,          // message type
-            0                              // message value (union)
-        } {}
+    : attributes_{
+        NVTX_VERSION,                   // version
+        sizeof(nvtxEventAttributes_t),  // size
+        0,                              // category
+        NVTX_COLOR_UNKNOWN,             // color type
+        0,                              // color value
+        NVTX_PAYLOAD_UNKNOWN,           // payload type
+        0,                              // payload value (union)
+        NVTX_MESSAGE_UNKNOWN,           // message type
+        0                               // message value (union)
+      }
+  {
+  }
 
   /**
    * @brief Variadic constructor where the first argument is a `category`.
@@ -1557,9 +1572,9 @@ class event_attributes {
    *
    */
   template <typename... Args>
-  NVTX3_RELAXED_CONSTEXPR explicit event_attributes(
-      category const& c, Args const&... args) noexcept
-      : event_attributes(args...) {
+  NVTX3_RELAXED_CONSTEXPR explicit event_attributes(category const& c, Args const&... args) noexcept
+    : event_attributes(args...)
+  {
     attributes_.category = c.get_id();
   }
 
@@ -1571,10 +1586,10 @@ class event_attributes {
    *
    */
   template <typename... Args>
-  NVTX3_RELAXED_CONSTEXPR explicit event_attributes(
-      color const& c, Args const&... args) noexcept
-      : event_attributes(args...) {
-    attributes_.color = c.get_value();
+  NVTX3_RELAXED_CONSTEXPR explicit event_attributes(color const& c, Args const&... args) noexcept
+    : event_attributes(args...)
+  {
+    attributes_.color     = c.get_value();
     attributes_.colorType = c.get_type();
   }
 
@@ -1586,10 +1601,10 @@ class event_attributes {
    *
    */
   template <typename... Args>
-  NVTX3_RELAXED_CONSTEXPR explicit event_attributes(
-      payload const& p, Args const&... args) noexcept
-      : event_attributes(args...) {
-    attributes_.payload = p.get_value();
+  NVTX3_RELAXED_CONSTEXPR explicit event_attributes(payload const& p, Args const&... args) noexcept
+    : event_attributes(args...)
+  {
+    attributes_.payload     = p.get_value();
     attributes_.payloadType = p.get_type();
   }
 
@@ -1601,17 +1616,17 @@ class event_attributes {
    *
    */
   template <typename... Args>
-  NVTX3_RELAXED_CONSTEXPR explicit event_attributes(
-      message const& m, Args const&... args) noexcept
-      : event_attributes(args...) {
-    attributes_.message = m.get_value();
+  NVTX3_RELAXED_CONSTEXPR explicit event_attributes(message const& m, Args const&... args) noexcept
+    : event_attributes(args...)
+  {
+    attributes_.message     = m.get_value();
     attributes_.messageType = m.get_type();
   }
 
-  ~event_attributes() = default;
+  ~event_attributes()                       = default;
   event_attributes(event_attributes const&) = default;
   event_attributes& operator=(event_attributes const&) = default;
-  event_attributes(event_attributes&&) = default;
+  event_attributes(event_attributes&&)                 = default;
   event_attributes& operator=(event_attributes&&) = default;
 
   /**
@@ -1689,7 +1704,8 @@ class domain_thread_range {
    * @param[in] attr `event_attributes` that describes the desired attributes
    * of the range.
    */
-  explicit domain_thread_range(event_attributes const& attr) noexcept {
+  explicit domain_thread_range(event_attributes const& attr) noexcept
+  {
     nvtxDomainRangePushEx(domain::get<D>(), attr.get());
   }
 
@@ -1720,11 +1736,14 @@ class domain_thread_range {
    * forward.
    *
    */
-  template <typename First, typename... Args,
-            typename = typename std::enable_if<not std::is_same<
-                event_attributes, typename std::decay<First>>::value>>
+  template <typename First,
+            typename... Args,
+            typename = typename std::enable_if<
+              not std::is_same<event_attributes, typename std::decay<First>>::value>>
   explicit domain_thread_range(First const& first, Args const&... args) noexcept
-      : domain_thread_range{event_attributes{first, args...}} {}
+    : domain_thread_range{event_attributes{first, args...}}
+  {
+  }
 
   /**
    * @brief Default constructor creates a `domain_thread_range` with no
@@ -1735,7 +1754,7 @@ class domain_thread_range {
 
   domain_thread_range(domain_thread_range const&) = delete;
   domain_thread_range& operator=(domain_thread_range const&) = delete;
-  domain_thread_range(domain_thread_range&&) = delete;
+  domain_thread_range(domain_thread_range&&)                 = delete;
   domain_thread_range& operator=(domain_thread_range&&) = delete;
 
   /**
@@ -1771,8 +1790,8 @@ struct range_handle {
    */
   constexpr value_type get_value() const noexcept { return _range_id; }
 
-private:
-  value_type _range_id{}; ///< The underlying NVTX range id
+ private:
+  value_type _range_id{};  ///< The underlying NVTX range id
 };
 
 /**
@@ -1801,7 +1820,8 @@ struct range_handle {
  * @return Unique handle to be passed to `end_range` to end the range.
  */
 template <typename D = domain::global>
-range_handle start_range(event_attributes const &attr) noexcept {
+range_handle start_range(event_attributes const& attr) noexcept
+{
   return range_handle{nvtxDomainRangeStartEx(domain::get<D>(), attr.get())};
 }
 
@@ -1833,10 +1853,12 @@ range_handle start_range(event_attributes const &attr) noexcept {
  * `event_attributes`.
  * @return Unique handle to be passed to `end_range` to end the range.
  */
-template <typename First, typename... Args,
-          typename = typename std::enable_if<not std::is_same<
-              event_attributes, typename std::decay<First>>::value>>
-range_handle start_range(First const &first, Args const &... args) noexcept {
+template <typename First,
+          typename... Args,
+          typename = typename std::enable_if<
+            not std::is_same<event_attributes, typename std::decay<First>>::value>>
+range_handle start_range(First const& first, Args const&... args) noexcept
+{
   return start_range(event_attributes{first, args...});
 }
 
@@ -1873,15 +1895,17 @@ void end_range(range_handle r) { nvtxRangeEnd(r.get_value()); }
  * to which the `domain_process_range` belongs. Else, `domain::global` to
  * indicate that the global NVTX domain should be used.
  */
-template <typename D = domain::global> class domain_process_range {
+template <typename D = domain::global>
+class domain_process_range {
  public:
   /**
    * @brief Construct a new domain process range object
    *
    * @param attr
    */
-  explicit domain_process_range(event_attributes const &attr) noexcept
-      : handle_{start_range(attr)} {}
+  explicit domain_process_range(event_attributes const& attr) noexcept : handle_{start_range(attr)}
+  {
+  }
 
   /**
    * @brief Construct a new domain process range object
@@ -1889,28 +1913,28 @@ template <typename D = domain::global> class domain_process_range {
    * @param first
    * @param args
    */
-  template <typename First, typename... Args,
-            typename = typename std::enable_if<not std::is_same<
-                event_attributes, typename std::decay<First>>::value>>
-  explicit domain_process_range(First const &first,
-                                Args const &... args) noexcept
-      : domain_process_range{event_attributes{first, args...}} {}
+  template <typename First,
+            typename... Args,
+            typename = typename std::enable_if<
+              not std::is_same<event_attributes, typename std::decay<First>>::value>>
+  explicit domain_process_range(First const& first, Args const&... args) noexcept
+    : domain_process_range{event_attributes{first, args...}}
+  {
+  }
 
   /**
    * @brief Construct a new domain process range object
    *
    */
-  constexpr domain_process_range() noexcept
-      : domain_process_range{event_attributes{}} {}
+  constexpr domain_process_range() noexcept : domain_process_range{event_attributes{}} {}
 
   /**
    * @brief Destroy the `domain_process_range` ending the range.
    *
    */
-  ~domain_process_range() noexcept {
-    if (not moved_from_) {
-      end_range(handle_);
-    }
+  ~domain_process_range() noexcept
+  {
+    if (not moved_from_) { end_range(handle_); }
   }
 
   /**
@@ -1919,8 +1943,8 @@ template <typename D = domain::global> class domain_process_range {
    *
    * @param other
    */
-  domain_process_range(domain_process_range &&other) noexcept
-      : handle_{other.handle_} {
+  domain_process_range(domain_process_range&& other) noexcept : handle_{other.handle_}
+  {
     other.moved_from_ = true;
   }
 
@@ -1931,23 +1955,24 @@ template <typename D = domain::global> class domain_process_range {
    * @param other
    * @return domain_process_range&
    */
-  domain_process_range &operator=(domain_process_range &&other) noexcept {
-    handle_ = other.handle_;
+  domain_process_range& operator=(domain_process_range&& other) noexcept
+  {
+    handle_           = other.handle_;
     other.moved_from_ = true;
   }
 
   /// Copy construction is not allowed to prevent multiple objects from owning
   /// the same range handle
-  domain_process_range(domain_process_range const &) = delete;
+  domain_process_range(domain_process_range const&) = delete;
 
   /// Copy assignment is not allowed to prevent multiple objects from owning the
   /// same range handle
-  domain_process_range &operator=(domain_process_range const &) = delete;
+  domain_process_range& operator=(domain_process_range const&) = delete;
 
  private:
-  range_handle handle_;    ///< Range handle used to correlate
+  range_handle handle_;     ///< Range handle used to correlate
                             ///< the start/end of the range
-  bool moved_from_{false}; ///< Indicates if the object has had
+  bool moved_from_{false};  ///< Indicates if the object has had
                             ///< it's contents moved from it,
                             ///< indicating it should not attempt
                             ///< to end the NVTX range.
@@ -1981,7 +2006,8 @@ using process_range = domain_process_range<>;
  * of the mark.
  */
 template <typename D = nvtx3::domain::global>
-inline void mark(event_attributes const& attr) noexcept {
+inline void mark(event_attributes const& attr) noexcept
+{
   nvtxDomainMarkEx(domain::get<D>(), attr.get());
 }
 
diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index 977b00de0..35eb4898f 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -17,11 +17,13 @@
 #pragma once
 
 
+#include <cooperative_groups.h>
+#include <cub/cub.cuh>
+#include <cuco/detail/dynamic_map_kernels.cuh>
 #include <cuco/detail/error.hpp>
 #include <cuco/sentinel.cuh>
 #include <cuco/static_map.cuh>
 #include <cuda/std/atomic>
-#include <cuco/detail/dynamic_map_kernels.cuh>
 #include <thrust/device_vector.h>
 #include <thrust/functional.h>
 
@@ -100,26 +102,24 @@ class dynamic_map {
   static_assert(std::is_arithmetic<Key>::value, "Unsupported, non-arithmetic key type.");
 
  public:
-  using value_type                = cuco::pair_type<Key, Value>;
-  using key_type                  = Key;
-  using mapped_type               = Value;
-  using atomic_ctr_type           = cuda::atomic<std::size_t, Scope>;
-  using view_type                 = typename static_map<Key, Value, Scope>::device_view;
-  using mutable_view_type         = typename static_map<Key, Value, Scope>::device_mutable_view;
+  using value_type        = cuco::pair_type<Key, Value>;
+  using key_type          = Key;
+  using mapped_type       = Value;
+  using atomic_ctr_type   = cuda::atomic<std::size_t, Scope>;
+  using view_type         = typename static_map<Key, Value, Scope>::device_view;
+  using mutable_view_type = typename static_map<Key, Value, Scope>::device_mutable_view;
   using counter_allocator_type =
     typename std::allocator_traits<Allocator>::rebind_alloc<atomic_ctr_type>;
   
   dynamic_map(dynamic_map const&) = delete;
   dynamic_map(dynamic_map&&)      = delete;
 
-  template<typename T1, typename T2>
-  dynamic_map(std::size_t, T1, T2,
-              Allocator const& = Allocator{}) = delete;
-  
-  template<typename T1, typename T2, typename T3>
-  dynamic_map(std::size_t, T1, T2, T3,
-              Allocator const& = Allocator{}) = delete;
-  
+  template <typename T1, typename T2>
+  dynamic_map(std::size_t, T1, T2, Allocator const& = Allocator{}) = delete;
+
+  template <typename T1, typename T2, typename T3>
+  dynamic_map(std::size_t, T1, T2, T3, Allocator const& = Allocator{}) = delete;
+
   dynamic_map& operator=(dynamic_map const&) = delete;
   dynamic_map& operator=(dynamic_map&&) = delete;
 
@@ -147,7 +147,7 @@ class dynamic_map {
               sentinel::empty_key<Key> empty_key_sentinel,
               sentinel::empty_value<Value> empty_value_sentinel,
               Allocator const& alloc = Allocator{});
-  
+
   dynamic_map(std::size_t initial_capacity,
               sentinel::empty_key<Key> empty_key_sentinel,
               sentinel::empty_value<Value> empty_value_sentinel,
@@ -188,7 +188,7 @@ class dynamic_map {
             typename Hash     = cuco::detail::MurmurHash3_32<key_type>,
             typename KeyEqual = thrust::equal_to<key_type>>
   void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{});
-  
+
   template <typename InputIt,
             typename Hash     = cuco::detail::MurmurHash3_32<key_type>,
             typename KeyEqual = thrust::equal_to<key_type>>
@@ -276,9 +276,9 @@ class dynamic_map {
   key_type erased_key_sentinel_{};
 
   // TODO: initialize this
-  std::size_t size_{};                  ///< Number of keys in the map
-  std::size_t capacity_{};              ///< Maximum number of keys that can be inserted
-  float max_load_factor_{};             ///< Max load factor before capacity growth
+  std::size_t size_{};       ///< Number of keys in the map
+  std::size_t capacity_{};   ///< Maximum number of keys that can be inserted
+  float max_load_factor_{};  ///< Max load factor before capacity growth
 
   std::vector<std::unique_ptr<static_map<key_type, mapped_type, Scope>>>
     submaps_;                                      ///< vector of pointers to each submap
@@ -287,8 +287,9 @@ class dynamic_map {
     submap_mutable_views_;          ///< vector of mutable device views for each submap
   std::size_t min_insert_size_{};   ///< min remaining capacity of submap for insert
   atomic_ctr_type* num_successes_;  ///< number of successfully inserted keys on insert
-  std::vector<atomic_ctr_type*> submap_num_successes_; ///< number of succesfully erased keys for each submap
-  Allocator alloc_{};  ///< Allocator passed to submaps to allocate their device storage
+  std::vector<atomic_ctr_type*>
+    submap_num_successes_;  ///< number of succesfully erased keys for each submap
+  Allocator alloc_{};       ///< Allocator passed to submaps to allocate their device storage
   counter_allocator_type counter_allocator_{};  ///< Allocator used to allocate `num_successes_`
 };
 }  // namespace cuco
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
index 3ef487b7c..f72ce41c5 100644
--- a/include/cuco/static_map.cuh
+++ b/include/cuco/static_map.cuh
@@ -1414,10 +1414,7 @@ class static_map {
                                sentinel::erased_key<Key>{erased_key_sentinel_});
   }
 
-  atomic_ctr_type* get_num_successes() const noexcept
-  {
-    return num_successes_;
-  }
+  atomic_ctr_type* get_num_successes() const noexcept { return num_successes_; }
 
  private:
   pair_atomic_type* slots_{nullptr};            ///< Pointer to flat slots storage
diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu
index 385b2e426..64e4cce38 100644
--- a/tests/dynamic_map/erase_test.cu
+++ b/tests/dynamic_map/erase_test.cu
@@ -21,18 +21,16 @@
 
 #include <utils.hpp>
 
-
-TEMPLATE_TEST_CASE_SIG(
-  "erase key", "", ((typename T), T), (int32_t))
+TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t))
 {
   using Key   = T;
   using Value = T;
-  
+
   unsigned long num_keys = 1'000'000;
-  cuco::dynamic_map<Key, Value> map{num_keys * 2, 
-    cuco::sentinel::empty_key<Key>{-1}, 
-    cuco::sentinel::empty_value<Value>{-1}, 
-    cuco::sentinel::erased_key<Key>{-2}};
+  cuco::dynamic_map<Key, Value> map{num_keys * 2,
+                                    cuco::sentinel::empty_key<Key>{-1},
+                                    cuco::sentinel::empty_value<Value>{-1},
+                                    cuco::sentinel::erased_key<Key>{-2}};
 
   thrust::device_vector<Key> d_keys(num_keys);
   thrust::device_vector<Value> d_values(num_keys);
@@ -40,12 +38,11 @@ TEMPLATE_TEST_CASE_SIG(
 
   thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1);
   thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1);
-    
+
   auto pairs_begin =
     thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin()));
 
-  SECTION(
-    "Check basic insert/erase")
+  SECTION("Check basic insert/erase")
   {
     // *****************************************
     // first, check single submap works properly
@@ -55,7 +52,6 @@ TEMPLATE_TEST_CASE_SIG(
 
     REQUIRE(map.get_size() == num_keys);
 
-    
     map.erase(d_keys.begin(), d_keys.end());
 
     // delete decreases count correctly
@@ -68,7 +64,7 @@ TEMPLATE_TEST_CASE_SIG(
                                 d_keys_exist.end(),
                                 [] __device__(const bool key_found) { return key_found; }));
 
-    // ensures that map is reusing deleted slots    
+    // ensures that map is reusing deleted slots
     map.insert(pairs_begin, pairs_begin + num_keys);
 
     REQUIRE(map.get_size() == num_keys);
@@ -76,63 +72,64 @@ TEMPLATE_TEST_CASE_SIG(
     map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin());
 
     REQUIRE(cuco::test::all_of(d_keys_exist.begin(),
-                                d_keys_exist.end(),
-                                [] __device__(const bool key_found) { return key_found; }));
+                               d_keys_exist.end(),
+                               [] __device__(const bool key_found) { return key_found; }));
 
     // erase can act selectively
-    map.erase(d_keys.begin(), d_keys.begin() + num_keys/2);
+    map.erase(d_keys.begin(), d_keys.begin() + num_keys / 2);
     map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin());
-    
+
     REQUIRE(cuco::test::none_of(d_keys_exist.begin(),
-                                d_keys_exist.begin() + num_keys/2,
+                                d_keys_exist.begin() + num_keys / 2,
                                 [] __device__(const bool key_found) { return key_found; }));
 
-    REQUIRE(cuco::test::all_of(d_keys_exist.begin() + num_keys/2,
-                                d_keys_exist.end(),
-                                [] __device__(const bool key_found) { return key_found; }));
-    
+    REQUIRE(cuco::test::all_of(d_keys_exist.begin() + num_keys / 2,
+                               d_keys_exist.end(),
+                               [] __device__(const bool key_found) { return key_found; }));
+
     // clear map
-    map.erase(d_keys.begin()+num_keys/2, d_keys.end());
-    
+    map.erase(d_keys.begin() + num_keys / 2, d_keys.end());
+
     // *************************************************
     // second, check multiple submaps case works properly
     // *************************************************
-    
+
     thrust::device_vector<Key> d_keys2(4 * num_keys);
     thrust::device_vector<Value> d_values2(4 * num_keys);
     thrust::device_vector<bool> d_keys_exist2(4 * num_keys);
-  
+
     thrust::sequence(thrust::device, d_keys2.begin(), d_keys2.end(), 1);
     thrust::sequence(thrust::device, d_values2.begin(), d_values2.end(), 1);
-      
+
     auto pairs_begin2 =
       thrust::make_zip_iterator(thrust::make_tuple(d_keys2.begin(), d_values2.begin()));
 
-    map.insert(pairs_begin2, pairs_begin2 + 4*num_keys);
-    
+    map.insert(pairs_begin2, pairs_begin2 + 4 * num_keys);
+
     // map should resize twice if the erased slots are successfully reused
-    REQUIRE(map.get_capacity() == 8*num_keys);
+    REQUIRE(map.get_capacity() == 8 * num_keys);
 
     // check that keys can be successfully deleted from only the first and second submaps
-    map.erase(d_keys2.begin(), d_keys2.begin() + 2*num_keys);
+    map.erase(d_keys2.begin(), d_keys2.begin() + 2 * num_keys);
 
     map.contains(d_keys2.begin(), d_keys2.end(), d_keys_exist2.begin());
-    
+
     REQUIRE(cuco::test::none_of(d_keys_exist2.begin(),
-                                d_keys_exist2.begin() + 2*num_keys,
+                                d_keys_exist2.begin() + 2 * num_keys,
                                 [] __device__(const bool key_found) { return key_found; }));
 
-    REQUIRE(cuco::test::all_of(d_keys_exist2.begin() + 2*num_keys,
-                                d_keys_exist2.end(),
-                                [] __device__(const bool key_found) { return key_found; }));
+    REQUIRE(cuco::test::all_of(d_keys_exist2.begin() + 2 * num_keys,
+                               d_keys_exist2.end(),
+                               [] __device__(const bool key_found) { return key_found; }));
 
-    REQUIRE(map.get_size() == 2*num_keys);
+    REQUIRE(map.get_size() == 2 * num_keys);
 
-    // check that keys can be successfully deleted from all submaps (some will be unsuccessful erases)
+    // check that keys can be successfully deleted from all submaps (some will be unsuccessful
+    // erases)
     map.erase(d_keys2.begin(), d_keys2.end());
-    
+
     map.contains(d_keys2.begin(), d_keys2.end(), d_keys_exist2.begin());
-    
+
     REQUIRE(cuco::test::none_of(d_keys_exist2.begin(),
                                 d_keys_exist2.end(),
                                 [] __device__(const bool key_found) { return key_found; }));

From b00fcba685e924b102058abac785a5f2fde74157 Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@nvidia.com>
Date: Wed, 6 Apr 2022 22:54:24 -0700
Subject: [PATCH 12/36] shared mem atomics to keep track of per-submap erases

---
 benchmarks/hash_table/dynamic_map_bench.cu  |  12 +-
 include/cuco/detail/dynamic_map.inl         |  67 +++++------
 include/cuco/detail/dynamic_map_kernels.cuh | 122 ++++++++++++++++----
 include/cuco/dynamic_map.cuh                |   8 +-
 tests/dynamic_map/erase_test.cu             |  12 +-
 5 files changed, 141 insertions(+), 80 deletions(-)

diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu
index 222699abb..e150c02be 100644
--- a/benchmarks/hash_table/dynamic_map_bench.cu
+++ b/benchmarks/hash_table/dynamic_map_bench.cu
@@ -269,12 +269,12 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
-
+*/
 BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
-
+/*
 BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
@@ -309,22 +309,22 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
-
+*/
 BENCHMARK_TEMPLATE(BM_dynamic_erase_none, int32_t, int32_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
-
+/*
 BENCHMARK_TEMPLATE(BM_dynamic_erase_none, int32_t, int32_t, dist_type::GAUSSIAN)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
-*/
+
 BENCHMARK_TEMPLATE(BM_dynamic_search_none, int32_t, int32_t, dist_type::GAUSSIAN)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
-/*
+
 BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index 8be714c3d..c97622433 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -39,8 +39,6 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
     alloc));
   submap_views_.push_back(submaps_[0]->get_device_view());
   submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
-  submap_num_successes_.push_back(submaps_[0]->get_num_successes());
-
   num_successes_ = std::allocator_traits<counter_allocator_type>::allocate(counter_allocator_, 1);
 }
 
@@ -70,7 +68,7 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
   submap_views_.push_back(submaps_[0]->get_device_view());
   submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
   submap_num_successes_.push_back(submaps_[0]->get_num_successes());
-
+  d_submap_num_successes_ = submap_num_successes_;
   num_successes_ = std::allocator_traits<counter_allocator_type>::allocate(counter_allocator_, 1);
 }
 
@@ -102,6 +100,8 @@ void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n)
           sentinel::empty_value<Value>{empty_value_sentinel_},
           sentinel::erased_key<Key>{erased_key_sentinel_},
           alloc_));
+        submap_num_successes_.push_back(submaps_[submap_idx]->get_num_successes());
+        d_submap_num_successes_ = submap_num_successes_;
       } else {
         submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>(
           submap_capacity,
@@ -111,8 +111,6 @@ void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n)
       }
       submap_views_.push_back(submaps_[submap_idx]->get_device_view());
       submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view());
-      submap_num_successes_.push_back(submaps_[submap_idx]->get_num_successes());
-
       capacity_ *= 2;
     }
 
@@ -164,7 +162,6 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(InputIt first,
       std::size_t h_num_successes;
       CUCO_CUDA_TRY(cudaMemcpy(
         &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost));
-
       submaps_[submap_idx]->size_ += h_num_successes;
       size_ += h_num_successes;
       first += n;
@@ -193,46 +190,42 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
   CUCO_CUDA_TRY(cudaMemset(num_successes_, 0, sizeof(atomic_ctr_type)));
 
   // zero out submap success counters
-  static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
-  for (int i = 0; i < submaps_.size(); ++i) {
-    CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type)));
+  if(submaps_.size() > 1) {
+    static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
+    for(int i = 0; i < submaps_.size(); ++i) {
+      CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type)));
+    }
   }
-
-  // TODO: hacky, improve this
-  // provide device-accessible vector for each submap num_successes variable
-  thrust::device_vector<atomic_ctr_type*> d_submap_num_successes(submap_num_successes_);
-
-  // TODO: hack (how to get size on host?)
-  // use dynamic shared memory to hold block reduce space for each submap's erases
-  constexpr size_t temp_storage_size_one_block = 48;
-  auto const temp_storage_size                 = submaps_.size() * temp_storage_size_one_block;
+  
+  auto const temp_storage_size = submaps_.size() * sizeof(unsigned long long);
 
   detail::erase<block_size, tile_size, cuco::pair_type<key_type, mapped_type>>
-    <<<grid_size, block_size, temp_storage_size>>>(first,
-                                                   first + num_keys,
-                                                   submap_views_.data().get(),
-                                                   submap_mutable_views_.data().get(),
-                                                   num_successes_,
-                                                   d_submap_num_successes.data().get(),
-                                                   submaps_.size(),
-                                                   hash,
-                                                   key_equal);
+    <<<grid_size, block_size, temp_storage_size>>>(
+      first,
+      first + num_keys,
+      submap_views_.data().get(),
+      submap_mutable_views_.data().get(),
+      num_successes_,
+      d_submap_num_successes_.data().get(),
+      submaps_.size(),
+      hash,
+      key_equal);
 
   // update total dynamic map size
   std::size_t h_num_successes;
   CUCO_CUDA_TRY(
     cudaMemcpy(&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost));
   size_ -= h_num_successes;
-
-  // TODO: if only one submap, skip this step
-  // update each submap's size
-  for (int i = 0; i < submaps_.size(); ++i) {
-    std::size_t h_submap_num_successes;
-    CUCO_CUDA_TRY(cudaMemcpy(&h_submap_num_successes,
-                             submap_num_successes_[i],
-                             sizeof(atomic_ctr_type),
-                             cudaMemcpyDeviceToHost));
-    submaps_[i]->size_ -= h_submap_num_successes;
+  
+  if(submaps_.size() == 1) {
+    submaps_[0]->size_ -= h_num_successes;
+  } else {
+    for(int i = 0; i < submaps_.size(); ++i) {
+      std::size_t h_submap_num_successes;
+      CUCO_CUDA_TRY(cudaMemcpy(
+        &h_submap_num_successes, submap_num_successes_[i], sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost));
+      submaps_[i]->size_ -= h_submap_num_successes;
+    }
   }
 }
 
diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
index 61f32bda7..fbc7f9e35 100644
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
@@ -187,7 +187,6 @@ __global__ void insert(InputIt first,
 }
 
 template <uint32_t block_size,
-          uint32_t tile_size,
           typename pair_type,
           typename InputIt,
           typename viewT,
@@ -206,33 +205,108 @@ __global__ void erase(InputIt first,
                       KeyEqual key_equal)
 {
   typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
-  extern __shared__ typename BlockReduce::TempStorage temp_submap_storage[];
   __shared__ typename BlockReduce::TempStorage temp_storage;
+  extern __shared__ unsigned long long submap_block_num_successes[];
 
   std::size_t thread_num_successes = 0;
 
-  // TODO: find permanent solution (only works for four submaps)
-  std::size_t submap_thread_num_successes[4] = {0, 0, 0, 0};
+  auto tid  = block_size * blockIdx.x + threadIdx.x;
+  auto it   = first + tid;
+
+  if(num_submaps > 1) {
+    for(int i = threadIdx.x; i < num_submaps; i += block_size)
+      submap_block_num_successes[i] = 0;
+    __syncthreads();
+  
+    while (it < last) {
+      int i;
+      for (i = 0; i < num_submaps; ++i) {
+        if (submap_mutable_views[i].erase(*it, hash, key_equal)) {
+          thread_num_successes++;
+          atomicAdd(&submap_block_num_successes[i], 1);
+          break;
+        }
+      }
+      it += gridDim.x * blockDim.x;
+    }
+  } else {
+    while (it < last) {
+      if(submap_mutable_views[0].erase(*it, hash, key_equal))
+        thread_num_successes++;
+      it += gridDim.x * blockDim.x;
+    }
+  }
+
+  std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
+  if (threadIdx.x == 0) {
+    num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed);
+  }
+
+  if(num_submaps > 1) {
+    for(int i = 0; i < num_submaps; ++i) {
+      if(threadIdx.x == 0) {
+        submap_num_successes[i]->fetch_add(
+          static_cast<std::size_t>(submap_block_num_successes[i]), cuda::std::memory_order_relaxed);
+      }
+    }
+  }
+}
+
+template <uint32_t block_size,
+          uint32_t tile_size,
+          typename pair_type,
+          typename InputIt,
+          typename viewT,
+          typename mutableViewT,
+          typename atomicT,
+          typename Hash,
+          typename KeyEqual>
+__global__ void erase(InputIt first,
+                       InputIt last,
+                       viewT* submap_views,
+                       mutableViewT* submap_mutable_views,
+                       atomicT* num_successes,
+                       atomicT** submap_num_successes,
+                       const uint32_t num_submaps,
+                       Hash hash,
+                       KeyEqual key_equal)
+{
+  typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  extern __shared__ unsigned long long submap_block_num_successes[];
+
+  std::size_t thread_num_successes = 0;
 
   auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
   auto tid  = block_size * blockIdx.x + threadIdx.x;
   auto it   = first + tid / tile_size;
 
-  while (it < last) {
-    auto erased = false;
-
-    // manually check for duplicates in those submaps we are not inserting into
-    int i;
-    for (i = 0; i < num_submaps; ++i) {
-      erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal);
-      if (erased) { break; }
-    }
-    if (erased && tile.thread_rank() == 0) {
-      thread_num_successes++;
-      submap_thread_num_successes[i]++;
+  if(num_submaps > 1) {
+    for(int i = threadIdx.x; i < num_submaps; i += block_size)
+      submap_block_num_successes[i] = 0;
+    __syncthreads();
+  
+    while (it < last) {
+      auto erased     = false;
+      int i;
+      for (i = 0; i < num_submaps; ++i) {
+        erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal);
+        if (erased) { break; }
+      }
+      if (erased && tile.thread_rank() == 0) {
+        thread_num_successes++;
+        atomicAdd(&submap_block_num_successes[i], 1);
+      }
+      it += (gridDim.x * blockDim.x) / tile_size;
     }
+  } else {
+    while (it < last) {
+      auto erased = submap_mutable_views[0].erase(tile, *it, hash, key_equal);
+      if (erased && tile.thread_rank() == 0)
+        thread_num_successes++;
 
-    it += (gridDim.x * blockDim.x) / tile_size;
+      it += (gridDim.x * blockDim.x) / tile_size;
+    }
   }
 
   std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
@@ -240,14 +314,12 @@ __global__ void erase(InputIt first,
     num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed);
   }
 
-  // TODO: if there's only one submap, skip this step
-  // update submap thread counts
-  for (int i = 0; i < num_submaps; ++i) {
-    std::size_t submap_block_num_successes =
-      BlockReduce(temp_submap_storage[i]).Sum(submap_thread_num_successes[i]);
-    if (threadIdx.x == 0) {
-      submap_num_successes[i]->fetch_add(submap_block_num_successes,
-                                         cuda::std::memory_order_relaxed);
+  if(num_submaps > 1) {
+    for(int i = 0; i < num_submaps; ++i) {
+      if(threadIdx.x == 0) {
+        submap_num_successes[i]->fetch_add(
+          static_cast<std::size_t>(submap_block_num_successes[i]), cuda::std::memory_order_relaxed);
+      }
     }
   }
 }
diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index 35eb4898f..bb197f6dc 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -19,11 +19,11 @@
 
 #include <cooperative_groups.h>
 #include <cub/cub.cuh>
-#include <cuco/detail/dynamic_map_kernels.cuh>
 #include <cuco/detail/error.hpp>
 #include <cuco/sentinel.cuh>
 #include <cuco/static_map.cuh>
 #include <cuda/std/atomic>
+#include <cuco/detail/dynamic_map_kernels.cuh>
 #include <thrust/device_vector.h>
 #include <thrust/functional.h>
 
@@ -287,9 +287,9 @@ class dynamic_map {
     submap_mutable_views_;          ///< vector of mutable device views for each submap
   std::size_t min_insert_size_{};   ///< min remaining capacity of submap for insert
   atomic_ctr_type* num_successes_;  ///< number of successfully inserted keys on insert
-  std::vector<atomic_ctr_type*>
-    submap_num_successes_;  ///< number of succesfully erased keys for each submap
-  Allocator alloc_{};       ///< Allocator passed to submaps to allocate their device storage
+  std::vector<atomic_ctr_type*> submap_num_successes_; ///< number of succesfully erased keys for each submap
+  thrust::device_vector<atomic_ctr_type*> d_submap_num_successes_; 
+  Allocator alloc_{};  ///< Allocator passed to submaps to allocate their device storage
   counter_allocator_type counter_allocator_{};  ///< Allocator used to allocate `num_successes_`
 };
 }  // namespace cuco
diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu
index 64e4cce38..e5753b544 100644
--- a/tests/dynamic_map/erase_test.cu
+++ b/tests/dynamic_map/erase_test.cu
@@ -107,11 +107,9 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t))
     map.insert(pairs_begin2, pairs_begin2 + 4 * num_keys);
 
     // map should resize twice if the erased slots are successfully reused
-    REQUIRE(map.get_capacity() == 8 * num_keys);
-
+    REQUIRE(map.get_capacity() == 8*num_keys);
     // check that keys can be successfully deleted from only the first and second submaps
-    map.erase(d_keys2.begin(), d_keys2.begin() + 2 * num_keys);
-
+    map.erase(d_keys2.begin(), d_keys2.begin() + 2*num_keys);
     map.contains(d_keys2.begin(), d_keys2.end(), d_keys_exist2.begin());
 
     REQUIRE(cuco::test::none_of(d_keys_exist2.begin(),
@@ -122,10 +120,8 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t))
                                d_keys_exist2.end(),
                                [] __device__(const bool key_found) { return key_found; }));
 
-    REQUIRE(map.get_size() == 2 * num_keys);
-
-    // check that keys can be successfully deleted from all submaps (some will be unsuccessful
-    // erases)
+    REQUIRE(map.get_size() == 2*num_keys);
+    // check that keys can be successfully deleted from all submaps (some will be unsuccessful erases)
     map.erase(d_keys2.begin(), d_keys2.end());
 
     map.contains(d_keys2.begin(), d_keys2.end(), d_keys_exist2.begin());

From c146f9d92e0988b76f442a4d7abd4354a5140aaa Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@nvidia.com>
Date: Fri, 29 Apr 2022 17:01:25 -0700
Subject: [PATCH 13/36] doc improvements

---
 benchmarks/hash_table/dynamic_map_bench.cu  | 82 +--------------------
 include/cuco/detail/dynamic_map.inl         |  1 -
 include/cuco/detail/dynamic_map_kernels.cuh | 55 +++++++++++++-
 include/cuco/detail/static_map_kernels.cuh  | 41 +++++++++++
 include/cuco/dynamic_map.cuh                | 61 ++++++++++++++-
 5 files changed, 152 insertions(+), 88 deletions(-)

diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu
index e150c02be..3a846b23d 100644
--- a/benchmarks/hash_table/dynamic_map_bench.cu
+++ b/benchmarks/hash_table/dynamic_map_bench.cu
@@ -259,7 +259,7 @@ static void BM_dynamic_erase_none(::benchmark::State& state)
   state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
                           int64_t(state.range(0)));
 }
-/*
+
 BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
@@ -269,84 +269,8 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
-*/
-BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIQUE)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(gen_final_size)
-  ->UseManualTime();
-/*
-BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(gen_final_size)
-  ->UseManualTime();
-
-BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIFORM)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(gen_final_size)
-  ->UseManualTime();
-
-BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::GAUSSIAN)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(gen_final_size)
-  ->UseManualTime();
-
-BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::GAUSSIAN)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(gen_final_size)
-  ->UseManualTime();
-
-BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::GAUSSIAN)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(gen_final_size)
-  ->UseManualTime();
-
-BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIQUE)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(gen_final_size)
-  ->UseManualTime();
 
-BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIQUE)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(gen_final_size)
-  ->UseManualTime();
-*/
-BENCHMARK_TEMPLATE(BM_dynamic_erase_none, int32_t, int32_t, dist_type::UNIQUE)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(gen_final_size)
-  ->UseManualTime();
-/*
-BENCHMARK_TEMPLATE(BM_dynamic_erase_none, int32_t, int32_t, dist_type::GAUSSIAN)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(gen_final_size)
-  ->UseManualTime();
-
-BENCHMARK_TEMPLATE(BM_dynamic_search_none, int32_t, int32_t, dist_type::GAUSSIAN)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(gen_final_size)
-  ->UseManualTime();
-
-BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIQUE)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(gen_final_size)
-  ->UseManualTime();
-
-BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIFORM)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(gen_final_size)
-  ->UseManualTime();
-
-BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIFORM)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(gen_final_size)
-  ->UseManualTime();
-
-BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::GAUSSIAN)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(gen_final_size)
-  ->UseManualTime();
-
-BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::GAUSSIAN)
+BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
-  ->UseManualTime();
-*/
\ No newline at end of file
+  ->UseManualTime();
\ No newline at end of file
diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index c97622433..c648c7029 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -203,7 +203,6 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
     <<<grid_size, block_size, temp_storage_size>>>(
       first,
       first + num_keys,
-      submap_views_.data().get(),
       submap_mutable_views_.data().get(),
       num_successes_,
       d_submap_num_successes_.data().get(),
diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
index fbc7f9e35..6614cfe28 100644
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
@@ -186,17 +186,40 @@ __global__ void insert(InputIt first,
   if (threadIdx.x == 0) { *num_successes += block_num_successes; }
 }
 
+/**
+ * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`.
+ *
+ * If the key `*(first + i)` exists in the map, its slot is erased and made available for future
+ * insertions.
+ * Else, no effect.
+ * @tparam block_size The size of the thread block
+ * @tparam pair_type Type of the pairs contained in the map
+ * @tparam InputIt Device accessible input iterator whose `value_type` is
+ * convertible to the map's `key_type`
+ * @tparam mutableViewT Type of device view allowing modification of hash map storage
+ * @tparam atomicT Type of atomic storage
+ * @tparam Hash Unary callable type
+ * @tparam KeyEqual Binary callable type
+ * @param first Beginning of the sequence of keys
+ * @param last End of the sequence of keys
+ * @param submap_mutable_views Array of `static_map::mutable_device_view` objects used to
+ * perform `erase` operations on each underlying `static_map`
+ * @param num_successes The number of successfully erased key/value pairs
+ * @param submap_num_successes The number of successfully erased key/value pairs
+ * in each submap
+ * @param num_submaps The number of submaps in the map
+ * @param hash The unary function to apply to hash each key
+ * @param key_equal The binary function to compare two keys for equality
+ */
 template <uint32_t block_size,
           typename pair_type,
           typename InputIt,
-          typename viewT,
           typename mutableViewT,
           typename atomicT,
           typename Hash,
           typename KeyEqual>
 __global__ void erase(InputIt first,
                       InputIt last,
-                      viewT* submap_views,
                       mutableViewT* submap_mutable_views,
                       atomicT* num_successes,
                       atomicT** submap_num_successes,
@@ -252,18 +275,42 @@ __global__ void erase(InputIt first,
   }
 }
 
+/**
+ * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`.
+ *
+ * If the key `*(first + i)` exists in the map, its slot is erased and made available for future
+ * insertions.
+ * Else, no effect.
+ * @tparam block_size The size of the thread block
+ * @tparam tile_size The number of threads in the Cooperative Groups used to perform erase
+ * @tparam pair_type Type of the pairs contained in the map
+ * @tparam InputIt Device accessible input iterator whose `value_type` is
+ * convertible to the map's `key_type`
+ * @tparam mutableViewT Type of device view allowing modification of hash map storage
+ * @tparam atomicT Type of atomic storage
+ * @tparam Hash Unary callable type
+ * @tparam KeyEqual Binary callable type
+ * @param first Beginning of the sequence of keys
+ * @param last End of the sequence of keys
+ * @param submap_mutable_views Array of `static_map::mutable_device_view` objects used to
+ * perform `erase` operations on each underlying `static_map`
+ * @param num_successes The number of successfully erased key/value pairs
+ * @param submap_num_successes The number of successfully erased key/value pairs
+ * in each submap
+ * @param num_submaps The number of submaps in the map
+ * @param hash The unary function to apply to hash each key
+ * @param key_equal The binary function to compare two keys for equality
+ */
 template <uint32_t block_size,
           uint32_t tile_size,
           typename pair_type,
           typename InputIt,
-          typename viewT,
           typename mutableViewT,
           typename atomicT,
           typename Hash,
           typename KeyEqual>
 __global__ void erase(InputIt first,
                        InputIt last,
-                       viewT* submap_views,
                        mutableViewT* submap_mutable_views,
                        atomicT* num_successes,
                        atomicT** submap_num_successes,
diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh
index 7a3ca0dfa..2ebcd4c91 100644
--- a/include/cuco/detail/static_map_kernels.cuh
+++ b/include/cuco/detail/static_map_kernels.cuh
@@ -163,6 +163,26 @@ __global__ void insert(
   if (threadIdx.x == 0) { *num_successes += block_num_successes; }
 }
 
+/**
+ * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`.
+ *
+ * If the key `*(first + i)` exists in the map, its slot is erased and made available for future
+ * insertions.
+ * Else, no effect.
+ * @tparam block_size The size of the thread block
+ * @tparam InputIt Device accessible input iterator whose `value_type` is
+ * convertible to the map's `key_type`
+ * @tparam atomicT Type of atomic storage
+ * @tparam viewT Type of device view allowing access of hash map storage
+ * @tparam Hash Unary callable type
+ * @tparam KeyEqual Binary callable type
+ * @param first Beginning of the sequence of keys
+ * @param last End of the sequence of keys
+ * @param num_successes The number of successfully erased key/value pairs
+ * @param view Device view used to access the hash map's slot storage
+ * @param hash The unary function to apply to hash each key
+ * @param key_equal The binary function to compare two keys for equality
+ */
 template <std::size_t block_size,
           typename InputIt,
           typename atomicT,
@@ -192,6 +212,27 @@ __global__ void erase(
   }
 }
 
+/**
+ * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`.
+ *
+ * If the key `*(first + i)` exists in the map, its slot is erased and made available for future
+ * insertions.
+ * Else, no effect.
+ * @tparam block_size The size of the thread block
+ * @tparam tile_size The number of threads in the Cooperative Groups used to perform erase
+ * @tparam InputIt Device accessible input iterator whose `value_type` is
+ * convertible to the map's `key_type`
+ * @tparam atomicT Type of atomic storage
+ * @tparam viewT Type of device view allowing access of hash map storage
+ * @tparam Hash Unary callable type
+ * @tparam KeyEqual Binary callable type
+ * @param first Beginning of the sequence of keys
+ * @param last End of the sequence of keys
+ * @param num_successes The number of successfully erased key/value pairs
+ * @param view Device view used to access the hash map's slot storage
+ * @param hash The unary function to apply to hash each key
+ * @param key_equal The binary function to compare two keys for equality
+ */
 template <std::size_t block_size,
           uint32_t tile_size,
           typename InputIt,
diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index bb197f6dc..0efd87f4b 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -147,7 +147,31 @@ class dynamic_map {
               sentinel::empty_key<Key> empty_key_sentinel,
               sentinel::empty_value<Value> empty_value_sentinel,
               Allocator const& alloc = Allocator{});
-
+  
+  /**
+   * @brief Construct a dynamically-sized map with erase capability.
+   *
+   * The capacity of the map will automatically increase as the user adds key/value pairs using
+   * `insert`.
+   *
+   * Capacity increases by a factor of growth_factor each time the size of the map exceeds a
+   * threshold occupancy. The performance of `find` and `contains` decreases somewhat each time the
+   * map's capacity grows.
+   *
+   * The `empty_key_sentinel` and `empty_value_sentinel` values are reserved and
+   * undefined behavior results from attempting to insert any key/value pair
+   * that contains either.
+   *
+   * @param initial_capacity The initial number of slots in the map
+   * @param growth_factor The factor by which the capacity increases when resizing
+   * @param empty_key_sentinel The reserved key value for empty slots
+   * @param empty_value_sentinel The reserved mapped value for empty slots
+   * @param erased_key_sentinel The reserved key value for erased slots
+   * @param alloc Allocator used to allocate submap device storage
+   *
+   * @throw std::runtime_error if the empty key sentinel and erased key sentinel
+   * are the same value
+   */
   dynamic_map(std::size_t initial_capacity,
               sentinel::empty_key<Key> empty_key_sentinel,
               sentinel::empty_value<Value> empty_value_sentinel,
@@ -188,7 +212,36 @@ class dynamic_map {
             typename Hash     = cuco::detail::MurmurHash3_32<key_type>,
             typename KeyEqual = thrust::equal_to<key_type>>
   void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{});
-
+  
+  /**
+   * @brief Erases keys in the range `[first, last)`.
+   *
+   * For each key `k` in `[first, last)`, if `contains(k) == true`, removes `k` and its
+   * associated value from the map. Else, no effect.
+   *
+   *  Side-effects:
+   *  - `contains(k) == false`
+   *  - `find(k) == end()`
+   *  - `insert({k,v}) == true`
+   *  - `get_size()` is reduced by the total number of erased keys
+   *
+   * This function synchronizes `stream`.
+   *
+   * Keep in mind that `erase` does not cause the map to shrink its memory allocation.
+   *
+   * @tparam InputIt Device accessible input iterator whose `value_type` is
+   * convertible to the map's `key_type`
+   * @tparam Hash Unary callable type
+   * @tparam KeyEqual Binary callable type
+   * @param first Beginning of the sequence of keys
+   * @param last End of the sequence of keys
+   * @param hash The unary function to apply to hash each key
+   * @param key_equal The binary function to compare two keys for equality
+   * @param stream Stream used for executing the kernels
+   *
+   * @throw std::runtime_error if a unique erased key sentinel value was not
+   * provided at construction
+   */
   template <typename InputIt,
             typename Hash     = cuco::detail::MurmurHash3_32<key_type>,
             typename KeyEqual = thrust::equal_to<key_type>>
@@ -273,7 +326,7 @@ class dynamic_map {
  private:
   key_type empty_key_sentinel_{};       ///< Key value that represents an empty slot
   mapped_type empty_value_sentinel_{};  ///< Initial value of empty slot
-  key_type erased_key_sentinel_{};
+  key_type erased_key_sentinel_{};      ///< Key value that represents an erased slot
 
   // TODO: initialize this
   std::size_t size_{};       ///< Number of keys in the map
@@ -288,7 +341,7 @@ class dynamic_map {
   std::size_t min_insert_size_{};   ///< min remaining capacity of submap for insert
   atomic_ctr_type* num_successes_;  ///< number of successfully inserted keys on insert
   std::vector<atomic_ctr_type*> submap_num_successes_; ///< number of succesfully erased keys for each submap
-  thrust::device_vector<atomic_ctr_type*> d_submap_num_successes_; 
+  thrust::device_vector<atomic_ctr_type*> d_submap_num_successes_; ///< device-side number of successfully erased keys for each submap
   Allocator alloc_{};  ///< Allocator passed to submaps to allocate their device storage
   counter_allocator_type counter_allocator_{};  ///< Allocator used to allocate `num_successes_`
 };

From faf82240f287bd382268a985a4872295e96b5b4f Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@nvidia.com>
Date: Wed, 31 Aug 2022 11:37:26 -0700
Subject: [PATCH 14/36] warning fixes

---
 benchmarks/hash_table/dynamic_map_bench.cu | 6 +++---
 include/cuco/detail/dynamic_map.inl        | 4 ++--
 tests/dynamic_map/erase_test.cu            | 2 ++
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu
index 3a846b23d..079018005 100644
--- a/benchmarks/hash_table/dynamic_map_bench.cu
+++ b/benchmarks/hash_table/dynamic_map_bench.cu
@@ -187,7 +187,7 @@ static void BM_dynamic_erase_all(::benchmark::State& state)
 
   generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
 
-  for (auto i = 0; i < num_keys; ++i) {
+  for (uint32_t i = 0; i < num_keys; ++i) {
     Key key           = h_keys[i];
     Value val         = h_keys[i];
     h_pairs[i].first  = key;
@@ -203,12 +203,12 @@ static void BM_dynamic_erase_all(::benchmark::State& state)
                  cuco::sentinel::empty_key<Key>{-1},
                  cuco::sentinel::empty_value<Value>{-1},
                  cuco::sentinel::erased_key<Key>{-2}};
-    for (auto i = 0; i < num_keys; i += batch_size) {
+    for (uint32_t i = 0; i < num_keys; i += batch_size) {
       map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size);
     }
     {
       cuda_event_timer raii{state};
-      for (auto i = 0; i < num_keys; i += batch_size) {
+      for (uint32_t i = 0; i < num_keys; i += batch_size) {
         map.erase(d_keys.begin() + i, d_keys.begin() + i + batch_size);
       }
     }
diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index c648c7029..97a628d15 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -192,7 +192,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
   // zero out submap success counters
   if(submaps_.size() > 1) {
     static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
-    for(int i = 0; i < submaps_.size(); ++i) {
+    for(uint32_t i = 0; i < submaps_.size(); ++i) {
       CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type)));
     }
   }
@@ -219,7 +219,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
   if(submaps_.size() == 1) {
     submaps_[0]->size_ -= h_num_successes;
   } else {
-    for(int i = 0; i < submaps_.size(); ++i) {
+    for(uint32_t i = 0; i < submaps_.size(); ++i) {
       std::size_t h_submap_num_successes;
       CUCO_CUDA_TRY(cudaMemcpy(
         &h_submap_num_successes, submap_num_successes_[i], sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost));
diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu
index e5753b544..2254523c7 100644
--- a/tests/dynamic_map/erase_test.cu
+++ b/tests/dynamic_map/erase_test.cu
@@ -16,6 +16,8 @@
 
 #include <catch2/catch.hpp>
 #include <thrust/device_vector.h>
+#include <thrust/sequence.h>
+#include <thrust/execution_policy.h>
 
 #include <cuco/dynamic_map.cuh>
 

From e4b548e954ec60f7bee1a849e9a935733ca6584b Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 31 Aug 2022 18:38:09 +0000
Subject: [PATCH 15/36] [pre-commit.ci] auto code formatting

---
 benchmarks/hash_table/static_map_bench.cu   |  1 -
 include/cuco/detail/dynamic_map.inl         | 35 +++++++------
 include/cuco/detail/dynamic_map_kernels.cuh | 58 ++++++++++-----------
 include/cuco/dynamic_map.cuh                | 17 +++---
 tests/dynamic_map/erase_test.cu             | 11 ++--
 tests/dynamic_map/unique_sequence_test.cu   |  5 +-
 6 files changed, 63 insertions(+), 64 deletions(-)

diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu
index 1e69c0c4e..ce1015b8d 100644
--- a/benchmarks/hash_table/static_map_bench.cu
+++ b/benchmarks/hash_table/static_map_bench.cu
@@ -346,4 +346,3 @@ BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(generate_size_and_occupancy)
   ->UseManualTime();
-
diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index 97a628d15..0e0020e97 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -190,39 +190,40 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
   CUCO_CUDA_TRY(cudaMemset(num_successes_, 0, sizeof(atomic_ctr_type)));
 
   // zero out submap success counters
-  if(submaps_.size() > 1) {
+  if (submaps_.size() > 1) {
     static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
-    for(uint32_t i = 0; i < submaps_.size(); ++i) {
+    for (uint32_t i = 0; i < submaps_.size(); ++i) {
       CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type)));
     }
   }
-  
+
   auto const temp_storage_size = submaps_.size() * sizeof(unsigned long long);
 
   detail::erase<block_size, tile_size, cuco::pair_type<key_type, mapped_type>>
-    <<<grid_size, block_size, temp_storage_size>>>(
-      first,
-      first + num_keys,
-      submap_mutable_views_.data().get(),
-      num_successes_,
-      d_submap_num_successes_.data().get(),
-      submaps_.size(),
-      hash,
-      key_equal);
+    <<<grid_size, block_size, temp_storage_size>>>(first,
+                                                   first + num_keys,
+                                                   submap_mutable_views_.data().get(),
+                                                   num_successes_,
+                                                   d_submap_num_successes_.data().get(),
+                                                   submaps_.size(),
+                                                   hash,
+                                                   key_equal);
 
   // update total dynamic map size
   std::size_t h_num_successes;
   CUCO_CUDA_TRY(
     cudaMemcpy(&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost));
   size_ -= h_num_successes;
-  
-  if(submaps_.size() == 1) {
+
+  if (submaps_.size() == 1) {
     submaps_[0]->size_ -= h_num_successes;
   } else {
-    for(uint32_t i = 0; i < submaps_.size(); ++i) {
+    for (uint32_t i = 0; i < submaps_.size(); ++i) {
       std::size_t h_submap_num_successes;
-      CUCO_CUDA_TRY(cudaMemcpy(
-        &h_submap_num_successes, submap_num_successes_[i], sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost));
+      CUCO_CUDA_TRY(cudaMemcpy(&h_submap_num_successes,
+                               submap_num_successes_[i],
+                               sizeof(atomic_ctr_type),
+                               cudaMemcpyDeviceToHost));
       submaps_[i]->size_ -= h_submap_num_successes;
     }
   }
diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
index 6614cfe28..913149021 100644
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
@@ -233,14 +233,14 @@ __global__ void erase(InputIt first,
 
   std::size_t thread_num_successes = 0;
 
-  auto tid  = block_size * blockIdx.x + threadIdx.x;
-  auto it   = first + tid;
+  auto tid = block_size * blockIdx.x + threadIdx.x;
+  auto it  = first + tid;
 
-  if(num_submaps > 1) {
-    for(int i = threadIdx.x; i < num_submaps; i += block_size)
+  if (num_submaps > 1) {
+    for (int i = threadIdx.x; i < num_submaps; i += block_size)
       submap_block_num_successes[i] = 0;
     __syncthreads();
-  
+
     while (it < last) {
       int i;
       for (i = 0; i < num_submaps; ++i) {
@@ -254,8 +254,7 @@ __global__ void erase(InputIt first,
     }
   } else {
     while (it < last) {
-      if(submap_mutable_views[0].erase(*it, hash, key_equal))
-        thread_num_successes++;
+      if (submap_mutable_views[0].erase(*it, hash, key_equal)) thread_num_successes++;
       it += gridDim.x * blockDim.x;
     }
   }
@@ -265,11 +264,11 @@ __global__ void erase(InputIt first,
     num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed);
   }
 
-  if(num_submaps > 1) {
-    for(int i = 0; i < num_submaps; ++i) {
-      if(threadIdx.x == 0) {
-        submap_num_successes[i]->fetch_add(
-          static_cast<std::size_t>(submap_block_num_successes[i]), cuda::std::memory_order_relaxed);
+  if (num_submaps > 1) {
+    for (int i = 0; i < num_submaps; ++i) {
+      if (threadIdx.x == 0) {
+        submap_num_successes[i]->fetch_add(static_cast<std::size_t>(submap_block_num_successes[i]),
+                                           cuda::std::memory_order_relaxed);
       }
     }
   }
@@ -310,13 +309,13 @@ template <uint32_t block_size,
           typename Hash,
           typename KeyEqual>
 __global__ void erase(InputIt first,
-                       InputIt last,
-                       mutableViewT* submap_mutable_views,
-                       atomicT* num_successes,
-                       atomicT** submap_num_successes,
-                       const uint32_t num_submaps,
-                       Hash hash,
-                       KeyEqual key_equal)
+                      InputIt last,
+                      mutableViewT* submap_mutable_views,
+                      atomicT* num_successes,
+                      atomicT** submap_num_successes,
+                      const uint32_t num_submaps,
+                      Hash hash,
+                      KeyEqual key_equal)
 {
   typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
@@ -328,13 +327,13 @@ __global__ void erase(InputIt first,
   auto tid  = block_size * blockIdx.x + threadIdx.x;
   auto it   = first + tid / tile_size;
 
-  if(num_submaps > 1) {
-    for(int i = threadIdx.x; i < num_submaps; i += block_size)
+  if (num_submaps > 1) {
+    for (int i = threadIdx.x; i < num_submaps; i += block_size)
       submap_block_num_successes[i] = 0;
     __syncthreads();
-  
+
     while (it < last) {
-      auto erased     = false;
+      auto erased = false;
       int i;
       for (i = 0; i < num_submaps; ++i) {
         erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal);
@@ -349,8 +348,7 @@ __global__ void erase(InputIt first,
   } else {
     while (it < last) {
       auto erased = submap_mutable_views[0].erase(tile, *it, hash, key_equal);
-      if (erased && tile.thread_rank() == 0)
-        thread_num_successes++;
+      if (erased && tile.thread_rank() == 0) thread_num_successes++;
 
       it += (gridDim.x * blockDim.x) / tile_size;
     }
@@ -361,11 +359,11 @@ __global__ void erase(InputIt first,
     num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed);
   }
 
-  if(num_submaps > 1) {
-    for(int i = 0; i < num_submaps; ++i) {
-      if(threadIdx.x == 0) {
-        submap_num_successes[i]->fetch_add(
-          static_cast<std::size_t>(submap_block_num_successes[i]), cuda::std::memory_order_relaxed);
+  if (num_submaps > 1) {
+    for (int i = 0; i < num_submaps; ++i) {
+      if (threadIdx.x == 0) {
+        submap_num_successes[i]->fetch_add(static_cast<std::size_t>(submap_block_num_successes[i]),
+                                           cuda::std::memory_order_relaxed);
       }
     }
   }
diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index 0efd87f4b..f34eb3d86 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -16,14 +16,13 @@
 
 #pragma once
 
-
 #include <cooperative_groups.h>
 #include <cub/cub.cuh>
+#include <cuco/detail/dynamic_map_kernels.cuh>
 #include <cuco/detail/error.hpp>
 #include <cuco/sentinel.cuh>
 #include <cuco/static_map.cuh>
 #include <cuda/std/atomic>
-#include <cuco/detail/dynamic_map_kernels.cuh>
 #include <thrust/device_vector.h>
 #include <thrust/functional.h>
 
@@ -110,7 +109,7 @@ class dynamic_map {
   using mutable_view_type = typename static_map<Key, Value, Scope>::device_mutable_view;
   using counter_allocator_type =
     typename std::allocator_traits<Allocator>::rebind_alloc<atomic_ctr_type>;
-  
+
   dynamic_map(dynamic_map const&) = delete;
   dynamic_map(dynamic_map&&)      = delete;
 
@@ -147,7 +146,7 @@ class dynamic_map {
               sentinel::empty_key<Key> empty_key_sentinel,
               sentinel::empty_value<Value> empty_value_sentinel,
               Allocator const& alloc = Allocator{});
-  
+
   /**
    * @brief Construct a dynamically-sized map with erase capability.
    *
@@ -212,7 +211,7 @@ class dynamic_map {
             typename Hash     = cuco::detail::MurmurHash3_32<key_type>,
             typename KeyEqual = thrust::equal_to<key_type>>
   void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{});
-  
+
   /**
    * @brief Erases keys in the range `[first, last)`.
    *
@@ -340,9 +339,11 @@ class dynamic_map {
     submap_mutable_views_;          ///< vector of mutable device views for each submap
   std::size_t min_insert_size_{};   ///< min remaining capacity of submap for insert
   atomic_ctr_type* num_successes_;  ///< number of successfully inserted keys on insert
-  std::vector<atomic_ctr_type*> submap_num_successes_; ///< number of succesfully erased keys for each submap
-  thrust::device_vector<atomic_ctr_type*> d_submap_num_successes_; ///< device-side number of successfully erased keys for each submap
-  Allocator alloc_{};  ///< Allocator passed to submaps to allocate their device storage
+  std::vector<atomic_ctr_type*>
+    submap_num_successes_;  ///< number of successfully erased keys for each submap
+  thrust::device_vector<atomic_ctr_type*>
+    d_submap_num_successes_;  ///< device-side number of successfully erased keys for each submap
+  Allocator alloc_{};         ///< Allocator passed to submaps to allocate their device storage
   counter_allocator_type counter_allocator_{};  ///< Allocator used to allocate `num_successes_`
 };
 }  // namespace cuco
diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu
index 2254523c7..1c81f400a 100644
--- a/tests/dynamic_map/erase_test.cu
+++ b/tests/dynamic_map/erase_test.cu
@@ -16,8 +16,8 @@
 
 #include <catch2/catch.hpp>
 #include <thrust/device_vector.h>
-#include <thrust/sequence.h>
 #include <thrust/execution_policy.h>
+#include <thrust/sequence.h>
 
 #include <cuco/dynamic_map.cuh>
 
@@ -109,9 +109,9 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t))
     map.insert(pairs_begin2, pairs_begin2 + 4 * num_keys);
 
     // map should resize twice if the erased slots are successfully reused
-    REQUIRE(map.get_capacity() == 8*num_keys);
+    REQUIRE(map.get_capacity() == 8 * num_keys);
     // check that keys can be successfully deleted from only the first and second submaps
-    map.erase(d_keys2.begin(), d_keys2.begin() + 2*num_keys);
+    map.erase(d_keys2.begin(), d_keys2.begin() + 2 * num_keys);
     map.contains(d_keys2.begin(), d_keys2.end(), d_keys_exist2.begin());
 
     REQUIRE(cuco::test::none_of(d_keys_exist2.begin(),
@@ -122,8 +122,9 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t))
                                d_keys_exist2.end(),
                                [] __device__(const bool key_found) { return key_found; }));
 
-    REQUIRE(map.get_size() == 2*num_keys);
-    // check that keys can be successfully deleted from all submaps (some will be unsuccessful erases)
+    REQUIRE(map.get_size() == 2 * num_keys);
+    // check that keys can be successfully deleted from all submaps (some will be unsuccessful
+    // erases)
     map.erase(d_keys2.begin(), d_keys2.end());
 
     map.contains(d_keys2.begin(), d_keys2.end(), d_keys_exist2.begin());
diff --git a/tests/dynamic_map/unique_sequence_test.cu b/tests/dynamic_map/unique_sequence_test.cu
index 24a2041aa..fea8de53d 100644
--- a/tests/dynamic_map/unique_sequence_test.cu
+++ b/tests/dynamic_map/unique_sequence_test.cu
@@ -39,9 +39,8 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys",
 {
   constexpr std::size_t num_keys{50'000'000};
 
-  cuco::dynamic_map<Key, Value> map{30'000'000, 
-    cuco::sentinel::empty_key<Key>{-1}, 
-    cuco::sentinel::empty_value<Value>{-1}};
+  cuco::dynamic_map<Key, Value> map{
+    30'000'000, cuco::sentinel::empty_key<Key>{-1}, cuco::sentinel::empty_value<Value>{-1}};
 
   thrust::device_vector<Key> d_keys(num_keys);
   thrust::device_vector<Value> d_values(num_keys);

From 93b79837db16cfb5ff85d3552db260bae78273bf Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@nvidia.com>
Date: Wed, 31 Aug 2022 13:13:20 -0700
Subject: [PATCH 16/36] removed nvtx file

---
 include/cuco/detail/nvtx3.hpp | 2071 ---------------------------------
 1 file changed, 2071 deletions(-)
 delete mode 100644 include/cuco/detail/nvtx3.hpp

diff --git a/include/cuco/detail/nvtx3.hpp b/include/cuco/detail/nvtx3.hpp
deleted file mode 100644
index 075c6e5d4..000000000
--- a/include/cuco/detail/nvtx3.hpp
+++ /dev/null
@@ -1,2071 +0,0 @@
-/*
- *  Copyright (c) 2020, NVIDIA CORPORATION.
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-#pragma once
-
-#if defined(NVTX3_MINOR_VERSION) and NVTX3_MINOR_VERSION < 0
-#error \
-  "Trying to #include NVTX version 3 in a source file where an older NVTX version has already been included.  If you are not directly using NVTX (the NVIDIA Tools Extension library), you are getting this error because libraries you are using have included different versions of NVTX.  Suggested solutions are: (1) reorder #includes so the newest NVTX version is included first, (2) avoid using the conflicting libraries in the same .c/.cpp file, or (3) update the library using the older NVTX version to use the newer version instead."
-#endif
-
-/**
- * @brief Semantic minor version number.
- *
- * Major version number is hardcoded into the "nvtx3" namespace/prefix.
- *
- * If this value is incremented, the above version include guard needs to be
- * updated.
- *
- */
-#define NVTX3_MINOR_VERSION 0
-
-#include <nvtx3/nvToolsExt.h>
-
-#include <string>
-
-/**
- * @file nvtx3.hpp
- *
- * @brief Provides C++ constructs making the NVTX library safer and easier to
- * use with zero overhead.
- */
-
-/**
- * \mainpage
- * \tableofcontents
- *
- * \section QUICK_START Quick Start
- *
- * To add NVTX ranges to your code, use the `nvtx3::thread_range` RAII object. A
- * range begins when the object is created, and ends when the object is
- * destroyed.
- *
- * \code{.cpp}
- * #include "nvtx3.hpp"
- * void some_function(){
- *    // Begins a NVTX range with the messsage "some_function"
- *    // The range ends when some_function() returns and `r` is destroyed
- *    nvtx3::thread_range r{"some_function"};
- *
- *    for(int i = 0; i < 6; ++i){
- *       nvtx3::thread_range loop{"loop range"};
- *       std::this_thread::sleep_for(std::chrono::seconds{1});
- *    }
- * } // Range ends when `r` is destroyed
- * \endcode
- *
- * The example code above generates the following timeline view in Nsight
- * Systems:
- *
- * \image html
- * https://raw.githubusercontent.com/jrhemstad/nvtx_wrappers/master/docs/example_range.png
- *
- * Alternatively, use the \ref MACROS like `NVTX3_FUNC_RANGE()` to add
- * ranges to your code that automatically use the name of the enclosing function
- * as the range's message.
- *
- * \code{.cpp}
- * #include "nvtx3.hpp"
- * void some_function(){
- *    // Creates a range with a message "some_function" that ends when the
- * enclosing
- *    // function returns
- *    NVTX3_FUNC_RANGE();
- *    ...
- * }
- * \endcode
- *
- *
- * \section Overview
- *
- * The NVTX library provides a set of functions for users to annotate their code
- * to aid in performance profiling and optimization. These annotations provide
- * information to tools like Nsight Systems to improve visualization of
- * application timelines.
- *
- * \ref RANGES are one of the most commonly used NVTX constructs for annotating
- * a span of time. For example, imagine a user wanted to see every time a
- * function, `my_function`, is called and how long it takes to execute. This can
- * be accomplished with an NVTX range created on the entry to the function and
- * terminated on return from `my_function` using the push/pop C APIs:
- *
- * ```
- * void my_function(...){
- *    nvtxRangePushA("my_function"); // Begins NVTX range
- *    // do work
- *    nvtxRangePop(); // Ends NVTX range
- * }
- * ```
- *
- * One of the challenges with using the NVTX C API is that it requires manually
- * terminating the end of the range with `nvtxRangePop`. This can be challenging
- * if `my_function()` has multiple returns or can throw exceptions as it
- * requires calling `nvtxRangePop()` before all possible return points.
- *
- * NVTX++ solves this inconvenience through the "RAII" technique by providing a
- * `nvtx3::thread_range` class that begins a range at construction and ends the
- * range on destruction. The above example then becomes:
- *
- * ```
- * void my_function(...){
- *    nvtx3::thread_range r{"my_function"}; // Begins NVTX range
- *    // do work
- * } // Range ends on exit from `my_function` when `r` is destroyed
- * ```
- *
- * The range object `r` is deterministically destroyed whenever `my_function`
- * returns---ending the NVTX range without manual intervention. For more
- * information, see \ref RANGES and `nvtx3::domain_thread_range`.
- *
- * Another inconvenience of the NVTX C APIs are the several constructs where the
- * user is expected to initialize an object at the beginning of an application
- * and reuse that object throughout the lifetime of the application. For example
- * Domains, Categories, and Registered messages.
- *
- * Example:
- * ```
- * nvtxDomainHandle_t D = nvtxDomainCreateA("my domain");
- * // Reuse `D` throughout the rest of the application
- * ```
- *
- * This can be problematic if the user application or library does not have an
- * explicit initialization function called before all other functions to
- * ensure that these long-lived objects are initialized before being used.
- *
- * NVTX++ makes use of the "construct on first use" technique to alleviate this
- * inconvenience. In short, a function local static object is constructed upon
- * the first invocation of a function and returns a reference to that object on
- * all future invocations. See the documentation for
- * `nvtx3::registered_message`, `nvtx3::domain`, `nvtx3::named_category`,  and
- * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use for more
- * information.
- *
- * Using construct on first use, the above example becomes:
- * ```
- * struct my_domain{ static constexpr char const* name{"my domain"}; };
- *
- * // The first invocation of `domain::get` for the type `my_domain` will
- * // construct a `nvtx3::domain` object and return a reference to it. Future
- * // invocations simply return a reference.
- * nvtx3::domain const& D = nvtx3::domain::get<my_domain>();
- * ```
- * For more information about NVTX and how it can be used, see
- * https://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvtx and
- * https://devblogs.nvidia.com/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/
- * for more information.
- *
- * \section RANGES Ranges
- *
- * Ranges are used to describe a span of time during the execution of an
- * application. Common examples are using ranges to annotate the time it takes
- * to execute a function or an iteration of a loop.
- *
- * NVTX++ uses RAII to automate the generation of ranges that are tied to the
- * lifetime of objects. Similar to `std::lock_guard` in the C++ Standard
- * Template Library.
- *
- * \subsection THREAD_RANGE Thread Range
- *
- * `nvtx3::domain_thread_range` is a class that begins a range upon construction
- * and ends the range at destruction. This is one of the most commonly used
- * constructs in NVTX++ and is useful for annotating spans of time on a
- * particular thread. These ranges can be nested to arbitrary depths.
- *
- * `nvtx3::thread_range` is an alias for a `nvtx3::domain_thread_range` in the
- * global NVTX domain. For more information about Domains, see \ref DOMAINS.
- *
- * Various attributes of a range can be configured constructing a
- * `nvtx3::domain_thread_range` with a `nvtx3::event_attributes` object. For
- * more information, see \ref ATTRIBUTES.
- *
- * Example:
- *
- * \code{.cpp}
- * void some_function(){
- *    // Creates a range for the duration of `some_function`
- *    nvtx3::thread_range r{};
- *
- *    while(true){
- *       // Creates a range for every loop iteration
- *       // `loop_range` is nested inside `r`
- *       nvtx3::thread_range loop_range{};
- *    }
- * }
- * \endcode
- *
- * \subsection PROCESS_RANGE Process Range
- *
- * `nvtx3::domain_process_range` is identical to `nvtx3::domain_thread_range`
- * with the exception that a `domain_process_range` can be created and destroyed
- * on different threads. This is useful to annotate spans of time that can
- * bridge multiple threads.
- *
- * `nvtx3::domain_thread_range`s should be preferred unless one needs the
- * ability to begin and end a range on different threads.
- *
- * \section MARKS Marks
- *
- * `nvtx3::mark` allows annotating an instantaneous event in an application's
- * timeline. For example, indicating when a mutex is locked or unlocked.
- *
- * \code{.cpp}
- * std::mutex global_lock;
- * void lock_mutex(){
- *    global_lock.lock();
- *    // Marks an event immediately after the mutex is locked
- *    nvtx3::mark<my_domain>("lock_mutex");
- * }
- * \endcode
- *
- * \section DOMAINS Domains
- *
- * Similar to C++ namespaces, Domains allow for scoping NVTX events. By default,
- * all NVTX events belong to the "global" domain. Libraries and applications
- * should scope their events to use a custom domain to differentiate where the
- * events originate from.
- *
- * It is common for a library or application to have only a single domain and
- * for the name of that domain to be known at compile time. Therefore, Domains
- * in NVTX++ are represented by _tag types_.
- *
- * For example, to define a custom  domain, simply define a new concrete type
- * (a `class` or `struct`) with a `static` member called `name` that contains
- * the desired name of the domain.
- *
- * ```
- * struct my_domain{ static constexpr char const* name{"my domain"}; };
- * ```
- *
- * For any NVTX++ construct that can be scoped to a domain, the type `my_domain`
- * can be passed as an explicit template argument to scope it to the custom
- * domain.
- *
- * The tag type `nvtx3::domain::global` represents the global NVTX domain.
- *
- * \code{.cpp}
- * // By default, `domain_thread_range` belongs to the global domain
- * nvtx3::domain_thread_range<> r0{};
- *
- * // Alias for a `domain_thread_range` in the global domain
- * nvtx3::thread_range r1{};
- *
- * // `r` belongs to the custom domain
- * nvtx3::domain_thread_range<my_domain> r{};
- * \endcode
- *
- * When using a custom domain, it is reccomended to define type aliases for NVTX
- * constructs in the custom domain.
- * ```
- * using my_thread_range = nvtx3::domain_thread_range<my_domain>;
- * using my_registered_message = nvtx3::registered_message<my_domain>;
- * using my_named_category = nvtx3::named_category<my_domain>;
- * ```
- *
- * See `nvtx3::domain` for more information.
- *
- * \section ATTRIBUTES Event Attributes
- *
- * NVTX events can be customized with various attributes to provide additional
- * information (such as a custom message) or to control visualization of the
- * event (such as the color used). These attributes can be specified per-event
- * via arguments to a `nvtx3::event_attributes` object.
- *
- * NVTX events can be customized via four "attributes":
- * - \ref COLOR : color used to visualize the event in tools.
- * - \ref MESSAGES :  Custom message string.
- * - \ref PAYLOAD :  User-defined numerical value.
- * - \ref CATEGORY : Intra-domain grouping.
- *
- * It is possible to construct a `nvtx3::event_attributes` from any number of
- * attribute objects (nvtx3::color, nvtx3::message, nvtx3::payload,
- * nvtx3::category) in any order. If an attribute is not specified, a tool
- * specific default value is used. See `nvtx3::event_attributes` for more
- * information.
- *
- * \code{.cpp}
- * // Custom color, message
- * event_attributes attr{nvtx3::rgb{127, 255, 0},
- *                      "message"};
- *
- * // Custom color, message, payload, category
- * event_attributes attr{nvtx3::rgb{127, 255, 0},
- *                      nvtx3::payload{42},
- *                      "message",
- *                      nvtx3::category{1}};
- *
- * // Arguments can be in any order
- * event_attributes attr{nvtx3::payload{42},
- *                      nvtx3::category{1},
- *                      "message",
- *                      nvtx3::rgb{127, 255, 0}};
- *
- * // "First wins" with multiple arguments of the same type
- * event_attributes attr{ nvtx3::payload{42}, nvtx3::payload{7} }; // payload is
- * 42 \endcode
- *
- * \subsection MESSAGES message
- *
- * A `nvtx3::message` allows associating a custom message string with an NVTX
- * event.
- *
- * Example:
- * \code{.cpp}
- * // Create an `event_attributes` with the custom message "my message"
- * nvtx3::event_attributes attr{nvtx3::Mesage{"my message"}};
- *
- * // strings and string literals implicitly assumed to be a `nvtx3::message`
- * nvtx3::event_attributes attr{"my message"};
- * \endcode
- *
- * \subsubsection REGISTERED_MESSAGE Registered Messages
- *
- * Associating a `nvtx3::message` with an event requires copying the contents of
- * the message every time the message is used, i.e., copying the entire message
- * string. This may cause non-trivial overhead in performance sensitive code.
- *
- * To eliminate this overhead, NVTX allows registering a message string,
- * yielding a "handle" that is inexpensive to copy that may be used in place of
- * a message string. When visualizing the events, tools such as Nsight Systems
- * will take care of mapping the message handle to its string.
- *
- * A message should be registered once and the handle reused throughout the rest
- * of the application. This can be done by either explicitly creating static
- * `nvtx3::registered_message` objects, or using the
- * `nvtx3::registered_message::get` construct on first use helper (recommended).
- *
- * Similar to \ref DOMAINS, `nvtx3::registered_message::get` requires defining a
- * custom tag type with a static `message` member whose value will be the
- * contents of the registered string.
- *
- * Example:
- * \code{.cpp}
- * // Explicitly constructed, static `registered_message`
- * static registered_message<my_domain> static_message{"my message"};
- *
- * // Or use construct on first use:
- * // Define a tag type with a `message` member string to register
- * struct my_message{ static constexpr char const* message{ "my message" }; };
- *
- * // Uses construct on first use to register the contents of
- * // `my_message::message`
- * nvtx3::registered_message<my_domain> const& msg =
- * nvtx3::registered_message<my_domain>::get<my_message>(); \endcode
- *
- * \subsection COLOR color
- *
- * Associating a `nvtx3::color` with an event allows controlling how the event
- * is visualized in a tool such as Nsight Systems. This is a convenient way to
- * visually differentiate among different events.
- *
- * \code{.cpp}
- * // Define a color via rgb color values
- * nvtx3::color c{nvtx3::rgb{127, 255, 0}};
- * nvtx3::event_attributes attr{c};
- *
- * // rgb color values can be passed directly to an `event_attributes`
- * nvtx3::event_attributes attr1{nvtx3::rgb{127,255,0}};
- * \endcode
- *
- * \subsection CATEGORY category
- *
- * A `nvtx3::category` is simply an integer id that allows for fine-grain
- * grouping of NVTX events. For example, one might use separate categories for
- * IO, memory allocation, compute, etc.
- *
- * \code{.cpp}
- * nvtx3::event_attributes{nvtx3::category{1}};
- * \endcode
- *
- * \subsubsection NAMED_CATEGORIES Named Categories
- *
- * Associates a `name` string with a category `id` to help differentiate among
- * categories.
- *
- * For any given category id `Id`, a `named_category{Id, "name"}` should only
- * be constructed once and reused throughout an application. This can be done by
- * either explicitly creating static `nvtx3::named_category` objects, or using
- * the `nvtx3::named_category::get` construct on first use helper (recommended).
- *
- * Similar to \ref DOMAINS, `nvtx3::named_category::get` requires defining a
- * custom tag type with static `name` and `id` members.
- *
- * \code{.cpp}
- * // Explicitly constructed, static `named_category`
- * static nvtx3::named_category static_category{42, "my category"};
- *
- * // OR use construct on first use:
- * // Define a tag type with `name` and `id` members
- * struct my_category{
- *    static constexpr char const* name{"my category"}; // category name
- *    static constexpr category::id_type id{42}; // category id
- * };
- *
- * // Use construct on first use to name the category id `42`
- * // with name "my category"
- * nvtx3::named_category const& my_category =
- * named_category<my_domain>::get<my_category>();
- *
- * // Range `r` associated with category id `42`
- * nvtx3::event_attributes attr{my_category};
- * \endcode
- *
- * \subsection PAYLOAD payload
- *
- * Allows associating a user-defined numerical value with an event.
- *
- * ```
- * nvtx3:: event_attributes attr{nvtx3::payload{42}}; // Constructs a payload
- * from
- *                                                 // the `int32_t` value 42
- * ```
- *
- *
- * \section EXAMPLE Example
- *
- * Putting it all together:
- * \code{.cpp}
- * // Define a custom domain tag type
- * struct my_domain{ static constexpr char const* name{"my domain"}; };
- *
- * // Define a named category tag type
- * struct my_category{
- *    static constexpr char const* name{"my category"};
- *    static constexpr uint32_t id{42};
- * };
- *
- * // Define a registered message tag type
- * struct my_message{ static constexpr char const* message{"my message"}; };
- *
- * // For convenience, use aliases for domain scoped objects
- * using my_thread_range = nvtx3::domain_thread_range<my_domain>;
- * using my_registered_message = nvtx3::registered_message<my_domain>;
- * using my_named_category = nvtx3::named_category<my_domain>;
- *
- * // Default values for all attributes
- * nvtx3::event_attributes attr{};
- * my_thread_range r0{attr};
- *
- * // Custom (unregistered) message, and unnamed category
- * nvtx3::event_attributes attr1{"message", nvtx3::category{2}};
- * my_thread_range r1{attr1};
- *
- * // Alternatively, pass arguments of `event_attributes` ctor directly to
- * // `my_thread_range`
- * my_thread_range r2{"message", nvtx3::category{2}};
- *
- * // construct on first use a registered message
- * auto msg = my_registered_message::get<my_message>();
- *
- * // construct on first use a named category
- * auto category = my_named_category::get<my_category>();
- *
- * // Use registered message and named category
- * my_thread_range r3{msg, category, nvtx3::rgb{127, 255, 0},
- *                    nvtx3::payload{42}};
- *
- * // Any number of arguments in any order
- * my_thread_range r{nvtx3::rgb{127, 255,0}, msg};
- *
- * \endcode
- * \section MACROS Convenience Macros
- *
- * Oftentimes users want to quickly and easily add NVTX ranges to their library
- * or application to aid in profiling and optimization.
- *
- * A convenient way to do this is to use the \ref NVTX3_FUNC_RANGE and
- * \ref NVTX3_FUNC_RANGE_IN macros. These macros take care of constructing an
- * `nvtx3::domain_thread_range` with the name of the enclosing function as the
- * range's message.
- *
- * \code{.cpp}
- * void some_function(){
- *    // Automatically generates an NVTX range for the duration of the function
- *    // using "some_function" as the event's message.
- *    NVTX3_FUNC_RANGE();
- * }
- * \endcode
- *
- */
-
-/**
- * @brief Enables the use of constexpr when support for C++14 relaxed constexpr
- * is present.
- *
- * Initializing a legacy-C (i.e., no constructor) union member requires
- * initializing in the constructor body. Non-empty constexpr constructors
- * require C++14 relaxed constexpr.
- *
- */
-#if __cpp_constexpr >= 201304L
-#define NVTX3_RELAXED_CONSTEXPR constexpr
-#else
-#define NVTX3_RELAXED_CONSTEXPR
-#endif
-
-namespace nvtx3 {
-namespace detail {
-
-/**
- * @brief Verifies if a type `T` contains a member `T::name` of type `const
- * char*` or `const wchar_t*`.
- *
- * @tparam T The type to verify
- * @return True if `T` contains a member `T::name` of type `const char*` or
- * `const wchar_t*`.
- */
-template <typename T>
-constexpr auto has_name_member() noexcept -> decltype(T::name, bool())
-{
-  return (std::is_same<char const*, typename std::decay<decltype(T::name)>::type>::value or
-          std::is_same<wchar_t const*, typename std::decay<decltype(T::name)>::type>::value);
-}
-}  // namespace detail
-
-/**
- * @brief `domain`s allow for grouping NVTX events into a single scope to
- * differentiate them from events in other `domain`s.
- *
- * By default, all NVTX constructs are placed in the "global" NVTX domain.
- *
- * A custom `domain` may be used in order to differentiate a library's or
- * application's NVTX events from other events.
- *
- * `domain`s are expected to be long-lived and unique to a library or
- * application. As such, it is assumed a domain's name is known at compile
- * time. Therefore, all NVTX constructs that can be associated with a domain
- * require the domain to be specified via a *type* `DomainName` passed as an
- * explicit template parameter.
- *
- * The type `domain::global` may be used to indicate that the global NVTX
- * domain should be used.
- *
- * None of the C++ NVTX constructs require the user to manually construct a
- * `domain` object. Instead, if a custom domain is desired, the user is
- * expected to define a type `DomainName` that contains a member
- * `DomainName::name` which resolves to either a `char const*` or `wchar_t
- * const*`. The value of `DomainName::name` is used to name and uniquely
- * identify the custom domain.
- *
- * Upon the first use of an NVTX construct associated with the type
- * `DomainName`, the "construct on first use" pattern is used to construct a
- * function local static `domain` object. All future NVTX constructs
- * associated with `DomainType` will use a reference to the previously
- * constructed `domain` object. See `domain::get`.
- *
- * Example:
- * ```
- * // The type `my_domain` defines a `name` member used to name and identify
- * the
- * // `domain` object identified by `my_domain`.
- * struct my_domain{ static constexpr char const* name{"my_domain"}; };
- *
- * // The NVTX range `r` will be grouped with all other NVTX constructs
- * // associated with  `my_domain`.
- * nvtx3::domain_thread_range<my_domain> r{};
- *
- * // An alias can be created for a `domain_thread_range` in the custom domain
- * using my_thread_range = nvtx3::domain_thread_range<my_domain>;
- * my_thread_range my_range{};
- *
- * // `domain::global` indicates that the global NVTX domain is used
- * nvtx3::domain_thread_range<domain::global> r2{};
- *
- * // For convenience, `nvtx3::thread_range` is an alias for a range in the
- * // global domain
- * nvtx3::thread_range r3{};
- * ```
- */
-class domain {
- public:
-  domain(domain const&) = delete;
-  domain& operator=(domain const&) = delete;
-  domain(domain&&)                 = delete;
-  domain& operator=(domain&&) = delete;
-
-  /**
-   * @brief Returns reference to an instance of a function local static
-   * `domain` object.
-   *
-   * Uses the "construct on first use" idiom to safely ensure the `domain`
-   * object is initialized exactly once upon first invocation of
-   * `domain::get<DomainName>()`. All following invocations will return a
-   * reference to the previously constructed `domain` object. See
-   * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use
-   *
-   * None of the constructs in this header require the user to directly invoke
-   * `domain::get`. It is automatically invoked when constructing objects like
-   * a `domain_thread_range` or `category`. Advanced users may wish to use
-   * `domain::get` for the convenience of the "construct on first use" idiom
-   * when using domains with their own use of the NVTX C API.
-   *
-   * This function is threadsafe as of C++11. If two or more threads call
-   * `domain::get<DomainName>` concurrently, exactly one of them is guaranteed
-   * to construct the `domain` object and the other(s) will receive a
-   * reference to the object after it is fully constructed.
-   *
-   * The domain's name is specified via the type `DomainName` pass as an
-   * explicit template parameter. `DomainName` is required to contain a
-   * member `DomainName::name` that resolves to either a `char const*` or
-   * `wchar_t const*`. The value of `DomainName::name` is used to name and
-   * uniquely identify the `domain`.
-   *
-   * Example:
-   * ```
-   * // The type `my_domain` defines a `name` member used to name and identify
-   * // the `domain` object identified by `my_domain`.
-   * struct my_domain{ static constexpr char const* name{"my domain"}; };
-   *
-   * auto D = domain::get<my_domain>(); // First invocation constructs a
-   *                                    // `domain` with the name "my domain"
-   *
-   * auto D1 = domain::get<my_domain>(); // Simply returns reference to
-   *                                     // previously constructed `domain`.
-   * ```
-   *
-   * @tparam DomainName Type that contains a `DomainName::name` member used to
-   * name the `domain` object.
-   * @return Reference to the `domain` corresponding to the type `DomainName`.
-   */
-  template <typename DomainName>
-  static domain const& get()
-  {
-    static_assert(detail::has_name_member<DomainName>(),
-                  "Type used to identify a domain must contain a name member of"
-                  "type const char* or const wchar_t*");
-    static domain const d{DomainName::name};
-    return d;
-  }
-
-  /**
-   * @brief Conversion operator to `nvtxDomainHandle_t`.
-   *
-   * Allows transparently passing a domain object into an API expecting a
-   * native `nvtxDomainHandle_t` object.
-   */
-  operator nvtxDomainHandle_t() const noexcept { return _domain; }
-
-  /**
-   * @brief Tag type for the "global" NVTX domain.
-   *
-   * This type may be passed as a template argument to any function/class
-   * expecting a type to identify a domain to indicate that the global domain
-   * should be used.
-   *
-   * All NVTX events in the global domain across all libraries and
-   * applications will be grouped together.
-   *
-   */
-  struct global {
-  };
-
- private:
-  /**
-   * @brief Construct a new domain with the specified `name`.
-   *
-   * This constructor is private as it is intended that `domain` objects only
-   * be created through the `domain::get` function.
-   *
-   * @param name A unique name identifying the domain
-   */
-  explicit domain(char const* name) noexcept : _domain{nvtxDomainCreateA(name)} {}
-
-  /**
-   * @brief Construct a new domain with the specified `name`.
-   *
-   * This constructor is private as it is intended that `domain` objects only
-   * be created through the `domain::get` function.
-   *
-   * @param name A unique name identifying the domain
-   */
-  explicit domain(wchar_t const* name) noexcept : _domain{nvtxDomainCreateW(name)} {}
-
-  /**
-   * @brief Construct a new domain with the specified `name`.
-   *
-   * This constructor is private as it is intended that `domain` objects only
-   * be created through the `domain::get` function.
-   *
-   * @param name A unique name identifying the domain
-   */
-  explicit domain(std::string const& name) noexcept : domain{name.c_str()} {}
-
-  /**
-   * @brief Construct a new domain with the specified `name`.
-   *
-   * This constructor is private as it is intended that `domain` objects only
-   * be created through the `domain::get` function.
-   *
-   * @param name A unique name identifying the domain
-   */
-  explicit domain(std::wstring const& name) noexcept : domain{name.c_str()} {}
-
-  /**
-   * @brief Default constructor creates a `domain` representing the
-   * "global" NVTX domain.
-   *
-   * All events not associated with a custom `domain` are grouped in the
-   * "global" NVTX domain.
-   *
-   */
-  domain() = default;
-
-  /**
-   * @brief Destroy the domain object, unregistering and freeing all domain
-   * specific resources.
-   */
-  ~domain() noexcept { nvtxDomainDestroy(_domain); }
-
- private:
-  nvtxDomainHandle_t const _domain{};  ///< The `domain`s NVTX handle
-};
-
-/**
- * @brief Returns reference to the `domain` object that represents the global
- * NVTX domain.
- *
- * This specialization for `domain::global` returns a default constructed,
- * `domain` object for use when the "global" domain is desired.
- *
- * All NVTX events in the global domain across all libraries and applications
- * will be grouped together.
- *
- * @return Reference to the `domain` corresponding to the global NVTX domain.
- *
- */
-template <>
-inline domain const& domain::get<domain::global>()
-{
-  static domain const d{};
-  return d;
-}
-
-/**
- * @brief Indicates the values of the red, green, blue color channels for
- * a rgb color code.
- *
- */
-struct rgb {
-  /// Type used for component values
-  using component_type = uint8_t;
-
-  /**
-   * @brief Construct a rgb with red, green, and blue channels
-   * specified by `red_`, `green_`, and `blue_`, respectively.
-   *
-   * Valid values are in the range `[0,255]`.
-   *
-   * @param red_ Value of the red channel
-   * @param green_ Value of the green channel
-   * @param blue_ Value of the blue channel
-   */
-  constexpr rgb(component_type red_, component_type green_, component_type blue_) noexcept
-    : red{red_}, green{green_}, blue{blue_}
-  {
-  }
-
-  component_type const red{};    ///< Red channel value
-  component_type const green{};  ///< Green channel value
-  component_type const blue{};   ///< Blue channel value
-};
-
-/**
- * @brief Indicates the value of the alpha, red, green, and blue color
- * channels for an argb color code.
- *
- */
-struct argb final : rgb {
-  /**
-   * @brief Construct an argb with alpha, red, green, and blue channels
-   * specified by `alpha_`, `red_`, `green_`, and `blue_`, respectively.
-   *
-   * Valid values are in the range `[0,255]`.
-   *
-   * @param alpha_  Value of the alpha channel (opacity)
-   * @param red_  Value of the red channel
-   * @param green_  Value of the green channel
-   * @param blue_  Value of the blue channel
-   *
-   */
-  constexpr argb(component_type alpha_,
-                 component_type red_,
-                 component_type green_,
-                 component_type blue_) noexcept
-    : rgb{red_, green_, blue_}, alpha{alpha_}
-  {
-  }
-
-  component_type const alpha{};  ///< Alpha channel value
-};
-
-/**
- * @brief Represents a custom color that can be associated with an NVTX event
- * via it's `event_attributes`.
- *
- * Specifying colors for NVTX events is a convenient way to visually
- * differentiate among different events in a visualization tool such as Nsight
- * Systems.
- *
- */
-class color {
- public:
-  /// Type used for the color's value
-  using value_type = uint32_t;
-
-  /**
-   * @brief Constructs a `color` using the value provided by `hex_code`.
-   *
-   * `hex_code` is expected to be a 4 byte argb hex code.
-   *
-   * The most significant byte indicates the value of the alpha channel
-   * (opacity) (0-255)
-   *
-   * The next byte indicates the value of the red channel (0-255)
-   *
-   * The next byte indicates the value of the green channel (0-255)
-   *
-   * The least significant byte indicates the value of the blue channel
-   * (0-255)
-   *
-   * @param hex_code The hex code used to construct the `color`
-   */
-  constexpr explicit color(value_type hex_code) noexcept : _value{hex_code} {}
-
-  /**
-   * @brief Construct a `color` using the alpha, red, green, blue components
-   * in `argb`.
-   *
-   * @param argb The alpha, red, green, blue components of the desired `color`
-   */
-  constexpr color(argb argb) noexcept
-    : color{from_bytes_msb_to_lsb(argb.alpha, argb.red, argb.green, argb.blue)}
-  {
-  }
-
-  /**
-   * @brief Construct a `color` using the red, green, blue components in
-   * `rgb`.
-   *
-   * Uses maximum value for the alpha channel (opacity) of the `color`.
-   *
-   * @param rgb The red, green, blue components of the desired `color`
-   */
-  constexpr color(rgb rgb) noexcept
-    : color{from_bytes_msb_to_lsb(0xFF, rgb.red, rgb.green, rgb.blue)}
-  {
-  }
-
-  /**
-   * @brief Returns the `color`s argb hex code
-   *
-   */
-  constexpr value_type get_value() const noexcept { return _value; }
-
-  /**
-   * @brief Return the NVTX color type of the color.
-   *
-   */
-  constexpr nvtxColorType_t get_type() const noexcept { return _type; }
-
-  color()             = delete;
-  ~color()            = default;
-  color(color const&) = default;
-  color& operator=(color const&) = default;
-  color(color&&)                 = default;
-  color& operator=(color&&) = default;
-
- private:
-  /**
-   * @brief Constructs an unsigned, 4B integer from the component bytes in
-   * most to least significant byte order.
-   *
-   */
-  constexpr static value_type from_bytes_msb_to_lsb(uint8_t byte3,
-                                                    uint8_t byte2,
-                                                    uint8_t byte1,
-                                                    uint8_t byte0) noexcept
-  {
-    return uint32_t{byte3} << 24 | uint32_t{byte2} << 16 | uint32_t{byte1} << 8 | uint32_t{byte0};
-  }
-
-  value_type const _value{};                     ///< color's argb color code
-  nvtxColorType_t const _type{NVTX_COLOR_ARGB};  ///< NVTX color type code
-};
-
-/**
- * @brief Object for intra-domain grouping of NVTX events.
- *
- * A `category` is simply an integer id that allows for fine-grain grouping of
- * NVTX events. For example, one might use separate categories for IO, memory
- * allocation, compute, etc.
- *
- * Example:
- * \code{.cpp}
- * nvtx3::category cat1{1};
- *
- * // Range `r1` belongs to the category identified by the value `1`.
- * nvtx3::thread_range r1{cat1};
- *
- * // Range `r2` belongs to the same category as `r1`
- * nvtx3::thread_range r2{nvtx3::category{1}};
- * \endcode
- *
- * To associate a name string with a category id, see `named_category`.
- *
- */
-class category {
- public:
-  /// Type used for `category`s integer id.
-  using id_type = uint32_t;
-
-  /**
-   * @brief Construct a `category` with the specified `id`.
-   *
-   * The `category` will be unnamed and identified only by its `id` value.
-   *
-   * All `category` objects sharing the same `id` are equivalent.
-   *
-   * @param[in] id The `category`'s identifying value
-   */
-  constexpr explicit category(id_type id) noexcept : id_{id} {}
-
-  /**
-   * @brief Returns the id of the category.
-   *
-   */
-  constexpr id_type get_id() const noexcept { return id_; }
-
-  category()                = delete;
-  ~category()               = default;
-  category(category const&) = default;
-  category& operator=(category const&) = default;
-  category(category&&)                 = default;
-  category& operator=(category&&) = default;
-
- private:
-  id_type const id_{};  ///< category's unique identifier
-};
-
-/**
- * @brief A `category` with an associated name string.
- *
- * Associates a `name` string with a category `id` to help differentiate among
- * categories.
- *
- * For any given category id `Id`, a `named_category(Id, "name")` should only
- * be constructed once and reused throughout an application. This can be done
- * by either explicitly creating static `named_category` objects, or using the
- * `named_category::get` construct on first use helper (recommended).
- *
- * Creating two or more `named_category` objects with the same value for `id`
- * in the same domain results in undefined behavior.
- *
- * Similarly, behavior is undefined when a `named_category` and `category`
- * share the same value of `id`.
- *
- * Example:
- * \code{.cpp}
- * // Explicitly constructed, static `named_category`
- * static nvtx3::named_category static_category{42, "my category"};
- *
- * // Range `r` associated with category id `42`
- * nvtx3::thread_range r{static_category};
- *
- * // OR use construct on first use:
- *
- * // Define a type with `name` and `id` members
- * struct my_category{
- *    static constexpr char const* name{"my category"}; // category name
- *    static constexpr category::id_type id{42}; // category id
- * };
- *
- * // Use construct on first use to name the category id `42`
- * // with name "my category"
- * auto my_category = named_category<my_domain>::get<my_category>();
- *
- * // Range `r` associated with category id `42`
- * nvtx3::thread_range r{my_category};
- * \endcode
- *
- * `named_category`'s association of a name to a category id is local to the
- * domain specified by the type `D`. An id may have a different name in
- * another domain.
- *
- * @tparam D Type containing `name` member used to identify the `domain` to
- * which the `named_category` belongs. Else, `domain::global` to  indicate
- * that the global NVTX domain should be used.
- */
-template <typename D = domain::global>
-class named_category final : public category {
- public:
-  /**
-   * @brief Returns a global instance of a `named_category` as a
-   * function-local static.
-   *
-   * Creates a `named_category` with name and id specified by the contents of
-   * a type `C`. `C::name` determines the name and `C::id` determines the
-   * category id.
-   *
-   * This function is useful for constructing a named `category` exactly once
-   * and reusing the same instance throughout an application.
-   *
-   * Example:
-   * \code{.cpp}
-   * // Define a type with `name` and `id` members
-   * struct my_category{
-   *    static constexpr char const* name{"my category"}; // category name
-   *    static constexpr uint32_t id{42}; // category id
-   * };
-   *
-   * // Use construct on first use to name the category id `42`
-   * // with name "my category"
-   * auto cat = named_category<my_domain>::get<my_category>();
-   *
-   * // Range `r` associated with category id `42`
-   * nvtx3::thread_range r{cat};
-   * \endcode
-   *
-   * Uses the "construct on first use" idiom to safely ensure the `category`
-   * object is initialized exactly once. See
-   * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use
-   *
-   * @tparam C Type containing a member `C::name` that resolves  to either a
-   * `char const*` or `wchar_t const*` and `C::id`.
-   */
-  template <typename C>
-  static named_category<D> const& get() noexcept
-  {
-    static_assert(detail::has_name_member<C>(),
-                  "Type used to name a category must contain a name member.");
-    static named_category<D> const category{C::id, C::name};
-    return category;
-  }
-  /**
-   * @brief Construct a `category` with the specified `id` and `name`.
-   *
-   * The name `name` will be registered with `id`.
-   *
-   * Every unique value of `id` should only be named once.
-   *
-   * @param[in] id The category id to name
-   * @param[in] name The name to associated with `id`
-   */
-  named_category(id_type id, char const* name) noexcept : category{id}
-  {
-    nvtxDomainNameCategoryA(domain::get<D>(), get_id(), name);
-  };
-
-  /**
-   * @brief Construct a `category` with the specified `id` and `name`.
-   *
-   * The name `name` will be registered with `id`.
-   *
-   * Every unique value of `id` should only be named once.
-   *
-   * @param[in] id The category id to name
-   * @param[in] name The name to associated with `id`
-   */
-  named_category(id_type id, wchar_t const* name) noexcept : category{id}
-  {
-    nvtxDomainNameCategoryW(domain::get<D>(), get_id(), name);
-  };
-};
-
-/**
- * @brief A message registered with NVTX.
- *
- * Normally, associating a `message` with an NVTX event requires copying the
- * contents of the message string. This may cause non-trivial overhead in
- * highly performance sensitive regions of code.
- *
- * message registration is an optimization to lower the overhead of
- * associating a message with an NVTX event. Registering a message yields a
- * handle that is inexpensive to copy that may be used in place of a message
- * string.
- *
- * A particular message should only be registered once and the handle
- * reused throughout the rest of the application. This can be done by either
- * explicitly creating static `registered_message` objects, or using the
- * `registered_message::get` construct on first use helper (recommended).
- *
- * Example:
- * \code{.cpp}
- * // Explicitly constructed, static `registered_message`
- * static registered_message<my_domain> static_message{"message"};
- *
- * // "message" is associated with the range `r`
- * nvtx3::thread_range r{static_message};
- *
- * // Or use construct on first use:
- *
- * // Define a type with a `message` member that defines the contents of the
- * // registered message
- * struct my_message{ static constexpr char const* message{ "my message" }; };
- *
- * // Uses construct on first use to register the contents of
- * // `my_message::message`
- * auto msg = registered_message<my_domain>::get<my_message>();
- *
- * // "my message" is associated with the range `r`
- * nvtx3::thread_range r{msg};
- * \endcode
- *
- * `registered_message`s are local to a particular domain specified via
- * the type `D`.
- *
- * @tparam D Type containing `name` member used to identify the `domain` to
- * which the `registered_message` belongs. Else, `domain::global` to  indicate
- * that the global NVTX domain should be used.
- */
-template <typename D = domain::global>
-class registered_message {
- public:
-  /**
-   * @brief Returns a global instance of a `registered_message` as a function
-   * local static.
-   *
-   * Provides a convenient way to register a message with NVTX without having
-   * to explicitly register the message.
-   *
-   * Upon first invocation, constructs a `registered_message` whose contents
-   * are specified by `message::message`.
-   *
-   * All future invocations will return a reference to the object constructed
-   * in the first invocation.
-   *
-   * Example:
-   * \code{.cpp}
-   * // Define a type with a `message` member that defines the contents of the
-   * // registered message
-   * struct my_message{ static constexpr char const* message{ "my message" };
-   * };
-   *
-   * // Uses construct on first use to register the contents of
-   * // `my_message::message`
-   * auto msg = registered_message<my_domain>::get<my_message>();
-   *
-   * // "my message" is associated with the range `r`
-   * nvtx3::thread_range r{msg};
-   * \endcode
-   *
-   * @tparam M Type required to contain a member `M::message` that
-   * resolves to either a `char const*` or `wchar_t const*` used as the
-   * registered message's contents.
-   * @return Reference to a `registered_message` associated with the type `M`.
-   */
-  template <typename M>
-  static registered_message<D> const& get() noexcept
-  {
-    static registered_message<D> const registered_message{M::message};
-    return registered_message;
-  }
-
-  /**
-   * @brief Constructs a `registered_message` from the specified `msg` string.
-   *
-   * Registers `msg` with NVTX and associates a handle with the registered
-   * message.
-   *
-   * A particular message should should only be registered once and the handle
-   * reused throughout the rest of the application.
-   *
-   * @param msg The contents of the message
-   */
-  explicit registered_message(char const* msg) noexcept
-    : handle_{nvtxDomainRegisterStringA(domain::get<D>(), msg)}
-  {
-  }
-
-  /**
-   * @brief Constructs a `registered_message` from the specified `msg` string.
-   *
-   * Registers `msg` with NVTX and associates a handle with the registered
-   * message.
-   *
-   * A particular message should should only be registered once and the handle
-   * reused throughout the rest of the application.
-   *
-   * @param msg The contents of the message
-   */
-  explicit registered_message(std::string const& msg) noexcept : registered_message{msg.c_str()} {}
-
-  /**
-   * @brief Constructs a `registered_message` from the specified `msg` string.
-   *
-   * Registers `msg` with NVTX and associates a handle with the registered
-   * message.
-   *
-   * A particular message should should only be registered once and the handle
-   * reused throughout the rest of the application.
-   *
-   * @param msg The contents of the message
-   */
-  explicit registered_message(wchar_t const* msg) noexcept
-    : handle_{nvtxDomainRegisterStringW(domain::get<D>(), msg)}
-  {
-  }
-
-  /**
-   * @brief Constructs a `registered_message` from the specified `msg` string.
-   *
-   * Registers `msg` with NVTX and associates a handle with the registered
-   * message.
-   *
-   * A particular message should only be registered once and the handle
-   * reused throughout the rest of the application.
-   *
-   * @param msg The contents of the message
-   */
-  explicit registered_message(std::wstring const& msg) noexcept : registered_message{msg.c_str()} {}
-
-  /**
-   * @brief Returns the registered message's handle
-   *
-   */
-  nvtxStringHandle_t get_handle() const noexcept { return handle_; }
-
-  registered_message()                          = delete;
-  ~registered_message()                         = default;
-  registered_message(registered_message const&) = default;
-  registered_message& operator=(registered_message const&) = default;
-  registered_message(registered_message&&)                 = default;
-  registered_message& operator=(registered_message&&) = default;
-
- private:
-  nvtxStringHandle_t const handle_{};  ///< The handle returned from
-                                       ///< registering the message with NVTX
-};
-
-/**
- * @brief Allows associating a message string with an NVTX event via
- * its `EventAttribute`s.
- *
- * Associating a `message` with an NVTX event through its `event_attributes`
- * allows for naming events to easily differentiate them from other events.
- *
- * Every time an NVTX event is created with an associated `message`, the
- * contents of the message string must be copied.  This may cause non-trivial
- * overhead in highly performance sensitive sections of code. Use of a
- * `nvtx3::registered_message` is recommended in these situations.
- *
- * Example:
- * \code{.cpp}
- * // Creates an `event_attributes` with message "message 0"
- * nvtx3::event_attributes attr0{nvtx3::message{"message 0"}};
- *
- * // `range0` contains message "message 0"
- * nvtx3::thread_range range0{attr0};
- *
- * // `std::string` and string literals are implicitly assumed to be
- * // the contents of an `nvtx3::message`
- * // Creates an `event_attributes` with message "message 1"
- * nvtx3::event_attributes attr1{"message 1"};
- *
- * // `range1` contains message "message 1"
- * nvtx3::thread_range range1{attr1};
- *
- * // `range2` contains message "message 2"
- * nvtx3::thread_range range2{nvtx3::Mesage{"message 2"}};
- *
- * // `std::string` and string literals are implicitly assumed to be
- * // the contents of an `nvtx3::message`
- * // `range3` contains message "message 3"
- * nvtx3::thread_range range3{"message 3"};
- * \endcode
- */
-class message {
- public:
-  using value_type = nvtxMessageValue_t;
-
-  /**
-   * @brief Construct a `message` whose contents are specified by `msg`.
-   *
-   * @param msg The contents of the message
-   */
-  NVTX3_RELAXED_CONSTEXPR message(char const* msg) noexcept : type_{NVTX_MESSAGE_TYPE_ASCII}
-  {
-    value_.ascii = msg;
-  }
-
-  /**
-   * @brief Construct a `message` whose contents are specified by `msg`.
-   *
-   * @param msg The contents of the message
-   */
-  message(std::string const& msg) noexcept : message{msg.c_str()} {}
-
-  /**
-   * @brief Disallow construction for `std::string` r-value
-   *
-   * `message` is a non-owning type and therefore cannot take ownership of an
-   * r-value. Therefore, constructing from an r-value is disallowed to prevent
-   * a dangling pointer.
-   *
-   */
-  message(std::string&&) = delete;
-
-  /**
-   * @brief Construct a `message` whose contents are specified by `msg`.
-   *
-   * @param msg The contents of the message
-   */
-  NVTX3_RELAXED_CONSTEXPR message(wchar_t const* msg) noexcept : type_{NVTX_MESSAGE_TYPE_UNICODE}
-  {
-    value_.unicode = msg;
-  }
-
-  /**
-   * @brief Construct a `message` whose contents are specified by `msg`.
-   *
-   * @param msg The contents of the message
-   */
-  message(std::wstring const& msg) noexcept : message{msg.c_str()} {}
-
-  /**
-   * @brief Disallow construction for `std::wstring` r-value
-   *
-   * `message` is a non-owning type and therefore cannot take ownership of an
-   * r-value. Therefore, constructing from an r-value is disallowed to prevent
-   * a dangling pointer.
-   *
-   */
-  message(std::wstring&&) = delete;
-
-  /**
-   * @brief Construct a `message` from a `registered_message`.
-   *
-   * @tparam D Type containing `name` member used to identify the `domain`
-   * to which the `registered_message` belongs. Else, `domain::global` to
-   * indicate that the global NVTX domain should be used.
-   * @param msg The message that has already been registered with NVTX.
-   */
-  template <typename D>
-  NVTX3_RELAXED_CONSTEXPR message(registered_message<D> const& msg) noexcept
-    : type_{NVTX_MESSAGE_TYPE_REGISTERED}
-  {
-    value_.registered = msg.get_handle();
-  }
-
-  /**
-   * @brief Return the union holding the value of the message.
-   *
-   */
-  NVTX3_RELAXED_CONSTEXPR value_type get_value() const noexcept { return value_; }
-
-  /**
-   * @brief Return the type information about the value the union holds.
-   *
-   */
-  NVTX3_RELAXED_CONSTEXPR nvtxMessageType_t get_type() const noexcept { return type_; }
-
- private:
-  nvtxMessageType_t const type_{};  ///< message type
-  nvtxMessageValue_t value_{};      ///< message contents
-};
-
-/**
- * @brief A numerical value that can be associated with an NVTX event via
- * its `event_attributes`.
- *
- * Example:
- * ```
- * nvtx3:: event_attributes attr{nvtx3::payload{42}}; // Constructs a payload
- * from
- *                                                 // the `int32_t` value 42
- *
- * // `range0` will have an int32_t payload of 42
- * nvtx3::thread_range range0{attr};
- *
- * // range1 has double payload of 3.14
- * nvtx3::thread_range range1{ nvtx3::payload{3.14} };
- * ```
- */
-class payload {
- public:
-  using value_type = typename nvtxEventAttributes_v2::payload_t;
-
-  /**
-   * @brief Construct a `payload` from a signed, 8 byte integer.
-   *
-   * @param value Value to use as contents of the payload
-   */
-  NVTX3_RELAXED_CONSTEXPR explicit payload(int64_t value) noexcept
-    : type_{NVTX_PAYLOAD_TYPE_INT64}, value_{}
-  {
-    value_.llValue = value;
-  }
-
-  /**
-   * @brief Construct a `payload` from a signed, 4 byte integer.
-   *
-   * @param value Value to use as contents of the payload
-   */
-  NVTX3_RELAXED_CONSTEXPR explicit payload(int32_t value) noexcept
-    : type_{NVTX_PAYLOAD_TYPE_INT32}, value_{}
-  {
-    value_.iValue = value;
-  }
-
-  /**
-   * @brief Construct a `payload` from an unsigned, 8 byte integer.
-   *
-   * @param value Value to use as contents of the payload
-   */
-  NVTX3_RELAXED_CONSTEXPR explicit payload(uint64_t value) noexcept
-    : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT64}, value_{}
-  {
-    value_.ullValue = value;
-  }
-
-  /**
-   * @brief Construct a `payload` from an unsigned, 4 byte integer.
-   *
-   * @param value Value to use as contents of the payload
-   */
-  NVTX3_RELAXED_CONSTEXPR explicit payload(uint32_t value) noexcept
-    : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT32}, value_{}
-  {
-    value_.uiValue = value;
-  }
-
-  /**
-   * @brief Construct a `payload` from a single-precision floating point
-   * value.
-   *
-   * @param value Value to use as contents of the payload
-   */
-  NVTX3_RELAXED_CONSTEXPR explicit payload(float value) noexcept
-    : type_{NVTX_PAYLOAD_TYPE_FLOAT}, value_{}
-  {
-    value_.fValue = value;
-  }
-
-  /**
-   * @brief Construct a `payload` from a double-precision floating point
-   * value.
-   *
-   * @param value Value to use as contents of the payload
-   */
-  NVTX3_RELAXED_CONSTEXPR explicit payload(double value) noexcept
-    : type_{NVTX_PAYLOAD_TYPE_DOUBLE}, value_{}
-  {
-    value_.dValue = value;
-  }
-
-  /**
-   * @brief Return the union holding the value of the payload
-   *
-   */
-  NVTX3_RELAXED_CONSTEXPR value_type get_value() const noexcept { return value_; }
-
-  /**
-   * @brief Return the information about the type the union holds.
-   *
-   */
-  NVTX3_RELAXED_CONSTEXPR nvtxPayloadType_t get_type() const noexcept { return type_; }
-
- private:
-  nvtxPayloadType_t const type_;  ///< Type of the payload value
-  value_type value_;              ///< Union holding the payload value
-};
-
-/**
- * @brief Describes the attributes of a NVTX event.
- *
- * NVTX events can be customized via four "attributes":
- *
- * - color:    color used to visualize the event in tools such as Nsight
- *             Systems. See `color`.
- * - message:  Custom message string. See `message`.
- * - payload:  User-defined numerical value. See `payload`.
- * - category: Intra-domain grouping. See `category`.
- *
- * These component attributes are specified via an `event_attributes` object.
- * See `nvtx3::color`, `nvtx3::message`, `nvtx3::payload`, and
- * `nvtx3::category` for how these individual attributes are constructed.
- *
- * While it is possible to specify all four attributes, it is common to want
- * to only specify a subset of attributes and use default values for the
- * others. For convenience, `event_attributes` can be constructed from any
- * number of attribute components in any order.
- *
- * Example:
- * \code{.cpp}
- * event_attributes attr{}; // No arguments, use defaults for all attributes
- *
- * event_attributes attr{"message"}; // Custom message, rest defaulted
- *
- * // Custom color & message
- * event_attributes attr{"message", nvtx3::rgb{127, 255, 0}};
- *
- * /// Custom color & message, can use any order of arguments
- * event_attributes attr{nvtx3::rgb{127, 255, 0}, "message"};
- *
- *
- * // Custom color, message, payload, category
- * event_attributes attr{nvtx3::rgb{127, 255, 0},
- *                      "message",
- *                      nvtx3::payload{42},
- *                      nvtx3::category{1}};
- *
- * // Custom color, message, payload, category, can use any order of arguments
- * event_attributes attr{nvtx3::payload{42},
- *                      nvtx3::category{1},
- *                      "message",
- *                      nvtx3::rgb{127, 255, 0}};
- *
- * // Multiple arguments of the same type are allowed, but only the first is
- * // used. All others are ignored
- * event_attributes attr{ nvtx3::payload{42}, nvtx3::payload{7} }; // payload
- * is 42
- *
- * // Range `r` will be customized according the attributes in `attr`
- * nvtx3::thread_range r{attr};
- *
- * // For convenience, the arguments that can be passed to the
- * `event_attributes`
- * // constructor may be passed to the `domain_thread_range` contructor where
- * // they will be forwarded to the `EventAttribute`s constructor
- * nvtx3::thread_range r{nvtx3::payload{42}, nvtx3::category{1}, "message"};
- * \endcode
- *
- */
-class event_attributes {
- public:
-  using value_type = nvtxEventAttributes_t;
-
-  /**
-   * @brief Default constructor creates an `event_attributes` with no
-   * category, color, payload, nor message.
-   */
-  constexpr event_attributes() noexcept
-    : attributes_{
-        NVTX_VERSION,                   // version
-        sizeof(nvtxEventAttributes_t),  // size
-        0,                              // category
-        NVTX_COLOR_UNKNOWN,             // color type
-        0,                              // color value
-        NVTX_PAYLOAD_UNKNOWN,           // payload type
-        0,                              // payload value (union)
-        NVTX_MESSAGE_UNKNOWN,           // message type
-        0                               // message value (union)
-      }
-  {
-  }
-
-  /**
-   * @brief Variadic constructor where the first argument is a `category`.
-   *
-   * Sets the value of the `EventAttribute`s category based on `c` and
-   * forwards the remaining variadic parameter pack to the next constructor.
-   *
-   */
-  template <typename... Args>
-  NVTX3_RELAXED_CONSTEXPR explicit event_attributes(category const& c, Args const&... args) noexcept
-    : event_attributes(args...)
-  {
-    attributes_.category = c.get_id();
-  }
-
-  /**
-   * @brief Variadic constructor where the first argument is a `color`.
-   *
-   * Sets the value of the `EventAttribute`s color based on `c` and forwards
-   * the remaining variadic parameter pack to the next constructor.
-   *
-   */
-  template <typename... Args>
-  NVTX3_RELAXED_CONSTEXPR explicit event_attributes(color const& c, Args const&... args) noexcept
-    : event_attributes(args...)
-  {
-    attributes_.color     = c.get_value();
-    attributes_.colorType = c.get_type();
-  }
-
-  /**
-   * @brief Variadic constructor where the first argument is a `payload`.
-   *
-   * Sets the value of the `EventAttribute`s payload based on `p` and forwards
-   * the remaining variadic parameter pack to the next constructor.
-   *
-   */
-  template <typename... Args>
-  NVTX3_RELAXED_CONSTEXPR explicit event_attributes(payload const& p, Args const&... args) noexcept
-    : event_attributes(args...)
-  {
-    attributes_.payload     = p.get_value();
-    attributes_.payloadType = p.get_type();
-  }
-
-  /**
-   * @brief Variadic constructor where the first argument is a `message`.
-   *
-   * Sets the value of the `EventAttribute`s message based on `m` and forwards
-   * the remaining variadic parameter pack to the next constructor.
-   *
-   */
-  template <typename... Args>
-  NVTX3_RELAXED_CONSTEXPR explicit event_attributes(message const& m, Args const&... args) noexcept
-    : event_attributes(args...)
-  {
-    attributes_.message     = m.get_value();
-    attributes_.messageType = m.get_type();
-  }
-
-  ~event_attributes()                       = default;
-  event_attributes(event_attributes const&) = default;
-  event_attributes& operator=(event_attributes const&) = default;
-  event_attributes(event_attributes&&)                 = default;
-  event_attributes& operator=(event_attributes&&) = default;
-
-  /**
-   * @brief Get raw pointer to underlying NVTX attributes object.
-   *
-   */
-  constexpr value_type const* get() const noexcept { return &attributes_; }
-
- private:
-  value_type attributes_{};  ///< The NVTX attributes structure
-};
-
-/**
- * @brief A RAII object for creating a NVTX range local to a thread within a
- * domain.
- *
- * When constructed, begins a nested NVTX range on the calling thread in the
- * specified domain. Upon destruction, ends the NVTX range.
- *
- * Behavior is undefined if a `domain_thread_range` object is
- * created/destroyed on different threads.
- *
- * `domain_thread_range` is neither moveable nor copyable.
- *
- * `domain_thread_range`s may be nested within other ranges.
- *
- * The domain of the range is specified by the template type parameter `D`.
- * By default, the `domain::global` is used, which scopes the range to the
- * global NVTX domain. The convenience alias `thread_range` is provided for
- * ranges scoped to the global domain.
- *
- * A custom domain can be defined by creating a type, `D`, with a static
- * member `D::name` whose value is used to name the domain associated with
- * `D`. `D::name` must resolve to either `char const*` or `wchar_t const*`
- *
- * Example:
- * ```
- * // Define a type `my_domain` with a member `name` used to name the domain
- * // associated with the type `my_domain`.
- * struct my_domain{
- *    static constexpr const char * name{"my domain"};
- * };
- * ```
- *
- * Usage:
- * ```
- * nvtx3::domain_thread_range<> r0{"range 0"}; // Range in global domain
- *
- * nvtx3::thread_range r1{"range 1"}; // Alias for range in global domain
- *
- * nvtx3::domain_thread_range<my_domain> r2{"range 2"}; // Range in custom
- * domain
- *
- * // specify an alias to a range that uses a custom domain
- * using my_thread_range = nvtx3::domain_thread_range<my_domain>;
- *
- * my_thread_range r3{"range 3"}; // Alias for range in custom domain
- * ```
- */
-template <class D = domain::global>
-class domain_thread_range {
- public:
-  /**
-   * @brief Construct a `domain_thread_range` with the specified
-   * `event_attributes`
-   *
-   * Example:
-   * ```
-   * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}};
-   * nvtx3::domain_thread_range<> range{attr}; // Creates a range with message
-   * contents
-   *                                    // "msg" and green color
-   * ```
-   *
-   * @param[in] attr `event_attributes` that describes the desired attributes
-   * of the range.
-   */
-  explicit domain_thread_range(event_attributes const& attr) noexcept
-  {
-    nvtxDomainRangePushEx(domain::get<D>(), attr.get());
-  }
-
-  /**
-   * @brief Constructs a `domain_thread_range` from the constructor arguments
-   * of an `event_attributes`.
-   *
-   * Forwards the arguments `first, args...` to construct an
-   * `event_attributes` object. The `event_attributes` object is then
-   * associated with the `domain_thread_range`.
-   *
-   * For more detail, see `event_attributes` documentation.
-   *
-   * Example:
-   * ```
-   * // Creates a range with message "message" and green color
-   * nvtx3::domain_thread_range<> r{"message", nvtx3::rgb{127,255,0}};
-   * ```
-   *
-   * @note To prevent making needless copies of `event_attributes` objects,
-   * this constructor is disabled when the first argument is an
-   * `event_attributes` object, instead preferring the explicit
-   * `domain_thread_range(event_attributes const&)` constructor.
-   *
-   * @param[in] first First argument to forward to the `event_attributes`
-   * constructor.
-   * @param[in] args Variadic parameter pack of additional arguments to
-   * forward.
-   *
-   */
-  template <typename First,
-            typename... Args,
-            typename = typename std::enable_if<
-              not std::is_same<event_attributes, typename std::decay<First>>::value>>
-  explicit domain_thread_range(First const& first, Args const&... args) noexcept
-    : domain_thread_range{event_attributes{first, args...}}
-  {
-  }
-
-  /**
-   * @brief Default constructor creates a `domain_thread_range` with no
-   * message, color, payload, nor category.
-   *
-   */
-  domain_thread_range() : domain_thread_range{event_attributes{}} {}
-
-  domain_thread_range(domain_thread_range const&) = delete;
-  domain_thread_range& operator=(domain_thread_range const&) = delete;
-  domain_thread_range(domain_thread_range&&)                 = delete;
-  domain_thread_range& operator=(domain_thread_range&&) = delete;
-
-  /**
-   * @brief Destroy the domain_thread_range, ending the NVTX range event.
-   */
-  ~domain_thread_range() noexcept { nvtxDomainRangePop(domain::get<D>()); }
-};
-
-/**
- * @brief Alias for a `domain_thread_range` in the global NVTX domain.
- *
- */
-using thread_range = domain_thread_range<>;
-
-/**
- * @brief Handle used for correlating explicit range start and end events.
- *
- */
-struct range_handle {
-  /// Type used for the handle's value
-  using value_type = nvtxRangeId_t;
-
-  /**
-   * @brief Construct a `range_handle` from the given id.
-   *
-   */
-  constexpr range_handle(value_type id) noexcept : _range_id{id} {}
-
-  /**
-   * @brief Returns the `range_handle`'s value
-   *
-   * @return value_type The handle's value
-   */
-  constexpr value_type get_value() const noexcept { return _range_id; }
-
- private:
-  value_type _range_id{};  ///< The underlying NVTX range id
-};
-
-/**
- * @brief Manually begin an NVTX range.
- *
- * Explicitly begins an NVTX range and returns a unique handle. To end the
- * range, pass the handle to `end_range()`.
- *
- * `start_range/end_range` are the most explicit and lowest level APIs provided
- * for creating ranges.  Use of `nvtx3::domain_process_range` should be
- * preferred unless one is unable to tie the range to the lifetime of an object.
- *
- * Example:
- * ```
- * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}};
- * nvtx3::range_handle h = nvxt3::start_range(attr); // Manually begins a range
- * ...
- * nvtx3::end_range(h); // Ends the range
- * ```
- *
- * @tparam D Type containing `name` member used to identify the `domain`
- * to which the range belongs. Else, `domain::global` to indicate that the
- * global NVTX domain should be used.
- * @param[in] attr `event_attributes` that describes the desired attributes
- * of the range.
- * @return Unique handle to be passed to `end_range` to end the range.
- */
-template <typename D = domain::global>
-range_handle start_range(event_attributes const& attr) noexcept
-{
-  return range_handle{nvtxDomainRangeStartEx(domain::get<D>(), attr.get())};
-}
-
-/**
- * @brief Manually begin an NVTX range.
- *
- * Explicitly begins an NVTX range and returns a unique handle. To end the
- * range, pass the handle to `end_range()`.
- *
- * Forwards the arguments `first, args...` to construct an  `event_attributes`
- * object. The `event_attributes` object is then  associated with the range.
- *
- * For more detail, see `event_attributes` documentation.
- *
- * Example:
- * ```
- * nvtx3::range_handle h = nvxt3::start_range("msg", nvtx3::rgb{127,255,0}); //
- * Begin range
- * ...
- * nvtx3::end_range(h); // Ends the range
- * ```
- *
- * `start_range/end_range` are the most explicit and lowest level APIs provided
- * for creating ranges.  Use of `nvtx3::domain_process_range` should be
- * preferred unless one is unable to tie the range to the lifetime of an object.
- *
- * @param first[in] First argument to pass to an `event_attributes`
- * @param args[in] Variadiac parameter pack of the rest of the arguments for an
- * `event_attributes`.
- * @return Unique handle to be passed to `end_range` to end the range.
- */
-template <typename First,
-          typename... Args,
-          typename = typename std::enable_if<
-            not std::is_same<event_attributes, typename std::decay<First>>::value>>
-range_handle start_range(First const& first, Args const&... args) noexcept
-{
-  return start_range(event_attributes{first, args...});
-}
-
-/**
- * @brief Manually end the range associated with the handle `r`.
- *
- * Explicitly ends the NVTX range indicated by the handle `r` returned from a
- * prior call to `start_range`. The range may end on a different thread from
- * where it began.
- *
- * This function does not have a Domain tag type template parameter as the
- * handle `r` already indicates the domain to which the range belongs.
- *
- * @param r Handle to a range started by a prior call to `start_range`.
- */
-void end_range(range_handle r) { nvtxRangeEnd(r.get_value()); }
-
-/**
- * @brief A RAII object for creating a NVTX range within a domain that can
- * be created and destroyed on different threads.
- *
- * When constructed, begins a NVTX range in the specified domain. Upon
- * destruction, ends the NVTX range.
- *
- * Similar to `nvtx3::domain_thread_range`, the only difference being that
- * `domain_process_range` can start and end on different threads.
- *
- * Use of `nvtx3::domain_thread_range` should be preferred unless one needs
- * the ability to start and end a range on different threads.
- *
- * `domain_process_range` is moveable, but not copyable.
- *
- * @tparam D Type containing `name` member used to identify the `domain`
- * to which the `domain_process_range` belongs. Else, `domain::global` to
- * indicate that the global NVTX domain should be used.
- */
-template <typename D = domain::global>
-class domain_process_range {
- public:
-  /**
-   * @brief Construct a new domain process range object
-   *
-   * @param attr
-   */
-  explicit domain_process_range(event_attributes const& attr) noexcept : handle_{start_range(attr)}
-  {
-  }
-
-  /**
-   * @brief Construct a new domain process range object
-   *
-   * @param first
-   * @param args
-   */
-  template <typename First,
-            typename... Args,
-            typename = typename std::enable_if<
-              not std::is_same<event_attributes, typename std::decay<First>>::value>>
-  explicit domain_process_range(First const& first, Args const&... args) noexcept
-    : domain_process_range{event_attributes{first, args...}}
-  {
-  }
-
-  /**
-   * @brief Construct a new domain process range object
-   *
-   */
-  constexpr domain_process_range() noexcept : domain_process_range{event_attributes{}} {}
-
-  /**
-   * @brief Destroy the `domain_process_range` ending the range.
-   *
-   */
-  ~domain_process_range() noexcept
-  {
-    if (not moved_from_) { end_range(handle_); }
-  }
-
-  /**
-   * @brief Move constructor allows taking ownership of the NVTX range from
-   * another `domain_process_range`.
-   *
-   * @param other
-   */
-  domain_process_range(domain_process_range&& other) noexcept : handle_{other.handle_}
-  {
-    other.moved_from_ = true;
-  }
-
-  /**
-   * @brief Move assignment operator allows taking ownership of an NVTX range
-   * from another `domain_process_range`.
-   *
-   * @param other
-   * @return domain_process_range&
-   */
-  domain_process_range& operator=(domain_process_range&& other) noexcept
-  {
-    handle_           = other.handle_;
-    other.moved_from_ = true;
-  }
-
-  /// Copy construction is not allowed to prevent multiple objects from owning
-  /// the same range handle
-  domain_process_range(domain_process_range const&) = delete;
-
-  /// Copy assignment is not allowed to prevent multiple objects from owning the
-  /// same range handle
-  domain_process_range& operator=(domain_process_range const&) = delete;
-
- private:
-  range_handle handle_;     ///< Range handle used to correlate
-                            ///< the start/end of the range
-  bool moved_from_{false};  ///< Indicates if the object has had
-                            ///< it's contents moved from it,
-                            ///< indicating it should not attempt
-                            ///< to end the NVTX range.
-};
-
-/**
- * @brief Alias for a `domain_process_range` in the global NVTX domain.
- *
- */
-using process_range = domain_process_range<>;
-
-/**
- * @brief Annotates an instantaneous point in time with the attributes specified
- * by `attr`.
- *
- * Unlike a "range", a mark is an instantaneous event in an application, e.g.,
- * locking/unlocking a mutex.
- *
- * \code{.cpp}
- * std::mutex global_lock;
- * void lock_mutex(){
- *    global_lock.lock();
- *    nvtx3::mark("lock_mutex");
- * }
- * \endcode
- *
- * @tparam D Type containing `name` member used to identify the `domain`
- * to which the `domain_process_range` belongs. Else, `domain::global` to
- * indicate that the global NVTX domain should be used.
- * @param[in] attr `event_attributes` that describes the desired attributes
- * of the mark.
- */
-template <typename D = nvtx3::domain::global>
-inline void mark(event_attributes const& attr) noexcept
-{
-  nvtxDomainMarkEx(domain::get<D>(), attr.get());
-}
-
-}  // namespace nvtx3
-
-/**
- * @brief Convenience macro for generating a range in the specified `domain`
- * from the lifetime of a function
- *
- * This macro is useful for generating an NVTX range in `domain` from
- * the entry point of a function to its exit. It is intended to be the first
- * line of the function.
- *
- * Constructs a static `registered_message` using the name of the immediately
- * enclosing function returned by `__func__` and constructs a
- * `nvtx3::thread_range` using the registered function name as the range's
- * message.
- *
- * Example:
- * ```
- * struct my_domain{static constexpr char const* name{"my_domain"};};
- *
- * void foo(...){
- *    NVTX3_FUNC_RANGE_IN(my_domain); // Range begins on entry to foo()
- *    // do stuff
- *    ...
- * } // Range ends on return from foo()
- * ```
- *
- * @param[in] D Type containing `name` member used to identify the
- * `domain` to which the `registered_message` belongs. Else,
- * `domain::global` to  indicate that the global NVTX domain should be used.
- */
-#define NVTX3_FUNC_RANGE_IN(D)                                                 \
-  static ::nvtx3::registered_message<D> const nvtx3_func_name__{__func__};     \
-  static ::nvtx3::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \
-  ::nvtx3::domain_thread_range<D> const nvtx3_range__{nvtx3_func_attr__};
-
-/**
- * @brief Convenience macro for generating a range in the global domain from the
- * lifetime of a function.
- *
- * This macro is useful for generating an NVTX range in the global domain from
- * the entry point of a function to its exit. It is intended to be the first
- * line of the function.
- *
- * Constructs a static `registered_message` using the name of the immediately
- * enclosing function returned by `__func__` and constructs a
- * `nvtx3::thread_range` using the registered function name as the range's
- * message.
- *
- * Example:
- * ```
- * void foo(...){
- *    NVTX3_FUNC_RANGE(); // Range begins on entry to foo()
- *    // do stuff
- *    ...
- * } // Range ends on return from foo()
- * ```
- */
-#define NVTX3_FUNC_RANGE() NVTX3_FUNC_RANGE_IN(::nvtx3::domain::global)
\ No newline at end of file

From cd2119096ddbfd38705f6b9326e0ce81a1288fe0 Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@nvidia.com>
Date: Fri, 7 Oct 2022 11:02:44 -0700
Subject: [PATCH 17/36] num_successes_ removed

---
 benchmarks/hash_table/static_map_bench.cu   |  40 +++----
 include/cuco/detail/dynamic_map.inl         |  49 ++++-----
 include/cuco/detail/dynamic_map_kernels.cuh | 110 +++++++-------------
 include/cuco/dynamic_map.cuh                |   1 -
 tests/dynamic_map/erase_test.cu             |  11 +-
 5 files changed, 85 insertions(+), 126 deletions(-)

diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu
index ce1015b8d..04b0e5372 100644
--- a/benchmarks/hash_table/static_map_bench.cu
+++ b/benchmarks/hash_table/static_map_bench.cu
@@ -293,56 +293,60 @@ static void BM_static_map_erase_none(::benchmark::State& state)
                           int64_t(state.range(0)));
 }
 
-/*
-BENCHMARK_TEMPLATE(BM_static_map_erase_none, int64_t, int64_t, dist_type::UNIFORM)
-  ->Unit(benchmark::kMillisecond)
-  ->Apply(generate_size_and_occupancy);
-
-BENCHMARK_TEMPLATE(BM_static_map_erase_none, int32_t, int32_t, dist_type::UNIFORM)
+BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
-  ->Apply(generate_size_and_occupancy);
+  ->Apply(generate_size_and_occupancy)
+  ->UseManualTime();
 
-BENCHMARK_TEMPLATE(BM_static_map_erase_all, int64_t, int64_t, dist_type::UNIFORM)
+BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(generate_size_and_occupancy);
 
-BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIFORM)
+BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIFORM)
   ->Unit(benchmark::kMillisecond)
-  ->Apply(generate_size_and_occupancy);
+  ->Apply(generate_size_and_occupancy)
+  ->UseManualTime();
 
-BENCHMARK_TEMPLATE(BM_static_map_search_none, int64_t, int64_t, dist_type::UNIQUE)
+BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::UNIFORM)
   ->Unit(benchmark::kMillisecond)
   ->Apply(generate_size_and_occupancy);
-*/
 
 BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::GAUSSIAN)
   ->Unit(benchmark::kMillisecond)
   ->Apply(generate_size_and_occupancy)
   ->UseManualTime();
 
-BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::UNIQUE)
+BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::GAUSSIAN)
   ->Unit(benchmark::kMillisecond)
   ->Apply(generate_size_and_occupancy);
 
-BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIFORM)
+BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(generate_size_and_occupancy)
   ->UseManualTime();
 
-BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::UNIFORM)
+BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(generate_size_and_occupancy);
 
-BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::GAUSSIAN)
+BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIFORM)
   ->Unit(benchmark::kMillisecond)
   ->Apply(generate_size_and_occupancy)
   ->UseManualTime();
 
-BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::GAUSSIAN)
+BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::UNIFORM)
   ->Unit(benchmark::kMillisecond)
   ->Apply(generate_size_and_occupancy);
 
-BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIQUE)
+BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::GAUSSIAN)
   ->Unit(benchmark::kMillisecond)
   ->Apply(generate_size_and_occupancy)
   ->UseManualTime();
+
+BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::GAUSSIAN)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(generate_size_and_occupancy);
+
+BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIQUE)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(generate_size_and_occupancy);
diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index 0e0020e97..4b857256a 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -39,7 +39,8 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
     alloc));
   submap_views_.push_back(submaps_[0]->get_device_view());
   submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
-  num_successes_ = std::allocator_traits<counter_allocator_type>::allocate(counter_allocator_, 1);
+  submap_num_successes_.push_back(submaps_[0]->get_num_successes());
+  d_submap_num_successes_ = submap_num_successes_;
 }
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -69,13 +70,11 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
   submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
   submap_num_successes_.push_back(submaps_[0]->get_num_successes());
   d_submap_num_successes_ = submap_num_successes_;
-  num_successes_ = std::allocator_traits<counter_allocator_type>::allocate(counter_allocator_, 1);
 }
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
 dynamic_map<Key, Value, Scope, Allocator>::~dynamic_map()
 {
-  std::allocator_traits<counter_allocator_type>::deallocate(counter_allocator_, num_successes_, 1);
 }
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -108,6 +107,8 @@ void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n)
           sentinel::empty_key<Key>{empty_key_sentinel_},
           sentinel::empty_value<Value>{empty_value_sentinel_},
           alloc_));
+        submap_num_successes_.push_back(submaps_[submap_idx]->get_num_successes());
+        d_submap_num_successes_ = submap_num_successes_;
       }
       submap_views_.push_back(submaps_[submap_idx]->get_device_view());
       submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view());
@@ -140,7 +141,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(InputIt first,
     if (capacity_remaining >= min_insert_size_) {
       // TODO: memset an atomic variable is unsafe
       static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
-      CUCO_CUDA_TRY(cudaMemset(num_successes_, 0, sizeof(atomic_ctr_type)));
+      CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[submap_idx], 0, sizeof(atomic_ctr_type)));
 
       auto n                = std::min(capacity_remaining, num_to_insert);
       auto const block_size = 128;
@@ -153,7 +154,8 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(InputIt first,
                                     first + n,
                                     submap_views_.data().get(),
                                     submap_mutable_views_.data().get(),
-                                    num_successes_,
+                                    //num_successes_,
+                                    d_submap_num_successes_.data().get(),
                                     submap_idx,
                                     submaps_.size(),
                                     hash,
@@ -161,7 +163,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(InputIt first,
 
       std::size_t h_num_successes;
       CUCO_CUDA_TRY(cudaMemcpy(
-        &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost));
+        &h_num_successes, submap_num_successes_[submap_idx], sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost));
       submaps_[submap_idx]->size_ += h_num_successes;
       size_ += h_num_successes;
       first += n;
@@ -187,14 +189,11 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
 
   // TODO: memset an atomic variable is unsafe
   static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
-  CUCO_CUDA_TRY(cudaMemset(num_successes_, 0, sizeof(atomic_ctr_type)));
 
   // zero out submap success counters
-  if (submaps_.size() > 1) {
-    static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
-    for (uint32_t i = 0; i < submaps_.size(); ++i) {
-      CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type)));
-    }
+  static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
+  for (uint32_t i = 0; i < submaps_.size(); ++i) {
+    CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type)));
   }
 
   auto const temp_storage_size = submaps_.size() * sizeof(unsigned long long);
@@ -203,29 +202,19 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
     <<<grid_size, block_size, temp_storage_size>>>(first,
                                                    first + num_keys,
                                                    submap_mutable_views_.data().get(),
-                                                   num_successes_,
                                                    d_submap_num_successes_.data().get(),
                                                    submaps_.size(),
                                                    hash,
                                                    key_equal);
 
-  // update total dynamic map size
-  std::size_t h_num_successes;
-  CUCO_CUDA_TRY(
-    cudaMemcpy(&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost));
-  size_ -= h_num_successes;
-
-  if (submaps_.size() == 1) {
-    submaps_[0]->size_ -= h_num_successes;
-  } else {
-    for (uint32_t i = 0; i < submaps_.size(); ++i) {
-      std::size_t h_submap_num_successes;
-      CUCO_CUDA_TRY(cudaMemcpy(&h_submap_num_successes,
-                               submap_num_successes_[i],
-                               sizeof(atomic_ctr_type),
-                               cudaMemcpyDeviceToHost));
-      submaps_[i]->size_ -= h_submap_num_successes;
-    }
+  for (uint32_t i = 0; i < submaps_.size(); ++i) {
+    std::size_t h_submap_num_successes;
+    CUCO_CUDA_TRY(cudaMemcpy(&h_submap_num_successes,
+                              submap_num_successes_[i],
+                              sizeof(atomic_ctr_type),
+                              cudaMemcpyDeviceToHost));
+    submaps_[i]->size_ -= h_submap_num_successes;
+    size_ -= h_submap_num_successes;
   }
 }
 
diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
index 913149021..7e2f84fce 100644
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
@@ -147,7 +147,8 @@ __global__ void insert(InputIt first,
                        InputIt last,
                        viewT* submap_views,
                        mutableViewT* submap_mutable_views,
-                       atomicT* num_successes,
+                       //atomicT* num_successes,
+                       atomicT** submap_num_successes,
                        uint32_t insert_idx,
                        uint32_t num_submaps,
                        Hash hash,
@@ -183,7 +184,10 @@ __global__ void insert(InputIt first,
   }
 
   std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
-  if (threadIdx.x == 0) { *num_successes += block_num_successes; }
+  if (threadIdx.x == 0) { 
+    //*num_successes += block_num_successes;
+    *submap_num_successes[insert_idx] += block_num_successes;
+  }
 }
 
 /**
@@ -221,55 +225,37 @@ template <uint32_t block_size,
 __global__ void erase(InputIt first,
                       InputIt last,
                       mutableViewT* submap_mutable_views,
-                      atomicT* num_successes,
                       atomicT** submap_num_successes,
                       const uint32_t num_submaps,
                       Hash hash,
                       KeyEqual key_equal)
 {
   typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
   extern __shared__ unsigned long long submap_block_num_successes[];
 
-  std::size_t thread_num_successes = 0;
-
   auto tid = block_size * blockIdx.x + threadIdx.x;
   auto it  = first + tid;
 
-  if (num_submaps > 1) {
-    for (int i = threadIdx.x; i < num_submaps; i += block_size)
-      submap_block_num_successes[i] = 0;
-    __syncthreads();
+  for (int i = threadIdx.x; i < num_submaps; i += block_size)
+    submap_block_num_successes[i] = 0;
+  __syncthreads();
 
-    while (it < last) {
-      int i;
-      for (i = 0; i < num_submaps; ++i) {
-        if (submap_mutable_views[i].erase(*it, hash, key_equal)) {
-          thread_num_successes++;
-          atomicAdd(&submap_block_num_successes[i], 1);
-          break;
-        }
+  while (it < last) {
+    int i;
+    for (i = 0; i < num_submaps; ++i) {
+      if (submap_mutable_views[i].erase(*it, hash, key_equal)) {
+        atomicAdd(&submap_block_num_successes[i], 1);
+        break;
       }
-      it += gridDim.x * blockDim.x;
-    }
-  } else {
-    while (it < last) {
-      if (submap_mutable_views[0].erase(*it, hash, key_equal)) thread_num_successes++;
-      it += gridDim.x * blockDim.x;
     }
+    it += gridDim.x * blockDim.x;
   }
+  __syncthreads();
 
-  std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
-  if (threadIdx.x == 0) {
-    num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed);
-  }
-
-  if (num_submaps > 1) {
-    for (int i = 0; i < num_submaps; ++i) {
-      if (threadIdx.x == 0) {
-        submap_num_successes[i]->fetch_add(static_cast<std::size_t>(submap_block_num_successes[i]),
-                                           cuda::std::memory_order_relaxed);
-      }
+  for (int i = 0; i < num_submaps; ++i) {
+    if (threadIdx.x == 0) {
+      submap_num_successes[i]->fetch_add(static_cast<std::size_t>(submap_block_num_successes[i]),
+                                          cuda::std::memory_order_relaxed);
     }
   }
 }
@@ -311,60 +297,40 @@ template <uint32_t block_size,
 __global__ void erase(InputIt first,
                       InputIt last,
                       mutableViewT* submap_mutable_views,
-                      atomicT* num_successes,
                       atomicT** submap_num_successes,
                       const uint32_t num_submaps,
                       Hash hash,
                       KeyEqual key_equal)
 {
   typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
   extern __shared__ unsigned long long submap_block_num_successes[];
 
-  std::size_t thread_num_successes = 0;
-
   auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
   auto tid  = block_size * blockIdx.x + threadIdx.x;
   auto it   = first + tid / tile_size;
 
-  if (num_submaps > 1) {
-    for (int i = threadIdx.x; i < num_submaps; i += block_size)
-      submap_block_num_successes[i] = 0;
-    __syncthreads();
+  for (int i = threadIdx.x; i < num_submaps; i += block_size)
+    submap_block_num_successes[i] = 0;
+  __syncthreads();
 
-    while (it < last) {
-      auto erased = false;
-      int i;
-      for (i = 0; i < num_submaps; ++i) {
-        erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal);
-        if (erased) { break; }
-      }
-      if (erased && tile.thread_rank() == 0) {
-        thread_num_successes++;
-        atomicAdd(&submap_block_num_successes[i], 1);
-      }
-      it += (gridDim.x * blockDim.x) / tile_size;
+  while (it < last) {
+    auto erased = false;
+    int i;
+    for (i = 0; i < num_submaps; ++i) {
+      erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal);
+      if (erased) { break; }
     }
-  } else {
-    while (it < last) {
-      auto erased = submap_mutable_views[0].erase(tile, *it, hash, key_equal);
-      if (erased && tile.thread_rank() == 0) thread_num_successes++;
-
-      it += (gridDim.x * blockDim.x) / tile_size;
+    if (erased && tile.thread_rank() == 0) {
+      atomicAdd(&submap_block_num_successes[i], 1);
     }
+    it += (gridDim.x * blockDim.x) / tile_size;
   }
+  __syncthreads();
 
-  std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
-  if (threadIdx.x == 0) {
-    num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed);
-  }
-
-  if (num_submaps > 1) {
-    for (int i = 0; i < num_submaps; ++i) {
-      if (threadIdx.x == 0) {
-        submap_num_successes[i]->fetch_add(static_cast<std::size_t>(submap_block_num_successes[i]),
-                                           cuda::std::memory_order_relaxed);
-      }
+  for (int i = 0; i < num_submaps; ++i) {
+    if (threadIdx.x == 0) {
+      submap_num_successes[i]->fetch_add(static_cast<std::size_t>(submap_block_num_successes[i]),
+                                          cuda::std::memory_order_relaxed);
     }
   }
 }
diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index f34eb3d86..d22ff1d8c 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -338,7 +338,6 @@ class dynamic_map {
   thrust::device_vector<mutable_view_type>
     submap_mutable_views_;          ///< vector of mutable device views for each submap
   std::size_t min_insert_size_{};   ///< min remaining capacity of submap for insert
-  atomic_ctr_type* num_successes_;  ///< number of successfully inserted keys on insert
   std::vector<atomic_ctr_type*>
     submap_num_successes_;  ///< number of succesfully erased keys for each submap
   thrust::device_vector<atomic_ctr_type*>
diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu
index 1c81f400a..4d046a89b 100644
--- a/tests/dynamic_map/erase_test.cu
+++ b/tests/dynamic_map/erase_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,14 +14,15 @@
  * limitations under the License.
  */
 
-#include <catch2/catch.hpp>
+#include <utils.hpp>
+#include <cuco/dynamic_map.cuh>
+
 #include <thrust/device_vector.h>
 #include <thrust/execution_policy.h>
 #include <thrust/sequence.h>
 
-#include <cuco/dynamic_map.cuh>
+#include <catch2/catch.hpp>
 
-#include <utils.hpp>
 
 TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t))
 {
@@ -121,7 +122,7 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t))
     REQUIRE(cuco::test::all_of(d_keys_exist2.begin() + 2 * num_keys,
                                d_keys_exist2.end(),
                                [] __device__(const bool key_found) { return key_found; }));
-
+    
     REQUIRE(map.get_size() == 2 * num_keys);
     // check that keys can be successfully deleted from all submaps (some will be unsuccessful
     // erases)

From 4c1952da326eaf38ca581b7bc2b5085ba9a2fdfc Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@nvidia.com>
Date: Fri, 7 Oct 2022 16:17:34 -0700
Subject: [PATCH 18/36] doxygen warning fixes

---
 include/cuco/detail/dynamic_map.inl | 50 ++++++++++++++---------------
 include/cuco/dynamic_map.cuh        | 45 ++++++++++++++++----------
 include/cuco/static_map.cuh         |  6 ++++
 3 files changed, 58 insertions(+), 43 deletions(-)

diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index 4b857256a..33def1f83 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -21,7 +21,8 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
   std::size_t initial_capacity,
   sentinel::empty_key<Key> empty_key_sentinel,
   sentinel::empty_value<Value> empty_value_sentinel,
-  Allocator const& alloc)
+  Allocator const& alloc,
+  cudaStream_t stream)
   : empty_key_sentinel_(empty_key_sentinel.value),
     empty_value_sentinel_(empty_value_sentinel.value),
     erased_key_sentinel_(empty_key_sentinel.value),
@@ -36,7 +37,7 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
     initial_capacity,
     sentinel::empty_key<Key>{empty_key_sentinel},
     sentinel::empty_value<Value>{empty_value_sentinel},
-    alloc));
+    alloc, stream));
   submap_views_.push_back(submaps_[0]->get_device_view());
   submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
   submap_num_successes_.push_back(submaps_[0]->get_num_successes());
@@ -49,7 +50,8 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
   sentinel::empty_key<Key> empty_key_sentinel,
   sentinel::empty_value<Value> empty_value_sentinel,
   sentinel::erased_key<Key> erased_key_sentinel,
-  Allocator const& alloc)
+  Allocator const& alloc,
+  cudaStream_t stream)
   : empty_key_sentinel_(empty_key_sentinel.value),
     empty_value_sentinel_(empty_value_sentinel.value),
     erased_key_sentinel_(erased_key_sentinel.value),
@@ -65,7 +67,7 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
     sentinel::empty_key<Key>{empty_key_sentinel_},
     sentinel::empty_value<Value>{empty_value_sentinel_},
     sentinel::erased_key<Key>{erased_key_sentinel_},
-    alloc));
+    alloc, stream));
   submap_views_.push_back(submaps_[0]->get_device_view());
   submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
   submap_num_successes_.push_back(submaps_[0]->get_num_successes());
@@ -73,12 +75,7 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
 }
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-dynamic_map<Key, Value, Scope, Allocator>::~dynamic_map()
-{
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n)
+void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n, cudaStream_t stream)
 {
   int64_t num_elements_remaining = n;
   uint32_t submap_idx            = 0;
@@ -98,18 +95,16 @@ void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n)
           sentinel::empty_key<Key>{empty_key_sentinel_},
           sentinel::empty_value<Value>{empty_value_sentinel_},
           sentinel::erased_key<Key>{erased_key_sentinel_},
-          alloc_));
-        submap_num_successes_.push_back(submaps_[submap_idx]->get_num_successes());
-        d_submap_num_successes_ = submap_num_successes_;
+          alloc_, stream));
       } else {
         submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>(
           submap_capacity,
           sentinel::empty_key<Key>{empty_key_sentinel_},
           sentinel::empty_value<Value>{empty_value_sentinel_},
-          alloc_));
-        submap_num_successes_.push_back(submaps_[submap_idx]->get_num_successes());
-        d_submap_num_successes_ = submap_num_successes_;
+          alloc_, stream));
       }
+      submap_num_successes_.push_back(submaps_[submap_idx]->get_num_successes());
+      d_submap_num_successes_ = submap_num_successes_;
       submap_views_.push_back(submaps_[submap_idx]->get_device_view());
       submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view());
       capacity_ *= 2;
@@ -125,11 +120,12 @@ template <typename InputIt, typename Hash, typename KeyEqual>
 void dynamic_map<Key, Value, Scope, Allocator>::insert(InputIt first,
                                                        InputIt last,
                                                        Hash hash,
-                                                       KeyEqual key_equal)
+                                                       KeyEqual key_equal,
+                                                       cudaStream_t stream)
 {
   std::size_t num_to_insert = std::distance(first, last);
 
-  reserve(size_ + num_to_insert);
+  reserve(size_ + num_to_insert, stream);
 
   uint32_t submap_idx = 0;
   while (num_to_insert > 0) {
@@ -150,11 +146,10 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(InputIt first,
       auto const grid_size  = (tile_size * n + stride * block_size - 1) / (stride * block_size);
 
       detail::insert<block_size, tile_size, cuco::pair_type<key_type, mapped_type>>
-        <<<grid_size, block_size>>>(first,
+        <<<grid_size, block_size, 0, stream>>>(first,
                                     first + n,
                                     submap_views_.data().get(),
                                     submap_mutable_views_.data().get(),
-                                    //num_successes_,
                                     d_submap_num_successes_.data().get(),
                                     submap_idx,
                                     submaps_.size(),
@@ -178,7 +173,8 @@ template <typename InputIt, typename Hash, typename KeyEqual>
 void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
                                                       InputIt last,
                                                       Hash hash,
-                                                      KeyEqual key_equal)
+                                                      KeyEqual key_equal,
+                                                      cudaStream_t stream)
 {
   std::size_t num_keys = std::distance(first, last);
 
@@ -199,7 +195,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
   auto const temp_storage_size = submaps_.size() * sizeof(unsigned long long);
 
   detail::erase<block_size, tile_size, cuco::pair_type<key_type, mapped_type>>
-    <<<grid_size, block_size, temp_storage_size>>>(first,
+    <<<grid_size, block_size, temp_storage_size, stream>>>(first,
                                                    first + num_keys,
                                                    submap_mutable_views_.data().get(),
                                                    d_submap_num_successes_.data().get(),
@@ -221,7 +217,8 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
 template <typename InputIt, typename OutputIt, typename Hash, typename KeyEqual>
 void dynamic_map<Key, Value, Scope, Allocator>::find(
-  InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal)
+  InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal,
+  cudaStream_t stream)
 {
   auto num_keys         = std::distance(first, last);
   auto const block_size = 128;
@@ -229,7 +226,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::find(
   auto const tile_size  = 4;
   auto const grid_size  = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
 
-  detail::find<block_size, tile_size, Value><<<grid_size, block_size>>>(
+  detail::find<block_size, tile_size, Value><<<grid_size, block_size, 0, stream>>>(
     first, last, output_begin, submap_views_.data().get(), submaps_.size(), hash, key_equal);
   CUCO_CUDA_TRY(cudaDeviceSynchronize());
 }
@@ -237,7 +234,8 @@ void dynamic_map<Key, Value, Scope, Allocator>::find(
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
 template <typename InputIt, typename OutputIt, typename Hash, typename KeyEqual>
 void dynamic_map<Key, Value, Scope, Allocator>::contains(
-  InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal)
+  InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal,
+  cudaStream_t stream)
 {
   auto num_keys         = std::distance(first, last);
   auto const block_size = 128;
@@ -245,7 +243,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::contains(
   auto const tile_size  = 4;
   auto const grid_size  = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
 
-  detail::contains<block_size, tile_size><<<grid_size, block_size>>>(
+  detail::contains<block_size, tile_size><<<grid_size, block_size, 0, stream>>>(
     first, last, output_begin, submap_views_.data().get(), submaps_.size(), hash, key_equal);
   CUCO_CUDA_TRY(cudaDeviceSynchronize());
 }
diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index d22ff1d8c..f2239b75e 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -101,14 +101,14 @@ class dynamic_map {
   static_assert(std::is_arithmetic<Key>::value, "Unsupported, non-arithmetic key type.");
 
  public:
-  using value_type        = cuco::pair_type<Key, Value>;
-  using key_type          = Key;
-  using mapped_type       = Value;
-  using atomic_ctr_type   = cuda::atomic<std::size_t, Scope>;
-  using view_type         = typename static_map<Key, Value, Scope>::device_view;
-  using mutable_view_type = typename static_map<Key, Value, Scope>::device_mutable_view;
-  using counter_allocator_type =
-    typename std::allocator_traits<Allocator>::rebind_alloc<atomic_ctr_type>;
+  using value_type        = cuco::pair_type<Key, Value>; ///< Type of key/value pairs
+  using key_type          = Key;                         ///< Key type
+  using mapped_type       = Value;                       ///< Type of mapped values
+  using atomic_ctr_type   = cuda::atomic<std::size_t, Scope>; ///< Atomic counter type
+  using view_type         = typename static_map<Key, Value, Scope>::device_view; ///< Type for submap device view
+  using mutable_view_type = typename static_map<Key, Value, Scope>::device_mutable_view; ///< Type for submap mutable device view
+  using counter_allocator_type = typename std::allocator_traits<Allocator>::rebind_alloc<
+    atomic_ctr_type>; ///< Type of the allocator to (de)allocate atomic counters
 
   dynamic_map(dynamic_map const&) = delete;
   dynamic_map(dynamic_map&&)      = delete;
@@ -141,11 +141,13 @@ class dynamic_map {
    * @param empty_key_sentinel The reserved key value for empty slots
    * @param empty_value_sentinel The reserved mapped value for empty slots
    * @param alloc Allocator used to allocate submap device storage
+   * @param stream Stream used for executing the kernels
    */
   dynamic_map(std::size_t initial_capacity,
               sentinel::empty_key<Key> empty_key_sentinel,
               sentinel::empty_value<Value> empty_value_sentinel,
-              Allocator const& alloc = Allocator{});
+              Allocator const& alloc = Allocator{},
+              cudaStream_t stream = 0);
 
   /**
    * @brief Construct a dynamically-sized map with erase capability.
@@ -162,11 +164,11 @@ class dynamic_map {
    * that contains either.
    *
    * @param initial_capacity The initial number of slots in the map
-   * @param growth_factor The factor by which the capacity increases when resizing
    * @param empty_key_sentinel The reserved key value for empty slots
    * @param empty_value_sentinel The reserved mapped value for empty slots
    * @param erased_key_sentinel The reserved key value for erased slots
    * @param alloc Allocator used to allocate submap device storage
+   * @param stream Stream used for executing the kernels
    *
    * @throw std::runtime error if the empty key sentinel and erased key sentinel
    * are the same value
@@ -175,13 +177,14 @@ class dynamic_map {
               sentinel::empty_key<Key> empty_key_sentinel,
               sentinel::empty_value<Value> empty_value_sentinel,
               sentinel::erased_key<Key> erased_key_sentinel,
-              Allocator const& alloc = Allocator{});
+              Allocator const& alloc = Allocator{},
+              cudaStream_t stream = 0);
 
   /**
    * @brief Destroy the map and frees its contents
    *
    */
-  ~dynamic_map();
+  ~dynamic_map() {}
 
   /**
    * @brief Grows the capacity of the map so there is enough space for `n` key/value pairs.
@@ -189,8 +192,9 @@ class dynamic_map {
    * If there is already enough space for `n` key/value pairs, the capacity remains the same.
    *
    * @param n The number of key value pairs for which there must be space
+   * @param stream Stream used for executing the kernels
    */
-  void reserve(std::size_t n);
+  void reserve(std::size_t n, cudaStream_t stream = 0);
 
   /**
    * @brief Inserts all key/value pairs in the range `[first, last)`.
@@ -206,11 +210,13 @@ class dynamic_map {
    * @param last End of the sequence of key/value pairs
    * @param hash The unary function to apply to hash each key
    * @param key_equal The binary function to compare two keys for equality
+   * @param stream Stream used for executing the kernels
    */
   template <typename InputIt,
             typename Hash     = cuco::detail::MurmurHash3_32<key_type>,
             typename KeyEqual = thrust::equal_to<key_type>>
-  void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{});
+  void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{},
+              cudaStream_t stream = 0);
 
   /**
    * @brief Erases keys in the range `[first, last)`.
@@ -244,7 +250,8 @@ class dynamic_map {
   template <typename InputIt,
             typename Hash     = cuco::detail::MurmurHash3_32<key_type>,
             typename KeyEqual = thrust::equal_to<key_type>>
-  void erase(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{});
+  void erase(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{},
+             cudaStream_t stream = 0);
 
   /**
    * @brief Finds the values corresponding to all keys in the range `[first, last)`.
@@ -263,6 +270,7 @@ class dynamic_map {
    * @param output_begin Beginning of the sequence of values retrieved for each key
    * @param hash The unary function to apply to hash each key
    * @param key_equal The binary function to compare two keys for equality
+   * @param stream Stream used for executing the kernels
    */
   template <typename InputIt,
             typename OutputIt,
@@ -272,7 +280,8 @@ class dynamic_map {
             InputIt last,
             OutputIt output_begin,
             Hash hash          = Hash{},
-            KeyEqual key_equal = KeyEqual{});
+            KeyEqual key_equal = KeyEqual{},
+            cudaStream_t stream = 0);
 
   /**
    * @brief Indicates whether the keys in the range `[first, last)` are contained in the map.
@@ -290,6 +299,7 @@ class dynamic_map {
    * @param output_begin Beginning of the sequence of booleans for the presence of each key
    * @param hash The unary function to apply to hash each key
    * @param key_equal The binary function to compare two keys for equality
+   * @param stream Stream used for executing the kernels
    */
   template <typename InputIt,
             typename OutputIt,
@@ -299,7 +309,8 @@ class dynamic_map {
                 InputIt last,
                 OutputIt output_begin,
                 Hash hash          = Hash{},
-                KeyEqual key_equal = KeyEqual{});
+                KeyEqual key_equal = KeyEqual{},
+                cudaStream_t stream = 0);
 
   /**
    * @brief Gets the current number of elements in the map
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
index f72ce41c5..a55a726f7 100644
--- a/include/cuco/static_map.cuh
+++ b/include/cuco/static_map.cuh
@@ -1414,6 +1414,12 @@ class static_map {
                                sentinel::erased_key<Key>{erased_key_sentinel_});
   }
 
+  /**
+   * @brief Gets the number of successfully inserted/erased keys from the last
+   * insert/erase operation
+   *
+   * @return Number of successfully inserted/erased keys from the last insert/erase operation
+   */
   atomic_ctr_type* get_num_successes() const noexcept { return num_successes_; }
 
  private:

From 80f4d14265a63c35ce5e8ea4eb0f9893c33f7980 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 7 Oct 2022 23:17:51 +0000
Subject: [PATCH 19/36] [pre-commit.ci] auto code formatting

---
 include/cuco/detail/dynamic_map.inl         | 84 +++++++++++----------
 include/cuco/detail/dynamic_map_kernels.cuh | 12 ++-
 include/cuco/dynamic_map.cuh                | 43 ++++++-----
 tests/dynamic_map/erase_test.cu             |  5 +-
 4 files changed, 78 insertions(+), 66 deletions(-)

diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index 33def1f83..8dcfd89cb 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -37,7 +37,8 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
     initial_capacity,
     sentinel::empty_key<Key>{empty_key_sentinel},
     sentinel::empty_value<Value>{empty_value_sentinel},
-    alloc, stream));
+    alloc,
+    stream));
   submap_views_.push_back(submaps_[0]->get_device_view());
   submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
   submap_num_successes_.push_back(submaps_[0]->get_num_successes());
@@ -67,7 +68,8 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
     sentinel::empty_key<Key>{empty_key_sentinel_},
     sentinel::empty_value<Value>{empty_value_sentinel_},
     sentinel::erased_key<Key>{erased_key_sentinel_},
-    alloc, stream));
+    alloc,
+    stream));
   submap_views_.push_back(submaps_[0]->get_device_view());
   submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
   submap_num_successes_.push_back(submaps_[0]->get_num_successes());
@@ -95,13 +97,15 @@ void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n, cudaStrea
           sentinel::empty_key<Key>{empty_key_sentinel_},
           sentinel::empty_value<Value>{empty_value_sentinel_},
           sentinel::erased_key<Key>{erased_key_sentinel_},
-          alloc_, stream));
+          alloc_,
+          stream));
       } else {
         submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>(
           submap_capacity,
           sentinel::empty_key<Key>{empty_key_sentinel_},
           sentinel::empty_value<Value>{empty_value_sentinel_},
-          alloc_, stream));
+          alloc_,
+          stream));
       }
       submap_num_successes_.push_back(submaps_[submap_idx]->get_num_successes());
       d_submap_num_successes_ = submap_num_successes_;
@@ -117,11 +121,8 @@ void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n, cudaStrea
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
 template <typename InputIt, typename Hash, typename KeyEqual>
-void dynamic_map<Key, Value, Scope, Allocator>::insert(InputIt first,
-                                                       InputIt last,
-                                                       Hash hash,
-                                                       KeyEqual key_equal,
-                                                       cudaStream_t stream)
+void dynamic_map<Key, Value, Scope, Allocator>::insert(
+  InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream)
 {
   std::size_t num_to_insert = std::distance(first, last);
 
@@ -147,18 +148,20 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(InputIt first,
 
       detail::insert<block_size, tile_size, cuco::pair_type<key_type, mapped_type>>
         <<<grid_size, block_size, 0, stream>>>(first,
-                                    first + n,
-                                    submap_views_.data().get(),
-                                    submap_mutable_views_.data().get(),
-                                    d_submap_num_successes_.data().get(),
-                                    submap_idx,
-                                    submaps_.size(),
-                                    hash,
-                                    key_equal);
+                                               first + n,
+                                               submap_views_.data().get(),
+                                               submap_mutable_views_.data().get(),
+                                               d_submap_num_successes_.data().get(),
+                                               submap_idx,
+                                               submaps_.size(),
+                                               hash,
+                                               key_equal);
 
       std::size_t h_num_successes;
-      CUCO_CUDA_TRY(cudaMemcpy(
-        &h_num_successes, submap_num_successes_[submap_idx], sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost));
+      CUCO_CUDA_TRY(cudaMemcpy(&h_num_successes,
+                               submap_num_successes_[submap_idx],
+                               sizeof(atomic_ctr_type),
+                               cudaMemcpyDeviceToHost));
       submaps_[submap_idx]->size_ += h_num_successes;
       size_ += h_num_successes;
       first += n;
@@ -170,11 +173,8 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(InputIt first,
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
 template <typename InputIt, typename Hash, typename KeyEqual>
-void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
-                                                      InputIt last,
-                                                      Hash hash,
-                                                      KeyEqual key_equal,
-                                                      cudaStream_t stream)
+void dynamic_map<Key, Value, Scope, Allocator>::erase(
+  InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream)
 {
   std::size_t num_keys = std::distance(first, last);
 
@@ -196,19 +196,19 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
 
   detail::erase<block_size, tile_size, cuco::pair_type<key_type, mapped_type>>
     <<<grid_size, block_size, temp_storage_size, stream>>>(first,
-                                                   first + num_keys,
-                                                   submap_mutable_views_.data().get(),
-                                                   d_submap_num_successes_.data().get(),
-                                                   submaps_.size(),
-                                                   hash,
-                                                   key_equal);
+                                                           first + num_keys,
+                                                           submap_mutable_views_.data().get(),
+                                                           d_submap_num_successes_.data().get(),
+                                                           submaps_.size(),
+                                                           hash,
+                                                           key_equal);
 
   for (uint32_t i = 0; i < submaps_.size(); ++i) {
     std::size_t h_submap_num_successes;
     CUCO_CUDA_TRY(cudaMemcpy(&h_submap_num_successes,
-                              submap_num_successes_[i],
-                              sizeof(atomic_ctr_type),
-                              cudaMemcpyDeviceToHost));
+                             submap_num_successes_[i],
+                             sizeof(atomic_ctr_type),
+                             cudaMemcpyDeviceToHost));
     submaps_[i]->size_ -= h_submap_num_successes;
     size_ -= h_submap_num_successes;
   }
@@ -216,9 +216,12 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
 template <typename InputIt, typename OutputIt, typename Hash, typename KeyEqual>
-void dynamic_map<Key, Value, Scope, Allocator>::find(
-  InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal,
-  cudaStream_t stream)
+void dynamic_map<Key, Value, Scope, Allocator>::find(InputIt first,
+                                                     InputIt last,
+                                                     OutputIt output_begin,
+                                                     Hash hash,
+                                                     KeyEqual key_equal,
+                                                     cudaStream_t stream)
 {
   auto num_keys         = std::distance(first, last);
   auto const block_size = 128;
@@ -233,9 +236,12 @@ void dynamic_map<Key, Value, Scope, Allocator>::find(
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
 template <typename InputIt, typename OutputIt, typename Hash, typename KeyEqual>
-void dynamic_map<Key, Value, Scope, Allocator>::contains(
-  InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal,
-  cudaStream_t stream)
+void dynamic_map<Key, Value, Scope, Allocator>::contains(InputIt first,
+                                                         InputIt last,
+                                                         OutputIt output_begin,
+                                                         Hash hash,
+                                                         KeyEqual key_equal,
+                                                         cudaStream_t stream)
 {
   auto num_keys         = std::distance(first, last);
   auto const block_size = 128;
diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
index 7e2f84fce..0eeb1a632 100644
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
@@ -147,7 +147,7 @@ __global__ void insert(InputIt first,
                        InputIt last,
                        viewT* submap_views,
                        mutableViewT* submap_mutable_views,
-                       //atomicT* num_successes,
+                       // atomicT* num_successes,
                        atomicT** submap_num_successes,
                        uint32_t insert_idx,
                        uint32_t num_submaps,
@@ -184,7 +184,7 @@ __global__ void insert(InputIt first,
   }
 
   std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
-  if (threadIdx.x == 0) { 
+  if (threadIdx.x == 0) {
     //*num_successes += block_num_successes;
     *submap_num_successes[insert_idx] += block_num_successes;
   }
@@ -255,7 +255,7 @@ __global__ void erase(InputIt first,
   for (int i = 0; i < num_submaps; ++i) {
     if (threadIdx.x == 0) {
       submap_num_successes[i]->fetch_add(static_cast<std::size_t>(submap_block_num_successes[i]),
-                                          cuda::std::memory_order_relaxed);
+                                         cuda::std::memory_order_relaxed);
     }
   }
 }
@@ -320,9 +320,7 @@ __global__ void erase(InputIt first,
       erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal);
       if (erased) { break; }
     }
-    if (erased && tile.thread_rank() == 0) {
-      atomicAdd(&submap_block_num_successes[i], 1);
-    }
+    if (erased && tile.thread_rank() == 0) { atomicAdd(&submap_block_num_successes[i], 1); }
     it += (gridDim.x * blockDim.x) / tile_size;
   }
   __syncthreads();
@@ -330,7 +328,7 @@ __global__ void erase(InputIt first,
   for (int i = 0; i < num_submaps; ++i) {
     if (threadIdx.x == 0) {
       submap_num_successes[i]->fetch_add(static_cast<std::size_t>(submap_block_num_successes[i]),
-                                          cuda::std::memory_order_relaxed);
+                                         cuda::std::memory_order_relaxed);
     }
   }
 }
diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index f2239b75e..3386208aa 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -101,14 +101,17 @@ class dynamic_map {
   static_assert(std::is_arithmetic<Key>::value, "Unsupported, non-arithmetic key type.");
 
  public:
-  using value_type        = cuco::pair_type<Key, Value>; ///< Type of key/value pairs
-  using key_type          = Key;                         ///< Key type
-  using mapped_type       = Value;                       ///< Type of mapped values
-  using atomic_ctr_type   = cuda::atomic<std::size_t, Scope>; ///< Atomic counter type
-  using view_type         = typename static_map<Key, Value, Scope>::device_view; ///< Type for submap device view
-  using mutable_view_type = typename static_map<Key, Value, Scope>::device_mutable_view; ///< Type for submap mutable device view
+  using value_type      = cuco::pair_type<Key, Value>;       ///< Type of key/value pairs
+  using key_type        = Key;                               ///< Key type
+  using mapped_type     = Value;                             ///< Type of mapped values
+  using atomic_ctr_type = cuda::atomic<std::size_t, Scope>;  ///< Atomic counter type
+  using view_type =
+    typename static_map<Key, Value, Scope>::device_view;  ///< Type for submap device view
+  using mutable_view_type =
+    typename static_map<Key, Value, Scope>::device_mutable_view;  ///< Type for submap mutable
+                                                                  ///< device view
   using counter_allocator_type = typename std::allocator_traits<Allocator>::rebind_alloc<
-    atomic_ctr_type>; ///< Type of the allocator to (de)allocate atomic counters
+    atomic_ctr_type>;  ///< Type of the allocator to (de)allocate atomic counters
 
   dynamic_map(dynamic_map const&) = delete;
   dynamic_map(dynamic_map&&)      = delete;
@@ -147,7 +150,7 @@ class dynamic_map {
               sentinel::empty_key<Key> empty_key_sentinel,
               sentinel::empty_value<Value> empty_value_sentinel,
               Allocator const& alloc = Allocator{},
-              cudaStream_t stream = 0);
+              cudaStream_t stream    = 0);
 
   /**
    * @brief Construct a dynamically-sized map with erase capability.
@@ -178,7 +181,7 @@ class dynamic_map {
               sentinel::empty_value<Value> empty_value_sentinel,
               sentinel::erased_key<Key> erased_key_sentinel,
               Allocator const& alloc = Allocator{},
-              cudaStream_t stream = 0);
+              cudaStream_t stream    = 0);
 
   /**
    * @brief Destroy the map and frees its contents
@@ -215,7 +218,10 @@ class dynamic_map {
   template <typename InputIt,
             typename Hash     = cuco::detail::MurmurHash3_32<key_type>,
             typename KeyEqual = thrust::equal_to<key_type>>
-  void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{},
+  void insert(InputIt first,
+              InputIt last,
+              Hash hash           = Hash{},
+              KeyEqual key_equal  = KeyEqual{},
               cudaStream_t stream = 0);
 
   /**
@@ -250,7 +256,10 @@ class dynamic_map {
   template <typename InputIt,
             typename Hash     = cuco::detail::MurmurHash3_32<key_type>,
             typename KeyEqual = thrust::equal_to<key_type>>
-  void erase(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{},
+  void erase(InputIt first,
+             InputIt last,
+             Hash hash           = Hash{},
+             KeyEqual key_equal  = KeyEqual{},
              cudaStream_t stream = 0);
 
   /**
@@ -279,8 +288,8 @@ class dynamic_map {
   void find(InputIt first,
             InputIt last,
             OutputIt output_begin,
-            Hash hash          = Hash{},
-            KeyEqual key_equal = KeyEqual{},
+            Hash hash           = Hash{},
+            KeyEqual key_equal  = KeyEqual{},
             cudaStream_t stream = 0);
 
   /**
@@ -308,8 +317,8 @@ class dynamic_map {
   void contains(InputIt first,
                 InputIt last,
                 OutputIt output_begin,
-                Hash hash          = Hash{},
-                KeyEqual key_equal = KeyEqual{},
+                Hash hash           = Hash{},
+                KeyEqual key_equal  = KeyEqual{},
                 cudaStream_t stream = 0);
 
   /**
@@ -347,8 +356,8 @@ class dynamic_map {
     submaps_;                                      ///< vector of pointers to each submap
   thrust::device_vector<view_type> submap_views_;  ///< vector of device views for each submap
   thrust::device_vector<mutable_view_type>
-    submap_mutable_views_;          ///< vector of mutable device views for each submap
-  std::size_t min_insert_size_{};   ///< min remaining capacity of submap for insert
+    submap_mutable_views_;         ///< vector of mutable device views for each submap
+  std::size_t min_insert_size_{};  ///< min remaining capacity of submap for insert
   std::vector<atomic_ctr_type*>
     submap_num_successes_;  ///< number of succesfully erased keys for each submap
   thrust::device_vector<atomic_ctr_type*>
diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu
index 4d046a89b..0e53197ea 100644
--- a/tests/dynamic_map/erase_test.cu
+++ b/tests/dynamic_map/erase_test.cu
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include <utils.hpp>
 #include <cuco/dynamic_map.cuh>
+#include <utils.hpp>
 
 #include <thrust/device_vector.h>
 #include <thrust/execution_policy.h>
@@ -23,7 +23,6 @@
 
 #include <catch2/catch.hpp>
 
-
 TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t))
 {
   using Key   = T;
@@ -122,7 +121,7 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t))
     REQUIRE(cuco::test::all_of(d_keys_exist2.begin() + 2 * num_keys,
                                d_keys_exist2.end(),
                                [] __device__(const bool key_found) { return key_found; }));
-    
+
     REQUIRE(map.get_size() == 2 * num_keys);
     // check that keys can be successfully deleted from all submaps (some will be unsuccessful
     // erases)

From 66168897e2acc8f2cc3cf9a52259ee68e8ad2d10 Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@nvidia.com>
Date: Sun, 13 Nov 2022 22:33:00 -0800
Subject: [PATCH 20/36] code cleanup

---
 include/cuco/detail/dynamic_map.inl         | 13 +++++---
 include/cuco/detail/dynamic_map_kernels.cuh |  7 +++--
 include/cuco/dynamic_map.cuh                | 34 ++++++++++-----------
 3 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index 8dcfd89cb..b712a9f41 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -63,6 +63,9 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
     alloc_{alloc},
     counter_allocator_{alloc}
 {
+  CUCO_RUNTIME_EXPECTS(empty_key_sentinel_ != erased_key_sentinel_,
+                       "The empty key sentinel and erased key sentinel cannot be the same value.");
+
   submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>(
     initial_capacity,
     sentinel::empty_key<Key>{empty_key_sentinel_},
@@ -124,6 +127,10 @@ template <typename InputIt, typename Hash, typename KeyEqual>
 void dynamic_map<Key, Value, Scope, Allocator>::insert(
   InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream)
 {
+  // TODO: memset an atomic variable is unsafe
+  CUCO_RUNTIME_EXPECTS(sizeof(std::size_t) == sizeof(atomic_ctr_type),
+                       "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t).");
+  
   std::size_t num_to_insert = std::distance(first, last);
 
   reserve(size_ + num_to_insert, stream);
@@ -136,8 +143,6 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(
     // only if we meet the minimum insert size.
 
     if (capacity_remaining >= min_insert_size_) {
-      // TODO: memset an atomic variable is unsafe
-      static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
       CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[submap_idx], 0, sizeof(atomic_ctr_type)));
 
       auto n                = std::min(capacity_remaining, num_to_insert);
@@ -184,10 +189,10 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(
   auto const grid_size  = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
 
   // TODO: memset an atomic variable is unsafe
-  static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
+  CUCO_RUNTIME_EXPECTS(sizeof(std::size_t) == sizeof(atomic_ctr_type),
+                       "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t).");
 
   // zero out submap success counters
-  static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
   for (uint32_t i = 0; i < submaps_.size(); ++i) {
     CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type)));
   }
diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
index 0eeb1a632..2ae519220 100644
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
@@ -305,13 +305,14 @@ __global__ void erase(InputIt first,
   typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
   extern __shared__ unsigned long long submap_block_num_successes[];
 
+  auto block = cg::this_thread_block();
   auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
-  auto tid  = block_size * blockIdx.x + threadIdx.x;
+  auto tid  = block_size * block.group_index().x + block.thread_rank();
   auto it   = first + tid / tile_size;
 
   for (int i = threadIdx.x; i < num_submaps; i += block_size)
     submap_block_num_successes[i] = 0;
-  __syncthreads();
+  block.sync();
 
   while (it < last) {
     auto erased = false;
@@ -323,7 +324,7 @@ __global__ void erase(InputIt first,
     if (erased && tile.thread_rank() == 0) { atomicAdd(&submap_block_num_successes[i], 1); }
     it += (gridDim.x * blockDim.x) / tile_size;
   }
-  __syncthreads();
+  block.sync();
 
   for (int i = 0; i < num_submaps; ++i) {
     if (threadIdx.x == 0) {
diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index 3386208aa..1c47ab2c1 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -150,7 +150,11 @@ class dynamic_map {
               sentinel::empty_key<Key> empty_key_sentinel,
               sentinel::empty_value<Value> empty_value_sentinel,
               Allocator const& alloc = Allocator{},
+<<<<<<< HEAD
               cudaStream_t stream    = 0);
+=======
+              cudaStream_t stream = nullptr);
+>>>>>>> code cleanup
 
   /**
    * @brief Construct a dynamically-sized map with erase capability.
@@ -181,7 +185,7 @@ class dynamic_map {
               sentinel::empty_value<Value> empty_value_sentinel,
               sentinel::erased_key<Key> erased_key_sentinel,
               Allocator const& alloc = Allocator{},
-              cudaStream_t stream    = 0);
+              cudaStream_t stream = nullptr);
 
   /**
    * @brief Destroy the map and frees its contents
@@ -197,7 +201,7 @@ class dynamic_map {
    * @param n The number of key value pairs for which there must be space
    * @param stream Stream used for executing the kernels
    */
-  void reserve(std::size_t n, cudaStream_t stream = 0);
+  void reserve(std::size_t n, cudaStream_t stream = nullptr);
 
   /**
    * @brief Inserts all key/value pairs in the range `[first, last)`.
@@ -218,11 +222,8 @@ class dynamic_map {
   template <typename InputIt,
             typename Hash     = cuco::detail::MurmurHash3_32<key_type>,
             typename KeyEqual = thrust::equal_to<key_type>>
-  void insert(InputIt first,
-              InputIt last,
-              Hash hash           = Hash{},
-              KeyEqual key_equal  = KeyEqual{},
-              cudaStream_t stream = 0);
+  void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{},
+              cudaStream_t stream = nullptr);
 
   /**
    * @brief Erases keys in the range `[first, last)`.
@@ -256,11 +257,8 @@ class dynamic_map {
   template <typename InputIt,
             typename Hash     = cuco::detail::MurmurHash3_32<key_type>,
             typename KeyEqual = thrust::equal_to<key_type>>
-  void erase(InputIt first,
-             InputIt last,
-             Hash hash           = Hash{},
-             KeyEqual key_equal  = KeyEqual{},
-             cudaStream_t stream = 0);
+  void erase(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{},
+             cudaStream_t stream = nullptr);
 
   /**
    * @brief Finds the values corresponding to all keys in the range `[first, last)`.
@@ -288,9 +286,9 @@ class dynamic_map {
   void find(InputIt first,
             InputIt last,
             OutputIt output_begin,
-            Hash hash           = Hash{},
-            KeyEqual key_equal  = KeyEqual{},
-            cudaStream_t stream = 0);
+            Hash hash          = Hash{},
+            KeyEqual key_equal = KeyEqual{},
+            cudaStream_t stream = nullptr);
 
   /**
    * @brief Indicates whether the keys in the range `[first, last)` are contained in the map.
@@ -317,9 +315,9 @@ class dynamic_map {
   void contains(InputIt first,
                 InputIt last,
                 OutputIt output_begin,
-                Hash hash           = Hash{},
-                KeyEqual key_equal  = KeyEqual{},
-                cudaStream_t stream = 0);
+                Hash hash          = Hash{},
+                KeyEqual key_equal = KeyEqual{},
+                cudaStream_t stream = nullptr);
 
   /**
    * @brief Gets the current number of elements in the map

From 2df247c63bb37fe1f195de697d8dd658af72a9d8 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 14 Nov 2022 06:35:52 +0000
Subject: [PATCH 21/36] [pre-commit.ci] auto code formatting

---
 include/cuco/detail/dynamic_map.inl         |  2 +-
 include/cuco/detail/dynamic_map_kernels.cuh |  6 +++---
 include/cuco/dynamic_map.cuh                | 20 +++++++++++++-------
 3 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index b712a9f41..ce800653e 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -130,7 +130,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(
   // TODO: memset an atomic variable is unsafe
   CUCO_RUNTIME_EXPECTS(sizeof(std::size_t) == sizeof(atomic_ctr_type),
                        "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t).");
-  
+
   std::size_t num_to_insert = std::distance(first, last);
 
   reserve(size_ + num_to_insert, stream);
diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
index 2ae519220..3feadbd34 100644
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
@@ -306,9 +306,9 @@ __global__ void erase(InputIt first,
   extern __shared__ unsigned long long submap_block_num_successes[];
 
   auto block = cg::this_thread_block();
-  auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
-  auto tid  = block_size * block.group_index().x + block.thread_rank();
-  auto it   = first + tid / tile_size;
+  auto tile  = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto tid   = block_size * block.group_index().x + block.thread_rank();
+  auto it    = first + tid / tile_size;
 
   for (int i = threadIdx.x; i < num_submaps; i += block_size)
     submap_block_num_successes[i] = 0;
diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index 1c47ab2c1..874fdeab1 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -185,7 +185,7 @@ class dynamic_map {
               sentinel::empty_value<Value> empty_value_sentinel,
               sentinel::erased_key<Key> erased_key_sentinel,
               Allocator const& alloc = Allocator{},
-              cudaStream_t stream = nullptr);
+              cudaStream_t stream    = nullptr);
 
   /**
    * @brief Destroy the map and frees its contents
@@ -222,7 +222,10 @@ class dynamic_map {
   template <typename InputIt,
             typename Hash     = cuco::detail::MurmurHash3_32<key_type>,
             typename KeyEqual = thrust::equal_to<key_type>>
-  void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{},
+  void insert(InputIt first,
+              InputIt last,
+              Hash hash           = Hash{},
+              KeyEqual key_equal  = KeyEqual{},
               cudaStream_t stream = nullptr);
 
   /**
@@ -257,7 +260,10 @@ class dynamic_map {
   template <typename InputIt,
             typename Hash     = cuco::detail::MurmurHash3_32<key_type>,
             typename KeyEqual = thrust::equal_to<key_type>>
-  void erase(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{},
+  void erase(InputIt first,
+             InputIt last,
+             Hash hash           = Hash{},
+             KeyEqual key_equal  = KeyEqual{},
              cudaStream_t stream = nullptr);
 
   /**
@@ -286,8 +292,8 @@ class dynamic_map {
   void find(InputIt first,
             InputIt last,
             OutputIt output_begin,
-            Hash hash          = Hash{},
-            KeyEqual key_equal = KeyEqual{},
+            Hash hash           = Hash{},
+            KeyEqual key_equal  = KeyEqual{},
             cudaStream_t stream = nullptr);
 
   /**
@@ -315,8 +321,8 @@ class dynamic_map {
   void contains(InputIt first,
                 InputIt last,
                 OutputIt output_begin,
-                Hash hash          = Hash{},
-                KeyEqual key_equal = KeyEqual{},
+                Hash hash           = Hash{},
+                KeyEqual key_equal  = KeyEqual{},
                 cudaStream_t stream = nullptr);
 
   /**

From af7706d8268050b884b1a5703f6a92f296be62cc Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@nvidia.com>
Date: Mon, 14 Nov 2022 14:19:38 -0800
Subject: [PATCH 22/36] switched typedef to using

---
 include/cuco/detail/dynamic_map_kernels.cuh | 8 ++++----
 include/cuco/dynamic_map.cuh                | 4 ----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
index 3feadbd34..aefe8c873 100644
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
@@ -71,7 +71,7 @@ __global__ void insert(InputIt first,
                        Hash hash,
                        KeyEqual key_equal)
 {
-  typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
+  using BlockReduce = cub::BlockReduce<std::size_t, block_size>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   std::size_t thread_num_successes = 0;
 
@@ -154,7 +154,7 @@ __global__ void insert(InputIt first,
                        Hash hash,
                        KeyEqual key_equal)
 {
-  typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
+  using BlockReduce = cub::BlockReduce<std::size_t, block_size>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   std::size_t thread_num_successes = 0;
 
@@ -230,7 +230,7 @@ __global__ void erase(InputIt first,
                       Hash hash,
                       KeyEqual key_equal)
 {
-  typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
+  using BlockReduce = cub::BlockReduce<std::size_t, block_size>;
   extern __shared__ unsigned long long submap_block_num_successes[];
 
   auto tid = block_size * blockIdx.x + threadIdx.x;
@@ -302,7 +302,7 @@ __global__ void erase(InputIt first,
                       Hash hash,
                       KeyEqual key_equal)
 {
-  typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
+  using BlockReduce = cub::BlockReduce<std::size_t, block_size>;
   extern __shared__ unsigned long long submap_block_num_successes[];
 
   auto block = cg::this_thread_block();
diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index 874fdeab1..dcfa192c5 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -150,11 +150,7 @@ class dynamic_map {
               sentinel::empty_key<Key> empty_key_sentinel,
               sentinel::empty_value<Value> empty_value_sentinel,
               Allocator const& alloc = Allocator{},
-<<<<<<< HEAD
-              cudaStream_t stream    = 0);
-=======
               cudaStream_t stream = nullptr);
->>>>>>> code cleanup
 
   /**
    * @brief Construct a dynamically-sized map with erase capability.

From 54ae2548d7f1bc18af020f0038d94baa976b0d6d Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 14 Nov 2022 22:22:26 +0000
Subject: [PATCH 23/36] [pre-commit.ci] auto code formatting

---
 include/cuco/dynamic_map.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index dcfa192c5..5dbd9c2f7 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -150,7 +150,7 @@ class dynamic_map {
               sentinel::empty_key<Key> empty_key_sentinel,
               sentinel::empty_value<Value> empty_value_sentinel,
               Allocator const& alloc = Allocator{},
-              cudaStream_t stream = nullptr);
+              cudaStream_t stream    = nullptr);
 
   /**
    * @brief Construct a dynamically-sized map with erase capability.

From 593fe127e91df2f9c12197abc2087f94c8cf9ae7 Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@nvidia.com>
Date: Fri, 18 Nov 2022 11:24:43 -0800
Subject: [PATCH 24/36] responding to PR comments

---
 benchmarks/hash_table/dynamic_map_bench.cu  | 75 +++++++++++++++++++++
 include/cuco/detail/dynamic_map.inl         |  2 +-
 include/cuco/detail/dynamic_map_kernels.cuh | 17 +++--
 include/cuco/detail/static_map_kernels.cuh  |  8 ++-
 include/cuco/dynamic_map.cuh                | 13 +---
 5 files changed, 97 insertions(+), 18 deletions(-)

diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu
index 079018005..de2317957 100644
--- a/benchmarks/hash_table/dynamic_map_bench.cu
+++ b/benchmarks/hash_table/dynamic_map_bench.cu
@@ -271,6 +271,81 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIQUE)
   ->UseManualTime();
 
 BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIQUE)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+  
+BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIQUE)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+
+BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIQUE)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+
+BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIQUE)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+  
+BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+
+BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIFORM)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+
+BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIFORM)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+  
+BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIFORM)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+
+BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIFORM)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+
+BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIFORM)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+  
+BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::GAUSSIAN)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+
+BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::GAUSSIAN)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+
+BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::GAUSSIAN)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+  
+BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::GAUSSIAN)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+
+BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::GAUSSIAN)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+
+BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::GAUSSIAN)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
\ No newline at end of file
diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index ce800653e..66c130899 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -199,7 +199,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(
 
   auto const temp_storage_size = submaps_.size() * sizeof(unsigned long long);
 
-  detail::erase<block_size, tile_size, cuco::pair_type<key_type, mapped_type>>
+  detail::erase<block_size, tile_size>
     <<<grid_size, block_size, temp_storage_size, stream>>>(first,
                                                            first + num_keys,
                                                            submap_mutable_views_.data().get(),
diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
index aefe8c873..37bcbc547 100644
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
@@ -41,6 +41,7 @@ namespace cg = cooperative_groups;
  * @tparam viewT Type of device view allowing access of hash map storage
  * @tparam Hash Unary callable type
  * @tparam KeyEqual Binary callable type
+ *
  * @param first Beginning of the sequence of key/value pairs
  * @param last End of the sequence of key/value pairs
  * @param submap_views Array of `static_map::device_view` objects used to
@@ -122,6 +123,7 @@ __global__ void insert(InputIt first,
  * @tparam viewT Type of device view allowing access of hash map storage
  * @tparam Hash Unary callable type
  * @tparam KeyEqual Binary callable type
+ *
  * @param first Beginning of the sequence of key/value pairs
  * @param last End of the sequence of key/value pairs
  * @param submap_views Array of `static_map::device_view` objects used to
@@ -196,14 +198,15 @@ __global__ void insert(InputIt first,
  * If the key `*(first + i)` exists in the map, its slot is erased and made available for future
    insertions.
  * Else, no effect.
+ *
  * @tparam block_size The size of the thread block
- * @tparam pair_type Type of the pairs contained in the map
  * @tparam InputIt Device accessible input iterator whose `value_type` is
  * convertible to the map's `key_type`
  * @tparam mutableViewT Type of device view allowing modification of hash map storage
  * @tparam atomicT Type of atomic storage
  * @tparam Hash Unary callable type
  * @tparam KeyEqual Binary callable type
+ *
  * @param first Beginning of the sequence of keys
  * @param last End of the sequence of keys
  * @param submap_mutable_views Array of `static_map::mutable_device_view` objects used to
@@ -216,7 +219,6 @@ __global__ void insert(InputIt first,
  * @param key_equal The binary function to compare two keys for equality
  */
 template <uint32_t block_size,
-          typename pair_type,
           typename InputIt,
           typename mutableViewT,
           typename atomicT,
@@ -264,17 +266,18 @@ __global__ void erase(InputIt first,
  * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`.
  *
  * If the key `*(first + i)` exists in the map, its slot is erased and made available for future
-   insertions.
+ * insertions.
  * Else, no effect.
+ *
  * @tparam block_size The size of the thread block
  * @tparam tile_size The number of threads in the Cooperative Groups used to perform erase
- * @tparam pair_type Type of the pairs contained in the map
  * @tparam InputIt Device accessible input iterator whose `value_type` is
  * convertible to the map's `key_type`
  * @tparam mutableViewT Type of device view allowing modification of hash map storage
  * @tparam atomicT Type of atomic storage
  * @tparam Hash Unary callable type
  * @tparam KeyEqual Binary callable type
+ *
  * @param first Beginning of the sequence of keys
  * @param last End of the sequence of keys
  * @param submap_mutable_views Array of `static_map::mutable_device_view` objects used to
@@ -288,7 +291,6 @@ __global__ void erase(InputIt first,
  */
 template <uint32_t block_size,
           uint32_t tile_size,
-          typename pair_type,
           typename InputIt,
           typename mutableViewT,
           typename atomicT,
@@ -339,6 +341,7 @@ __global__ void erase(InputIt first,
  *
  * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`.
  * Else, copies the empty value sentinel.
+ *
  * @tparam block_size The number of threads in the thread block
  * @tparam Value The mapped value type for the map
  * @tparam InputIt Device accessible input iterator whose `value_type` is
@@ -348,6 +351,7 @@ __global__ void erase(InputIt first,
  * @tparam viewT Type of `static_map` device view
  * @tparam Hash Unary callable type
  * @tparam KeyEqual Binary callable type
+ *
  * @param first Beginning of the sequence of keys
  * @param last End of the sequence of keys
  * @param output_begin Beginning of the sequence of values retrieved for each key
@@ -421,6 +425,7 @@ __global__ void find(InputIt first,
  * @tparam viewT Type of `static_map` device view
  * @tparam Hash Unary callable type
  * @tparam KeyEqual Binary callable type
+ *
  * @param first Beginning of the sequence of keys
  * @param last End of the sequence of keys
  * @param output_begin Beginning of the sequence of values retrieved for each key
@@ -493,6 +498,7 @@ __global__ void find(InputIt first,
  * @tparam viewT Type of `static_map` device view
  * @tparam Hash Unary callable type
  * @tparam KeyEqual Binary callable type
+ *
  * @param first Beginning of the sequence of keys
  * @param last End of the sequence of keys
  * @param output_begin Beginning of the sequence of booleans for the presence of each key
@@ -559,6 +565,7 @@ __global__ void contains(InputIt first,
  * @tparam viewT Type of `static_map` device view
  * @tparam Hash Unary callable type
  * @tparam KeyEqual Binary callable type
+ *
  * @param first Beginning of the sequence of keys
  * @param last End of the sequence of keys
  * @param output_begin Beginning of the sequence of booleans for the presence of each key
diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh
index 2ebcd4c91..a5f5fc8b9 100644
--- a/include/cuco/detail/static_map_kernels.cuh
+++ b/include/cuco/detail/static_map_kernels.cuh
@@ -167,8 +167,9 @@ __global__ void insert(
  * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`.
  *
  * If the key `*(first + i)` exists in the map, its slot is erased and made available for future
-   insertions.
+ * insertions.
  * Else, no effect.
+ *
  * @tparam block_size The size of the thread block
  * @tparam InputIt Device accessible input iterator whose `value_type` is
  * convertible to the map's `key_type`
@@ -176,6 +177,7 @@ __global__ void insert(
  * @tparam viewT Type of device view allowing access of hash map storage
  * @tparam Hash Unary callable type
  * @tparam KeyEqual Binary callable type
+ *
  * @param first Beginning of the sequence of keys
  * @param last End of the sequence of keys
  * @param num_successes The number of successfully erased key/value pairs
@@ -216,8 +218,9 @@ __global__ void erase(
  * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`.
  *
  * If the key `*(first + i)` exists in the map, its slot is erased and made available for future
-   insertions.
+ * insertions.
  * Else, no effect.
+ *
  * @tparam block_size The size of the thread block
  * @tparam tile_size The number of threads in the Cooperative Groups used to perform erase
  * @tparam InputIt Device accessible input iterator whose `value_type` is
@@ -226,6 +229,7 @@ __global__ void erase(
  * @tparam viewT Type of device view allowing access of hash map storage
  * @tparam Hash Unary callable type
  * @tparam KeyEqual Binary callable type
+ *
  * @param first Beginning of the sequence of keys
  * @param last End of the sequence of keys
  * @param num_successes The number of successfully erased key/value pairs
diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index 5dbd9c2f7..34fd14ab3 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -16,8 +16,6 @@
 
 #pragma once
 
-#include <cooperative_groups.h>
-#include <cub/cub.cuh>
 #include <cuco/detail/dynamic_map_kernels.cuh>
 #include <cuco/detail/error.hpp>
 #include <cuco/sentinel.cuh>
@@ -26,8 +24,6 @@
 #include <thrust/device_vector.h>
 #include <thrust/functional.h>
 
-#include <cuda/std/atomic>
-
 #include <cstddef>
 #include <memory>
 #include <type_traits>
@@ -116,12 +112,6 @@ class dynamic_map {
   dynamic_map(dynamic_map const&) = delete;
   dynamic_map(dynamic_map&&)      = delete;
 
-  template <typename T1, typename T2>
-  dynamic_map(std::size_t, T1, T2, Allocator const& = Allocator{}) = delete;
-
-  template <typename T1, typename T2, typename T3>
-  dynamic_map(std::size_t, T1, T2, T3, Allocator const& = Allocator{}) = delete;
-
   dynamic_map& operator=(dynamic_map const&) = delete;
   dynamic_map& operator=(dynamic_map&&) = delete;
 
@@ -244,6 +234,7 @@ class dynamic_map {
    * convertible to the map's `value_type`
    * @tparam Hash Unary callable type
    * @tparam KeyEqual Binary callable type
+   *
    * @param first Beginning of the sequence of keys
    * @param last End of the sequence of keys
    * @param hash The unary function to apply to hash each key
@@ -274,6 +265,7 @@ class dynamic_map {
    * convertible to the map's `mapped_type`
    * @tparam Hash Unary callable type
    * @tparam KeyEqual Binary callable type
+   *
    * @param first Beginning of the sequence of keys
    * @param last End of the sequence of keys
    * @param output_begin Beginning of the sequence of values retrieved for each key
@@ -303,6 +295,7 @@ class dynamic_map {
    * convertible to the map's `mapped_type`
    * @tparam Hash Unary callable type
    * @tparam KeyEqual Binary callable type
+   *
    * @param first Beginning of the sequence of keys
    * @param last End of the sequence of keys
    * @param output_begin Beginning of the sequence of booleans for the presence of each key

From 7598e47184e6f66d4163aab61c9cff9f290f8224 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 18 Nov 2022 19:25:58 +0000
Subject: [PATCH 25/36] [pre-commit.ci] auto code formatting

---
 benchmarks/hash_table/dynamic_map_bench.cu | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu
index de2317957..420187b08 100644
--- a/benchmarks/hash_table/dynamic_map_bench.cu
+++ b/benchmarks/hash_table/dynamic_map_bench.cu
@@ -274,7 +274,7 @@ BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
-  
+
 BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
@@ -289,7 +289,7 @@ BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
-  
+
 BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
@@ -304,7 +304,7 @@ BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIFORM)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
-  
+
 BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIFORM)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
@@ -319,7 +319,7 @@ BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIFORM)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
-  
+
 BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::GAUSSIAN)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
@@ -334,7 +334,7 @@ BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::GAUSSIAN)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
-  
+
 BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::GAUSSIAN)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)

From 1e6ad99de0bd52f501e70cc084e74b013abf98fb Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 20 Dec 2022 11:18:31 -0500
Subject: [PATCH 26/36] Add more data types for erase tests

---
 tests/dynamic_map/erase_test.cu | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu
index 0e53197ea..25033ff14 100644
--- a/tests/dynamic_map/erase_test.cu
+++ b/tests/dynamic_map/erase_test.cu
@@ -23,11 +23,14 @@
 
 #include <catch2/catch.hpp>
 
-TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t))
+TEMPLATE_TEST_CASE_SIG("erase key",
+                       "",
+                       ((typename Key, typename Value), Key, Value),
+                       (int32_t, int32_t),
+                       (int32_t, int64_t),
+                       (int64_t, int32_t),
+                       (int64_t, int64_t))
 {
-  using Key   = T;
-  using Value = T;
-
   unsigned long num_keys = 1'000'000;
   cuco::dynamic_map<Key, Value> map{num_keys * 2,
                                     cuco::sentinel::empty_key<Key>{-1},
@@ -135,4 +138,4 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t))
 
     REQUIRE(map.get_size() == 0);
   }
-}
\ No newline at end of file
+}

From 9e324fcafcf1a92b329683f12f56106b6b451028 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 20 Dec 2022 11:19:23 -0500
Subject: [PATCH 27/36] Use public murmurhash

---
 include/cuco/dynamic_map.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index a677fdc83..794e67f2e 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -209,7 +209,7 @@ class dynamic_map {
    * @param stream Stream used for executing the kernels
    */
   template <typename InputIt,
-            typename Hash     = cuco::detail::MurmurHash3_32<key_type>,
+            typename Hash     = cuco::murmurhash3_32<key_type>,
             typename KeyEqual = thrust::equal_to<key_type>>
   void insert(InputIt first,
               InputIt last,

From bb0e4e9c3cdf6a22ade08ed55c8475408fe5aba2 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 20 Dec 2022 11:48:40 -0500
Subject: [PATCH 28/36] Update static map benchmark: fix runtime stall bug,
 remove redundant comments and add erase_none and search_none benchmarks

---
 benchmarks/hash_table/static_map_bench.cu | 33 ++++++++++++++---------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu
index 57d17e222..1538a636a 100644
--- a/benchmarks/hash_table/static_map_bench.cu
+++ b/benchmarks/hash_table/static_map_bench.cu
@@ -164,8 +164,7 @@ static void BM_static_map_search_none(::benchmark::State& state)
   float occupancy      = state.range(1) / float{100};
   std::size_t size     = num_keys / occupancy;
 
-  map_type map{size, -1, -1};
-  auto view = map.get_device_mutable_view();
+  map_type map{size, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
 
   std::vector<Key> h_keys(num_keys);
   std::vector<Value> h_values(num_keys);
@@ -174,7 +173,7 @@ static void BM_static_map_search_none(::benchmark::State& state)
 
   generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
 
-  for (auto i = 0; i < num_keys; ++i) {
+  for (std::size_t i = 0; i < num_keys; ++i) {
     Key key           = h_keys[i];
     Value val         = h_keys[i];
     h_pairs[i].first  = key;
@@ -182,8 +181,9 @@ static void BM_static_map_search_none(::benchmark::State& state)
   }
 
   // diff keys
-  for (int i = 0; i < num_keys; ++i)
+  for (std::size_t i = 0; i < num_keys; ++i) {
     h_keys[i] += num_keys;
+  }
 
   thrust::device_vector<Key> d_keys(h_keys);
   thrust::device_vector<Value> d_results(num_keys);
@@ -193,6 +193,9 @@ static void BM_static_map_search_none(::benchmark::State& state)
 
   for (auto _ : state) {
     map.find(d_keys.begin(), d_keys.end(), d_results.begin());
+    // TODO: get rid of sync and rewrite the benchmark with `nvbench`
+    // once https://github.com/NVIDIA/nvbench/pull/80 is merged
+    cudaDeviceSynchronize();
   }
 
   state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
@@ -251,8 +254,7 @@ static void BM_static_map_erase_none(::benchmark::State& state)
   float occupancy      = state.range(1) / float{100};
   std::size_t size     = num_keys / occupancy;
 
-  map_type map{size, -1, -1};
-  auto view = map.get_device_mutable_view();
+  map_type map{size, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}, cuco::erased_key{-2}};
 
   std::vector<Key> h_keys(num_keys);
   std::vector<Value> h_values(num_keys);
@@ -261,7 +263,7 @@ static void BM_static_map_erase_none(::benchmark::State& state)
 
   generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
 
-  for (auto i = 0; i < num_keys; ++i) {
+  for (std::size_t i = 0; i < num_keys; ++i) {
     Key key           = h_keys[i];
     Value val         = h_keys[i];
     h_pairs[i].first  = key;
@@ -269,22 +271,20 @@ static void BM_static_map_erase_none(::benchmark::State& state)
   }
 
   // diff keys
-  for (int i = 0; i < num_keys; ++i)
+  for (std::size_t i = 0; i < num_keys; ++i) {
     h_keys[i] += num_keys;
+  }
 
   thrust::device_vector<Key> d_keys(h_keys);
   thrust::device_vector<bool> d_results(num_keys);
   thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
 
   for (auto _ : state) {
-    // state.ResumeTiming();
     state.PauseTiming();
     map.insert(d_pairs.begin(), d_pairs.end());
     state.ResumeTiming();
 
     map.erase(d_keys.begin(), d_keys.end());
-
-    // state.PauseTiming();
   }
 
   state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
@@ -345,6 +345,15 @@ BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::GAUSSI
   ->Unit(benchmark::kMillisecond)
   ->Apply(generate_size_and_occupancy);
 
-BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIQUE)
+// TODO: comprehensive tests for erase_all, erase_none and search_none
+BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIFORM)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(generate_size_and_occupancy);
+
+BENCHMARK_TEMPLATE(BM_static_map_search_none, int32_t, int32_t, dist_type::UNIFORM)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(generate_size_and_occupancy);
+
+BENCHMARK_TEMPLATE(BM_static_map_erase_none, int32_t, int32_t, dist_type::UNIFORM)
   ->Unit(benchmark::kMillisecond)
   ->Apply(generate_size_and_occupancy);

From 10fd08a658e3019de8cff0a002f1e89ef59980a2 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 20 Dec 2022 12:03:39 -0500
Subject: [PATCH 29/36] Update dynamic map benchmark: fix conversion warning,
 add search_none and erase_none benchmarks and get rid of sentinel namespace

---
 benchmarks/hash_table/dynamic_map_bench.cu | 34 ++++++++++++++--------
 tests/dynamic_map/erase_test.cu            |  6 ++--
 2 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu
index c31dde8ad..8fbb804de 100644
--- a/benchmarks/hash_table/dynamic_map_bench.cu
+++ b/benchmarks/hash_table/dynamic_map_bench.cu
@@ -148,7 +148,7 @@ static void BM_dynamic_search_none(::benchmark::State& state)
 
   generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
 
-  for (auto i = 0; i < num_keys; ++i) {
+  for (std::size_t i = 0; i < num_keys; ++i) {
     Key key           = h_keys[i] + num_keys;
     Value val         = h_keys[i] + num_keys;
     h_pairs[i].first  = key;
@@ -159,8 +159,7 @@ static void BM_dynamic_search_none(::benchmark::State& state)
   thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
   thrust::device_vector<Value> d_results(num_keys);
 
-  map_type map{
-    initial_size, cuco::sentinel::empty_key<Key>{-1}, cuco::sentinel::empty_value<Value>{-1}};
+  map_type map{initial_size, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
   map.insert(d_pairs.begin(), d_pairs.end());
 
   for (auto _ : state) {
@@ -198,9 +197,9 @@ static void BM_dynamic_erase_all(::benchmark::State& state)
   std::size_t batch_size = 1E6;
   for (auto _ : state) {
     map_type map{initial_size,
-                 cuco::sentinel::empty_key<Key>{-1},
-                 cuco::sentinel::empty_value<Value>{-1},
-                 cuco::sentinel::erased_key<Key>{-2}};
+                 cuco::empty_key<Key>{-1},
+                 cuco::empty_value<Value>{-1},
+                 cuco::erased_key<Key>{-2}};
     for (uint32_t i = 0; i < num_keys; i += batch_size) {
       map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size);
     }
@@ -229,7 +228,7 @@ static void BM_dynamic_erase_none(::benchmark::State& state)
 
   generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
 
-  for (auto i = 0; i < num_keys; ++i) {
+  for (std::size_t i = 0; i < num_keys; ++i) {
     Key key           = h_keys[i] + num_keys;
     Value val         = h_keys[i] + num_keys;
     h_pairs[i].first  = key;
@@ -242,10 +241,10 @@ static void BM_dynamic_erase_none(::benchmark::State& state)
   std::size_t batch_size = 1E6;
   for (auto _ : state) {
     map_type map{initial_size,
-                 cuco::sentinel::empty_key<Key>{-1},
-                 cuco::sentinel::empty_value<Value>{-1},
-                 cuco::sentinel::erased_key<Key>{-2}};
-    for (auto i = 0; i < num_keys; i += batch_size) {
+                 cuco::empty_key<Key>{-1},
+                 cuco::empty_value<Value>{-1},
+                 cuco::erased_key<Key>{-2}};
+    for (std::size_t i = 0; i < num_keys; i += batch_size) {
       map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size);
     }
     {
@@ -346,4 +345,15 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::GAUSSIAN)
 BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::GAUSSIAN)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
-  ->UseManualTime();
\ No newline at end of file
+  ->UseManualTime();
+
+// TODO: comprehensive tests for erase_none and search_none?
+BENCHMARK_TEMPLATE(BM_dynamic_search_none, int32_t, int32_t, dist_type::UNIFORM)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+
+BENCHMARK_TEMPLATE(BM_dynamic_erase_none, int32_t, int32_t, dist_type::UNIFORM)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu
index 25033ff14..f25caed30 100644
--- a/tests/dynamic_map/erase_test.cu
+++ b/tests/dynamic_map/erase_test.cu
@@ -33,9 +33,9 @@ TEMPLATE_TEST_CASE_SIG("erase key",
 {
   unsigned long num_keys = 1'000'000;
   cuco::dynamic_map<Key, Value> map{num_keys * 2,
-                                    cuco::sentinel::empty_key<Key>{-1},
-                                    cuco::sentinel::empty_value<Value>{-1},
-                                    cuco::sentinel::erased_key<Key>{-2}};
+                                    cuco::empty_key<Key>{-1},
+                                    cuco::empty_value<Value>{-1},
+                                    cuco::erased_key<Key>{-2}};
 
   thrust::device_vector<Key> d_keys(num_keys);
   thrust::device_vector<Value> d_values(num_keys);

From 788ad29ec8c92446666dfca9b8c14c7b9c431171 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 20 Dec 2022 13:37:35 -0500
Subject: [PATCH 30/36] Cleanups: get rid of host-side counter vector, remove
 get_ prefixes and use async instructions when possible

---
 include/cuco/detail/dynamic_map.inl | 36 ++++++++++++++---------------
 include/cuco/dynamic_map.cuh        |  6 ++---
 include/cuco/static_map.cuh         |  4 ++--
 3 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index 66c130899..08aa4dd1c 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -41,8 +41,7 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
     stream));
   submap_views_.push_back(submaps_[0]->get_device_view());
   submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
-  submap_num_successes_.push_back(submaps_[0]->get_num_successes());
-  d_submap_num_successes_ = submap_num_successes_;
+  submap_num_successes_.push_back(submaps_[0]->num_successes());
 }
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -75,8 +74,7 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
     stream));
   submap_views_.push_back(submaps_[0]->get_device_view());
   submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
-  submap_num_successes_.push_back(submaps_[0]->get_num_successes());
-  d_submap_num_successes_ = submap_num_successes_;
+  submap_num_successes_.push_back(submaps_[0]->num_successes());
 }
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -110,8 +108,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n, cudaStrea
           alloc_,
           stream));
       }
-      submap_num_successes_.push_back(submaps_[submap_idx]->get_num_successes());
-      d_submap_num_successes_ = submap_num_successes_;
+      submap_num_successes_.push_back(submaps_[submap_idx]->num_successes());
       submap_views_.push_back(submaps_[submap_idx]->get_device_view());
       submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view());
       capacity_ *= 2;
@@ -143,7 +140,8 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(
     // only if we meet the minimum insert size.
 
     if (capacity_remaining >= min_insert_size_) {
-      CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[submap_idx], 0, sizeof(atomic_ctr_type)));
+      CUCO_CUDA_TRY(
+        cudaMemsetAsync(submap_num_successes_[submap_idx], 0, sizeof(atomic_ctr_type), stream));
 
       auto n                = std::min(capacity_remaining, num_to_insert);
       auto const block_size = 128;
@@ -156,17 +154,18 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(
                                                first + n,
                                                submap_views_.data().get(),
                                                submap_mutable_views_.data().get(),
-                                               d_submap_num_successes_.data().get(),
+                                               submap_num_successes_.data().get(),
                                                submap_idx,
                                                submaps_.size(),
                                                hash,
                                                key_equal);
 
       std::size_t h_num_successes;
-      CUCO_CUDA_TRY(cudaMemcpy(&h_num_successes,
-                               submap_num_successes_[submap_idx],
-                               sizeof(atomic_ctr_type),
-                               cudaMemcpyDeviceToHost));
+      CUCO_CUDA_TRY(cudaMemcpyAsync(&h_num_successes,
+                                    submap_num_successes_[submap_idx],
+                                    sizeof(atomic_ctr_type),
+                                    cudaMemcpyDeviceToHost,
+                                    stream));
       submaps_[submap_idx]->size_ += h_num_successes;
       size_ += h_num_successes;
       first += n;
@@ -194,7 +193,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(
 
   // zero out submap success counters
   for (uint32_t i = 0; i < submaps_.size(); ++i) {
-    CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type)));
+    CUCO_CUDA_TRY(cudaMemsetAsync(submap_num_successes_[i], 0, sizeof(atomic_ctr_type), stream));
   }
 
   auto const temp_storage_size = submaps_.size() * sizeof(unsigned long long);
@@ -203,17 +202,18 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(
     <<<grid_size, block_size, temp_storage_size, stream>>>(first,
                                                            first + num_keys,
                                                            submap_mutable_views_.data().get(),
-                                                           d_submap_num_successes_.data().get(),
+                                                           submap_num_successes_.data().get(),
                                                            submaps_.size(),
                                                            hash,
                                                            key_equal);
 
   for (uint32_t i = 0; i < submaps_.size(); ++i) {
     std::size_t h_submap_num_successes;
-    CUCO_CUDA_TRY(cudaMemcpy(&h_submap_num_successes,
-                             submap_num_successes_[i],
-                             sizeof(atomic_ctr_type),
-                             cudaMemcpyDeviceToHost));
+    CUCO_CUDA_TRY(cudaMemcpyAsync(&h_submap_num_successes,
+                                  submap_num_successes_[i],
+                                  sizeof(atomic_ctr_type),
+                                  cudaMemcpyDeviceToHost,
+                                  stream));
     submaps_[i]->size_ -= h_submap_num_successes;
     size_ -= h_submap_num_successes;
   }
diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index 794e67f2e..c0c76bde0 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -354,11 +354,9 @@ class dynamic_map {
   thrust::device_vector<mutable_view_type>
     submap_mutable_views_;         ///< vector of mutable device views for each submap
   std::size_t min_insert_size_{};  ///< min remaining capacity of submap for insert
-  std::vector<atomic_ctr_type*>
-    submap_num_successes_;  ///< number of succesfully erased keys for each submap
   thrust::device_vector<atomic_ctr_type*>
-    d_submap_num_successes_;  ///< device-side number of successfully erased keys for each submap
-  Allocator alloc_{};         ///< Allocator passed to submaps to allocate their device storage
+    submap_num_successes_;  ///< Number of successfully erased keys for each submap
+  Allocator alloc_{};       ///< Allocator passed to submaps to allocate their device storage
   counter_allocator_type counter_allocator_{};  ///< Allocator used to allocate `num_successes_`
 };
 }  // namespace cuco
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
index 89af45d41..316a5d77d 100644
--- a/include/cuco/static_map.cuh
+++ b/include/cuco/static_map.cuh
@@ -1420,10 +1420,10 @@ class static_map {
    *
    * @return Number of successfully inserted/erased keys from the last insert/erase operation
    */
-  atomic_ctr_type* get_num_successes() const noexcept { return num_successes_; }
+  atomic_ctr_type* num_successes() const noexcept { return num_successes_; }
 
  private:
-  pair_atomic_type* slots_{nullptr};            ///< Pointer to flat slots storage
+  pair_atomic_type* slots_{};                   ///< Pointer to flat slots storage
   std::size_t capacity_{};                      ///< Total number of slots
   std::size_t size_{};                          ///< Number of keys in map
   Key empty_key_sentinel_{};                    ///< Key value that represents an empty slot

From c71dd607684ec7b37457c97ff2d54736b6923983 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 20 Dec 2022 13:50:19 -0500
Subject: [PATCH 31/36] Get rid of num_successes getter

---
 include/cuco/detail/dynamic_map.inl | 6 +++---
 include/cuco/static_map.cuh         | 8 --------
 2 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index 08aa4dd1c..c50d5e3a5 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -41,7 +41,7 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
     stream));
   submap_views_.push_back(submaps_[0]->get_device_view());
   submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
-  submap_num_successes_.push_back(submaps_[0]->num_successes());
+  submap_num_successes_.push_back(submaps_[0]->num_successes_);
 }
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -74,7 +74,7 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
     stream));
   submap_views_.push_back(submaps_[0]->get_device_view());
   submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
-  submap_num_successes_.push_back(submaps_[0]->num_successes());
+  submap_num_successes_.push_back(submaps_[0]->num_successes_);
 }
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
@@ -108,7 +108,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n, cudaStrea
           alloc_,
           stream));
       }
-      submap_num_successes_.push_back(submaps_[submap_idx]->num_successes());
+      submap_num_successes_.push_back(submaps_[submap_idx]->num_successes_);
       submap_views_.push_back(submaps_[submap_idx]->get_device_view());
       submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view());
       capacity_ *= 2;
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
index 316a5d77d..7a240da03 100644
--- a/include/cuco/static_map.cuh
+++ b/include/cuco/static_map.cuh
@@ -1414,14 +1414,6 @@ class static_map {
                                sentinel::erased_key<Key>{erased_key_sentinel_});
   }
 
-  /**
-   * @brief Gets the number of successfully inserted/erased keys from the last
-   * insert/erase operation
-   *
-   * @return Number of successfully inserted/erased keys from the last insert/erase operation
-   */
-  atomic_ctr_type* num_successes() const noexcept { return num_successes_; }
-
  private:
   pair_atomic_type* slots_{};                   ///< Pointer to flat slots storage
   std::size_t capacity_{};                      ///< Total number of slots

From ab4ef0c9058f564a925396ee7a89134883e42f4e Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 20 Dec 2022 13:53:23 -0500
Subject: [PATCH 32/36] Fix comments

---
 include/cuco/dynamic_map.cuh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index c0c76bde0..a35aee893 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -119,8 +119,8 @@ class dynamic_map {
   dynamic_map& operator=(dynamic_map&&) = delete;
 
   /**
-   * @brief Construct a dynamically-sized map with the specified initial capacity, growth factor and
-   * sentinel values.
+   * @brief Constructs a dynamically-sized map with the specified initial capacity, growth factor
+   * and sentinel values.
    *
    * The capacity of the map will automatically increase as the user adds key/value pairs using
    * `insert`.
@@ -146,7 +146,7 @@ class dynamic_map {
               cudaStream_t stream    = nullptr);
 
   /**
-   * @brief Construct a dynamically-sized map with erase capability.
+   * @brief Constructs a dynamically-sized map with erase capability.
    *
    * The capacity of the map will automatically increase as the user adds key/value pairs using
    * `insert`.
@@ -177,7 +177,7 @@ class dynamic_map {
               cudaStream_t stream    = nullptr);
 
   /**
-   * @brief Destroy the map and frees its contents
+   * @brief Destroys the map and frees its contents
    *
    */
   ~dynamic_map() {}

From d72e40303ff6af01f76f5a78ca904d0381aadaba Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 20 Dec 2022 14:06:34 -0500
Subject: [PATCH 33/36] Update tests

---
 tests/dynamic_map/erase_test.cu | 72 ++++++++++++++++-----------------
 1 file changed, 35 insertions(+), 37 deletions(-)

diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu
index f25caed30..fc3dc3c28 100644
--- a/tests/dynamic_map/erase_test.cu
+++ b/tests/dynamic_map/erase_test.cu
@@ -14,9 +14,10 @@
  * limitations under the License.
  */
 
-#include <cuco/dynamic_map.cuh>
 #include <utils.hpp>
 
+#include <cuco/dynamic_map.cuh>
+
 #include <thrust/device_vector.h>
 #include <thrust/execution_policy.h>
 #include <thrust/sequence.h>
@@ -31,27 +32,23 @@ TEMPLATE_TEST_CASE_SIG("erase key",
                        (int64_t, int32_t),
                        (int64_t, int64_t))
 {
-  unsigned long num_keys = 1'000'000;
+  constexpr std::size_t num_keys = 1'000'000;
   cuco::dynamic_map<Key, Value> map{num_keys * 2,
                                     cuco::empty_key<Key>{-1},
                                     cuco::empty_value<Value>{-1},
                                     cuco::erased_key<Key>{-2}};
 
-  thrust::device_vector<Key> d_keys(num_keys);
-  thrust::device_vector<Value> d_values(num_keys);
-  thrust::device_vector<bool> d_keys_exist(num_keys);
-
-  thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1);
-  thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1);
+  SECTION("Check single submap insert/erase")
+  {
+    thrust::device_vector<Key> d_keys(num_keys);
+    thrust::device_vector<Value> d_values(num_keys);
+    thrust::device_vector<bool> d_keys_exist(num_keys);
 
-  auto pairs_begin =
-    thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin()));
+    thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1);
+    thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1);
 
-  SECTION("Check basic insert/erase")
-  {
-    // *****************************************
-    // first, check single submap works properly
-    // *****************************************
+    auto pairs_begin =
+      thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin()));
 
     map.insert(pairs_begin, pairs_begin + num_keys);
 
@@ -94,46 +91,47 @@ TEMPLATE_TEST_CASE_SIG("erase key",
 
     // clear map
     map.erase(d_keys.begin() + num_keys / 2, d_keys.end());
+  }
 
-    // *************************************************
-    // second, check multiple submaps case works properly
-    // *************************************************
+  SECTION("Check multiple submaps insert/erase")
+  {
+    constexpr std::size_t num = 4 * num_keys;
 
-    thrust::device_vector<Key> d_keys2(4 * num_keys);
-    thrust::device_vector<Value> d_values2(4 * num_keys);
-    thrust::device_vector<bool> d_keys_exist2(4 * num_keys);
+    thrust::device_vector<Key> d_keys(num);
+    thrust::device_vector<Value> d_values(num);
+    thrust::device_vector<bool> d_keys_exist(num);
 
-    thrust::sequence(thrust::device, d_keys2.begin(), d_keys2.end(), 1);
-    thrust::sequence(thrust::device, d_values2.begin(), d_values2.end(), 1);
+    thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1);
+    thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1);
 
-    auto pairs_begin2 =
-      thrust::make_zip_iterator(thrust::make_tuple(d_keys2.begin(), d_values2.begin()));
+    auto pairs_begin =
+      thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin()));
 
-    map.insert(pairs_begin2, pairs_begin2 + 4 * num_keys);
+    map.insert(pairs_begin, pairs_begin + num);
 
     // map should resize twice if the erased slots are successfully reused
-    REQUIRE(map.get_capacity() == 8 * num_keys);
+    REQUIRE(map.get_capacity() == 2 * num);
     // check that keys can be successfully deleted from only the first and second submaps
-    map.erase(d_keys2.begin(), d_keys2.begin() + 2 * num_keys);
-    map.contains(d_keys2.begin(), d_keys2.end(), d_keys_exist2.begin());
+    map.erase(d_keys.begin(), d_keys.begin() + 2 * num_keys);
+    map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin());
 
-    REQUIRE(cuco::test::none_of(d_keys_exist2.begin(),
-                                d_keys_exist2.begin() + 2 * num_keys,
+    REQUIRE(cuco::test::none_of(d_keys_exist.begin(),
+                                d_keys_exist.begin() + 2 * num_keys,
                                 [] __device__(const bool key_found) { return key_found; }));
 
-    REQUIRE(cuco::test::all_of(d_keys_exist2.begin() + 2 * num_keys,
-                               d_keys_exist2.end(),
+    REQUIRE(cuco::test::all_of(d_keys_exist.begin() + 2 * num_keys,
+                               d_keys_exist.end(),
                                [] __device__(const bool key_found) { return key_found; }));
 
     REQUIRE(map.get_size() == 2 * num_keys);
     // check that keys can be successfully deleted from all submaps (some will be unsuccessful
     // erases)
-    map.erase(d_keys2.begin(), d_keys2.end());
+    map.erase(d_keys.begin(), d_keys.end());
 
-    map.contains(d_keys2.begin(), d_keys2.end(), d_keys_exist2.begin());
+    map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin());
 
-    REQUIRE(cuco::test::none_of(d_keys_exist2.begin(),
-                                d_keys_exist2.end(),
+    REQUIRE(cuco::test::none_of(d_keys_exist.begin(),
+                                d_keys_exist.end(),
                                 [] __device__(const bool key_found) { return key_found; }));
 
     REQUIRE(map.get_size() == 0);

From 9478650e5e6af934d5b6463a3d96ea9996004daa Mon Sep 17 00:00:00 2001
From: Yunsong Wang <wangyunsong89@gmail.com>
Date: Tue, 20 Dec 2022 15:35:44 -0500
Subject: [PATCH 34/36] Update include/cuco/detail/dynamic_map_kernels.cuh

---
 include/cuco/detail/dynamic_map_kernels.cuh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
index 37bcbc547..b98516160 100644
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
@@ -149,7 +149,6 @@ __global__ void insert(InputIt first,
                        InputIt last,
                        viewT* submap_views,
                        mutableViewT* submap_mutable_views,
-                       // atomicT* num_successes,
                        atomicT** submap_num_successes,
                        uint32_t insert_idx,
                        uint32_t num_submaps,

From 82e0f2e8f51c2b5593e8dc3cac27f4f505ad6e09 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 20 Dec 2022 21:47:44 -0500
Subject: [PATCH 35/36] Cleanups: relaxed memory atomic, static_assert instead
 of runtime expect, constexpr when possible

---
 include/cuco/detail/dynamic_map.inl         | 56 +++++++++++----------
 include/cuco/detail/dynamic_map_kernels.cuh | 36 ++++++-------
 2 files changed, 47 insertions(+), 45 deletions(-)

diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index c50d5e3a5..989225eea 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -125,8 +125,12 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(
   InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream)
 {
   // TODO: memset an atomic variable is unsafe
-  CUCO_RUNTIME_EXPECTS(sizeof(std::size_t) == sizeof(atomic_ctr_type),
-                       "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t).");
+  static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type),
+                "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t).");
+
+  auto constexpr block_size = 128;
+  auto constexpr stride     = 1;
+  auto constexpr tile_size  = 4;
 
   std::size_t num_to_insert = std::distance(first, last);
 
@@ -138,16 +142,12 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(
       max_load_factor_ * submaps_[submap_idx]->get_capacity() - submaps_[submap_idx]->get_size();
     // If we are tying to insert some of the remaining keys into this submap, we can insert
     // only if we meet the minimum insert size.
-
     if (capacity_remaining >= min_insert_size_) {
       CUCO_CUDA_TRY(
         cudaMemsetAsync(submap_num_successes_[submap_idx], 0, sizeof(atomic_ctr_type), stream));
 
-      auto n                = std::min(capacity_remaining, num_to_insert);
-      auto const block_size = 128;
-      auto const stride     = 1;
-      auto const tile_size  = 4;
-      auto const grid_size  = (tile_size * n + stride * block_size - 1) / (stride * block_size);
+      auto const n         = std::min(capacity_remaining, num_to_insert);
+      auto const grid_size = (tile_size * n + stride * block_size - 1) / (stride * block_size);
 
       detail::insert<block_size, tile_size, cuco::pair_type<key_type, mapped_type>>
         <<<grid_size, block_size, 0, stream>>>(first,
@@ -180,16 +180,16 @@ template <typename InputIt, typename Hash, typename KeyEqual>
 void dynamic_map<Key, Value, Scope, Allocator>::erase(
   InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream)
 {
-  std::size_t num_keys = std::distance(first, last);
+  // TODO: memset an atomic variable is unsafe
+  static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type),
+                "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t).");
 
-  auto const block_size = 128;
-  auto const stride     = 1;
-  auto const tile_size  = 4;
-  auto const grid_size  = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
+  auto constexpr block_size = 128;
+  auto constexpr stride     = 1;
+  auto constexpr tile_size  = 4;
 
-  // TODO: memset an atomic variable is unsafe
-  CUCO_RUNTIME_EXPECTS(sizeof(std::size_t) == sizeof(atomic_ctr_type),
-                       "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t).");
+  auto const num_keys  = std::distance(first, last);
+  auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
 
   // zero out submap success counters
   for (uint32_t i = 0; i < submaps_.size(); ++i) {
@@ -228,11 +228,12 @@ void dynamic_map<Key, Value, Scope, Allocator>::find(InputIt first,
                                                      KeyEqual key_equal,
                                                      cudaStream_t stream)
 {
-  auto num_keys         = std::distance(first, last);
-  auto const block_size = 128;
-  auto const stride     = 1;
-  auto const tile_size  = 4;
-  auto const grid_size  = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
+  auto constexpr block_size = 128;
+  auto constexpr stride     = 1;
+  auto constexpr tile_size  = 4;
+
+  auto const num_keys  = std::distance(first, last);
+  auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
 
   detail::find<block_size, tile_size, Value><<<grid_size, block_size, 0, stream>>>(
     first, last, output_begin, submap_views_.data().get(), submaps_.size(), hash, key_equal);
@@ -248,11 +249,12 @@ void dynamic_map<Key, Value, Scope, Allocator>::contains(InputIt first,
                                                          KeyEqual key_equal,
                                                          cudaStream_t stream)
 {
-  auto num_keys         = std::distance(first, last);
-  auto const block_size = 128;
-  auto const stride     = 1;
-  auto const tile_size  = 4;
-  auto const grid_size  = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
+  auto constexpr block_size = 128;
+  auto constexpr stride     = 1;
+  auto constexpr tile_size  = 4;
+
+  auto const num_keys  = std::distance(first, last);
+  auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
 
   detail::contains<block_size, tile_size><<<grid_size, block_size, 0, stream>>>(
     first, last, output_begin, submap_views_.data().get(), submaps_.size(), hash, key_equal);
diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
index 37bcbc547..566576e1e 100644
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
@@ -98,8 +98,10 @@ __global__ void insert(InputIt first,
     tid += gridDim.x * blockDim.x;
   }
 
-  std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
-  if (threadIdx.x == 0) { *num_successes += block_num_successes; }
+  std::size_t const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
+  if (threadIdx.x == 0) {
+    num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed);
+  }
 }
 
 /**
@@ -130,7 +132,7 @@ __global__ void insert(InputIt first,
  * perform `contains` operations on each underlying `static_map`
  * @param submap_mutable_views Array of `static_map::device_mutable_view` objects
  * used to perform an `insert` into the target `static_map` submap
- * @param num_successes The number of successfully inserted key/value pairs
+ * @param submap_num_successes Per-submap pointers to counters of successfully inserted pairs
  * @param insert_idx The index of the submap we are inserting into
  * @param num_submaps The total number of submaps in the map
  * @param hash The unary function to apply to hash each key
@@ -149,7 +151,6 @@ __global__ void insert(InputIt first,
                        InputIt last,
                        viewT* submap_views,
                        mutableViewT* submap_mutable_views,
-                       // atomicT* num_successes,
                        atomicT** submap_num_successes,
                        uint32_t insert_idx,
                        uint32_t num_submaps,
@@ -185,10 +186,10 @@ __global__ void insert(InputIt first,
     it += (gridDim.x * blockDim.x) / tile_size;
   }
 
-  std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
+  std::size_t const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
   if (threadIdx.x == 0) {
-    //*num_successes += block_num_successes;
-    *submap_num_successes[insert_idx] += block_num_successes;
+    submap_num_successes[insert_idx]->fetch_add(block_num_successes,
+                                                cuda::std::memory_order_relaxed);
   }
 }
 
@@ -228,23 +229,22 @@ __global__ void erase(InputIt first,
                       InputIt last,
                       mutableViewT* submap_mutable_views,
                       atomicT** submap_num_successes,
-                      const uint32_t num_submaps,
+                      uint32_t num_submaps,
                       Hash hash,
                       KeyEqual key_equal)
 {
-  using BlockReduce = cub::BlockReduce<std::size_t, block_size>;
   extern __shared__ unsigned long long submap_block_num_successes[];
 
   auto tid = block_size * blockIdx.x + threadIdx.x;
   auto it  = first + tid;
 
-  for (int i = threadIdx.x; i < num_submaps; i += block_size)
+  for (auto i = threadIdx.x; i < num_submaps; i += block_size) {
     submap_block_num_successes[i] = 0;
+  }
   __syncthreads();
 
   while (it < last) {
-    int i;
-    for (i = 0; i < num_submaps; ++i) {
+    for (auto i = 0; i < num_submaps; ++i) {
       if (submap_mutable_views[i].erase(*it, hash, key_equal)) {
         atomicAdd(&submap_block_num_successes[i], 1);
         break;
@@ -254,7 +254,7 @@ __global__ void erase(InputIt first,
   }
   __syncthreads();
 
-  for (int i = 0; i < num_submaps; ++i) {
+  for (auto i = 0; i < num_submaps; ++i) {
     if (threadIdx.x == 0) {
       submap_num_successes[i]->fetch_add(static_cast<std::size_t>(submap_block_num_successes[i]),
                                          cuda::std::memory_order_relaxed);
@@ -300,11 +300,10 @@ __global__ void erase(InputIt first,
                       InputIt last,
                       mutableViewT* submap_mutable_views,
                       atomicT** submap_num_successes,
-                      const uint32_t num_submaps,
+                      uint32_t num_submaps,
                       Hash hash,
                       KeyEqual key_equal)
 {
-  using BlockReduce = cub::BlockReduce<std::size_t, block_size>;
   extern __shared__ unsigned long long submap_block_num_successes[];
 
   auto block = cg::this_thread_block();
@@ -312,13 +311,14 @@ __global__ void erase(InputIt first,
   auto tid   = block_size * block.group_index().x + block.thread_rank();
   auto it    = first + tid / tile_size;
 
-  for (int i = threadIdx.x; i < num_submaps; i += block_size)
+  for (auto i = threadIdx.x; i < num_submaps; i += block_size) {
     submap_block_num_successes[i] = 0;
+  }
   block.sync();
 
   while (it < last) {
     auto erased = false;
-    int i;
+    int i       = 0;
     for (i = 0; i < num_submaps; ++i) {
       erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal);
       if (erased) { break; }
@@ -328,7 +328,7 @@ __global__ void erase(InputIt first,
   }
   block.sync();
 
-  for (int i = 0; i < num_submaps; ++i) {
+  for (auto i = 0; i < num_submaps; ++i) {
     if (threadIdx.x == 0) {
       submap_num_successes[i]->fetch_add(static_cast<std::size_t>(submap_block_num_successes[i]),
                                          cuda::std::memory_order_relaxed);

From f5ec677e85b23e4a74c89fa1afd40bbe980d7623 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 20 Dec 2022 21:55:52 -0500
Subject: [PATCH 36/36] Reorder header groups + remove unused counter allocator

---
 include/cuco/detail/dynamic_map.inl | 6 ++----
 include/cuco/dynamic_map.cuh        | 7 +++----
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index 989225eea..bb7986071 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -30,8 +30,7 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
     capacity_(initial_capacity),
     min_insert_size_(1E4),
     max_load_factor_(0.60),
-    alloc_{alloc},
-    counter_allocator_{alloc}
+    alloc_{alloc}
 {
   submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>(
     initial_capacity,
@@ -59,8 +58,7 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
     capacity_(initial_capacity),
     min_insert_size_(1E4),
     max_load_factor_(0.60),
-    alloc_{alloc},
-    counter_allocator_{alloc}
+    alloc_{alloc}
 {
   CUCO_RUNTIME_EXPECTS(empty_key_sentinel_ != erased_key_sentinel_,
                        "The empty key sentinel and erased key sentinel cannot be the same value.");
diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index a35aee893..f2285efc8 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -21,10 +21,12 @@
 #include <cuco/hash_functions.cuh>
 #include <cuco/sentinel.cuh>
 #include <cuco/static_map.cuh>
-#include <cuda/std/atomic>
+
 #include <thrust/device_vector.h>
 #include <thrust/functional.h>
 
+#include <cuda/std/atomic>
+
 #include <cstddef>
 #include <memory>
 #include <type_traits>
@@ -109,8 +111,6 @@ class dynamic_map {
   using mutable_view_type =
     typename static_map<Key, Value, Scope>::device_mutable_view;  ///< Type for submap mutable
                                                                   ///< device view
-  using counter_allocator_type = typename std::allocator_traits<Allocator>::rebind_alloc<
-    atomic_ctr_type>;  ///< Type of the allocator to (de)allocate atomic counters
 
   dynamic_map(dynamic_map const&) = delete;
   dynamic_map(dynamic_map&&)      = delete;
@@ -357,7 +357,6 @@ class dynamic_map {
   thrust::device_vector<atomic_ctr_type*>
     submap_num_successes_;  ///< Number of successfully erased keys for each submap
   Allocator alloc_{};       ///< Allocator passed to submaps to allocate their device storage
-  counter_allocator_type counter_allocator_{};  ///< Allocator used to allocate `num_successes_`
 };
 }  // namespace cuco