From ee5addf697f4c3aa5e2038c1ec93331d22025dcd Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 5 Dec 2025 10:54:12 -0800
Subject: [PATCH 1/6] Repalce cudaMemcpyAsync with cudaMemcpyBatchAsync to get
 rid of a driver locking bug

---
 .../detail/hyperloglog/hyperloglog_impl.cuh   | 11 ++--
 .../open_addressing/open_addressing_impl.cuh  |  5 +-
 include/cuco/detail/static_map.inl            | 26 +++++---
 include/cuco/detail/utility/memcpy_async.cuh  | 60 +++++++++++++++++++
 4 files changed, 87 insertions(+), 15 deletions(-)
 create mode 100644 include/cuco/detail/utility/memcpy_async.cuh

diff --git a/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh b/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh
index 6c8bb2eff..e459ef6a7 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh
@@ -19,6 +19,7 @@
 #include <cuco/detail/error.hpp>
 #include <cuco/detail/hyperloglog/finalizer.cuh>
 #include <cuco/detail/hyperloglog/kernels.cuh>
+#include <cuco/detail/utility/memcpy_async.cuh>
 #include <cuco/detail/utils.hpp>
 #include <cuco/hash_functions.cuh>
 #include <cuco/types.cuh>
@@ -420,11 +421,11 @@ class hyperloglog_impl {
     std::vector<register_type> host_sketch(num_regs);
 
     // TODO check if storage is host accessible
-    CUCO_CUDA_TRY(cudaMemcpyAsync(host_sketch.data(),
-                                  this->sketch_.data(),
-                                  sizeof(register_type) * num_regs,
-                                  cudaMemcpyDefault,
-                                  stream.get()));
+    cuco::detail::memcpy_async(host_sketch.data(),
+                               this->sketch_.data(),
+                               sizeof(register_type) * num_regs,
+                               cudaMemcpyDefault,
+                               stream);
 #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
     stream.sync();
 #else
diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh
index 9bc781e83..246e3f7ed 100644
--- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh
+++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh
@@ -21,6 +21,7 @@
 #include <cuco/detail/open_addressing/kernels.cuh>
 #include <cuco/detail/storage/counter_storage.cuh>
 #include <cuco/detail/utility/cuda.hpp>
+#include <cuco/detail/utility/memcpy_async.cuh>
 #include <cuco/detail/utils.hpp>
 #include <cuco/extent.cuh>
 #include <cuco/operator.hpp>
@@ -882,8 +883,8 @@ class open_addressing_impl {
                                           stream.get()));
 
       size_type temp_count;
-      CUCO_CUDA_TRY(cudaMemcpyAsync(
-        &temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
+      cuco::detail::memcpy_async(
+        &temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream);
 #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
       stream.sync();
 #else
diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl
index 79bd5f08c..6246cdc02 100644
--- a/include/cuco/detail/static_map.inl
+++ b/include/cuco/detail/static_map.inl
@@ -16,6 +16,7 @@
 
 #include <cuco/detail/bitwise_compare.cuh>
 #include <cuco/detail/error.hpp>
+#include <cuco/detail/utility/memcpy_async.cuh>
 #include <cuco/detail/utils.cuh>
 #include <cuco/detail/utils.hpp>
 
@@ -108,8 +109,11 @@ void static_map<Key, Value, Scope, Allocator>::insert(
 
   detail::insert<block_size, tile_size>
     <<<grid_size, block_size, 0, stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
-  CUCO_CUDA_TRY(cudaMemcpyAsync(
-    &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
+  cuco::detail::memcpy_async(&h_num_successes,
+                             num_successes_,
+                             sizeof(atomic_ctr_type),
+                             cudaMemcpyDeviceToHost,
+                             cuda::stream_ref{stream});
 
   CUCO_CUDA_TRY(cudaStreamSynchronize(stream));  // stream sync to ensure h_num_successes is updated
 
@@ -146,8 +150,11 @@ void static_map<Key, Value, Scope, Allocator>::insert_if(InputIt first,
 
   detail::insert_if_n<block_size, tile_size><<<grid_size, block_size, 0, stream>>>(
     first, num_keys, num_successes_, view, stencil, pred, hash, key_equal);
-  CUCO_CUDA_TRY(cudaMemcpyAsync(
-    &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
+  cuco::detail::memcpy_async(&h_num_successes,
+                             num_successes_,
+                             sizeof(atomic_ctr_type),
+                             cudaMemcpyDeviceToHost,
+                             cuda::stream_ref{stream});
   CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
 
   size_ += h_num_successes;
@@ -178,8 +185,11 @@ void static_map<Key, Value, Scope, Allocator>::erase(
 
   detail::erase<block_size, tile_size>
     <<<grid_size, block_size, 0, stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
-  CUCO_CUDA_TRY(cudaMemcpyAsync(
-    &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
+  cuco::detail::memcpy_async(&h_num_successes,
+                             num_successes_,
+                             sizeof(atomic_ctr_type),
+                             cudaMemcpyDeviceToHost,
+                             cuda::stream_ref{stream});
 
   CUCO_CUDA_TRY(cudaStreamSynchronize(stream));  // stream sync to ensure h_num_successes is updated
 
@@ -249,8 +259,8 @@ std::pair<KeyOut, ValueOut> static_map<Key, Value, Scope, Allocator>::retrieve_a
                         stream);
 
   std::size_t h_num_out;
-  CUCO_CUDA_TRY(
-    cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, stream));
+  cuco::detail::memcpy_async(
+    &h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, cuda::stream_ref{stream});
   CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
   temp_allocator.deallocate(
     reinterpret_cast<char*>(d_num_out), sizeof(std::size_t), cuda::stream_ref{stream});
diff --git a/include/cuco/detail/utility/memcpy_async.cuh b/include/cuco/detail/utility/memcpy_async.cuh
new file mode 100644
index 000000000..eabc2f620
--- /dev/null
+++ b/include/cuco/detail/utility/memcpy_async.cuh
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuco/detail/error.hpp>
+
+#include <cuda/stream_ref>
+
+namespace cuco::detail {
+
+/**
+ * @brief Asynchronous memory copy utility that works around cudaMemcpyAsync bugs
+ *
+ * This function provides a drop-in replacement for cudaMemcpyAsync that uses
+ * cudaMemcpyBatchAsync internally to work around known issues with cudaMemcpyAsync.
+ * The function automatically handles the different API signatures between CUDA
+ * runtime versions.
+ *
+ * @param dst Destination memory address
+ * @param src Source memory address
+ * @param count Number of bytes to copy
+ * @param kind Type of memory copy (cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, etc.)
+ * @param stream CUDA stream for the asynchronous operation
+ */
+inline void memcpy_async(
+  void* dst, const void* src, size_t count, cudaMemcpyKind kind, cuda::stream_ref stream)
+{
+  // Use cudaMemcpyBatchAsync as a workaround for cudaMemcpyAsync bugs
+  void* dsts[1]                 = {dst};
+  void* srcs[1]                 = {const_cast<void*>(src)};
+  size_t sizes[1]               = {count};
+  cudaMemcpyAttributes attrs[1] = {{.srcAccessOrder = cudaMemcpySrcAccessOrderStream}};
+  size_t attrsIdxs[1]           = {0};
+
+#if CUDART_VERSION >= 13000
+  // CUDA 13.0+ API - no failIdx parameter
+  CUCO_CUDA_TRY(cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrsIdxs, 1, stream.get()));
+#else
+  // CUDA 12.x API - requires failIdx parameter
+  size_t failIdx;
+  CUCO_CUDA_TRY(
+    cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrsIdxs, 1, &failIdx, stream.get()));
+#endif
+}
+
+}  // namespace cuco::detail

From f034a2662cffed9cf590b9452f08ac1567439a8d Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 5 Dec 2025 11:02:36 -0800
Subject: [PATCH 2/6] Fix pre-12.8 compatibility

---
 include/cuco/detail/utility/memcpy_async.cuh | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/include/cuco/detail/utility/memcpy_async.cuh b/include/cuco/detail/utility/memcpy_async.cuh
index eabc2f620..f64238eeb 100644
--- a/include/cuco/detail/utility/memcpy_async.cuh
+++ b/include/cuco/detail/utility/memcpy_async.cuh
@@ -26,9 +26,10 @@ namespace cuco::detail {
  * @brief Asynchronous memory copy utility that works around cudaMemcpyAsync bugs
  *
  * This function provides a drop-in replacement for cudaMemcpyAsync that uses
- * cudaMemcpyBatchAsync internally to work around known issues with cudaMemcpyAsync.
- * The function automatically handles the different API signatures between CUDA
- * runtime versions.
+ * cudaMemcpyBatchAsync internally to work around known issues with cudaMemcpyAsync
+ * when available (CUDA 12.8+). For older CUDA versions, it falls back to the
+ * original cudaMemcpyAsync. The function automatically handles the different API
+ * signatures between CUDA runtime versions.
  *
  * @param dst Destination memory address
  * @param src Source memory address
@@ -39,7 +40,8 @@ namespace cuco::detail {
 inline void memcpy_async(
   void* dst, const void* src, size_t count, cudaMemcpyKind kind, cuda::stream_ref stream)
 {
-  // Use cudaMemcpyBatchAsync as a workaround for cudaMemcpyAsync bugs
+#if CUDART_VERSION >= 12080
+  // CUDA 12.8+ - Use cudaMemcpyBatchAsync as a workaround for cudaMemcpyAsync bugs
   void* dsts[1]                 = {dst};
   void* srcs[1]                 = {const_cast<void*>(src)};
   size_t sizes[1]               = {count};
@@ -50,11 +52,17 @@ inline void memcpy_async(
   // CUDA 13.0+ API - no failIdx parameter
   CUCO_CUDA_TRY(cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrsIdxs, 1, stream.get()));
 #else
-  // CUDA 12.x API - requires failIdx parameter
+  // CUDA 12.8-12.x API - requires failIdx parameter
   size_t failIdx;
   CUCO_CUDA_TRY(
     cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrsIdxs, 1, &failIdx, stream.get()));
 #endif
+
+#else
+  // CUDA 12.0-12.7 - Fall back to original cudaMemcpyAsync
+  // Note: This may still have the original bugs that cudaMemcpyBatchAsync was designed to fix
+  CUCO_CUDA_TRY(cudaMemcpyAsync(dst, src, count, kind, stream.get()));
+#endif
 }
 
 }  // namespace cuco::detail

From 17aa5505e6fbb1ce4da923de6ef066f74066db80 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 5 Dec 2025 16:46:44 -0800
Subject: [PATCH 3/6] Fix edge cases for cudaMemcpyBatchAsync

---
 include/cuco/detail/utility/memcpy_async.cuh | 55 +++++++++++---------
 1 file changed, 29 insertions(+), 26 deletions(-)

diff --git a/include/cuco/detail/utility/memcpy_async.cuh b/include/cuco/detail/utility/memcpy_async.cuh
index f64238eeb..d97406335 100644
--- a/include/cuco/detail/utility/memcpy_async.cuh
+++ b/include/cuco/detail/utility/memcpy_async.cuh
@@ -20,49 +20,52 @@
 
 #include <cuda/stream_ref>
 
+#include <cstring>
+
 namespace cuco::detail {
 
 /**
- * @brief Asynchronous memory copy utility that works around cudaMemcpyAsync bugs
+ * @brief Asynchronous memory copy utility using cudaMemcpyBatchAsync when possible
  *
- * This function provides a drop-in replacement for cudaMemcpyAsync that uses
- * cudaMemcpyBatchAsync internally to work around known issues with cudaMemcpyAsync
- * when available (CUDA 12.8+). For older CUDA versions, it falls back to the
- * original cudaMemcpyAsync. The function automatically handles the different API
- * signatures between CUDA runtime versions.
+ * Uses cudaMemcpyBatchAsync for CUDA 12.8+ with proper edge case handling.
+ * Falls back to cudaMemcpyAsync for older CUDA versions or edge cases.
  *
  * @param dst Destination memory address
  * @param src Source memory address
  * @param count Number of bytes to copy
- * @param kind Type of memory copy (cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, etc.)
- * @param stream CUDA stream for the asynchronous operation
+ * @param kind Memory copy direction
+ * @param stream CUDA stream for the operation
  */
 inline void memcpy_async(
-  void* dst, const void* src, size_t count, cudaMemcpyKind kind, cuda::stream_ref stream)
+  void* dst, void const* src, size_t count, cudaMemcpyKind kind, cuda::stream_ref stream)
 {
+  if (dst == nullptr || src == nullptr || count == 0) { return; }
+
 #if CUDART_VERSION >= 12080
-  // CUDA 12.8+ - Use cudaMemcpyBatchAsync as a workaround for cudaMemcpyAsync bugs
-  void* dsts[1]                 = {dst};
-  void* srcs[1]                 = {const_cast<void*>(src)};
-  size_t sizes[1]               = {count};
-  cudaMemcpyAttributes attrs[1] = {{.srcAccessOrder = cudaMemcpySrcAccessOrderStream}};
-  size_t attrsIdxs[1]           = {0};
+  if (stream.get() == 0) {
+    CUCO_CUDA_TRY(cudaMemcpyAsync(dst, src, count, kind, stream.get()));
+    return;
+  }
+
+  void* dsts[1]             = {dst};
+  void* srcs[1]             = {const_cast<void*>(src)};
+  std::size_t sizes[1]      = {count};
+  std::size_t attrs_idxs[1] = {0};
+
+  cudaMemcpyAttributes attrs[1] = {};
+  attrs[0].srcAccessOrder       = cudaMemcpySrcAccessOrderStream;
+  attrs[0].flags                = cudaMemcpyFlagPreferOverlapWithCompute;
 
 #if CUDART_VERSION >= 13000
-  // CUDA 13.0+ API - no failIdx parameter
-  CUCO_CUDA_TRY(cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrsIdxs, 1, stream.get()));
+  CUCO_CUDA_TRY(cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, stream.get()));
 #else
-  // CUDA 12.8-12.x API - requires failIdx parameter
-  size_t failIdx;
+  std::size_t fail_idx;
   CUCO_CUDA_TRY(
-    cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrsIdxs, 1, &failIdx, stream.get()));
-#endif
-
+    cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, &fail_idx, stream.get()));
+#endif  // CUDART_VERSION >= 13000
 #else
-  // CUDA 12.0-12.7 - Fall back to original cudaMemcpyAsync
-  // Note: This may still have the original bugs that cudaMemcpyBatchAsync was designed to fix
+  // CUDA < 12.8 - use regular cudaMemcpyAsync
   CUCO_CUDA_TRY(cudaMemcpyAsync(dst, src, count, kind, stream.get()));
-#endif
+#endif  // CUDART_VERSION >= 12080
 }
-
 }  // namespace cuco::detail

From d7354796173aca204f5e6b33e3c8c54e56195764 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 5 Dec 2025 16:48:05 -0800
Subject: [PATCH 4/6] Header cleanups

---
 include/cuco/detail/utility/memcpy_async.cuh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/cuco/detail/utility/memcpy_async.cuh b/include/cuco/detail/utility/memcpy_async.cuh
index d97406335..655a94438 100644
--- a/include/cuco/detail/utility/memcpy_async.cuh
+++ b/include/cuco/detail/utility/memcpy_async.cuh
@@ -20,8 +20,6 @@
 
 #include <cuda/stream_ref>
 
-#include <cstring>
-
 namespace cuco::detail {
 
 /**

From 56246c6344b3e470491f66b2e5002c4df2d88319 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Sat, 6 Dec 2025 18:06:09 -0800
Subject: [PATCH 5/6] Update detail memcpy_async to return CUDA error

---
 .../detail/hyperloglog/hyperloglog_impl.cuh   | 12 +++----
 .../open_addressing/open_addressing_impl.cuh  |  6 ++--
 include/cuco/detail/static_map.inl            | 36 +++++++++----------
 include/cuco/detail/utility/memcpy_async.cuh  | 17 ++++-----
 4 files changed, 34 insertions(+), 37 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh b/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh
index e459ef6a7..b4edf12fb 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh
@@ -19,7 +19,7 @@
 #include <cuco/detail/error.hpp>
 #include <cuco/detail/hyperloglog/finalizer.cuh>
 #include <cuco/detail/hyperloglog/kernels.cuh>
-#include <cuco/detail/utility/memcpy_async.cuh>
+#include <cuco/detail/utility/memcpy_async.hpp>
 #include <cuco/detail/utils.hpp>
 #include <cuco/hash_functions.cuh>
 #include <cuco/types.cuh>
@@ -421,11 +421,11 @@ class hyperloglog_impl {
     std::vector<register_type> host_sketch(num_regs);
 
     // TODO check if storage is host accessible
-    cuco::detail::memcpy_async(host_sketch.data(),
-                               this->sketch_.data(),
-                               sizeof(register_type) * num_regs,
-                               cudaMemcpyDefault,
-                               stream);
+    CUCO_CUDA_TRY(cuco::detail::memcpy_async(host_sketch.data(),
+                                             this->sketch_.data(),
+                                             sizeof(register_type) * num_regs,
+                                             cudaMemcpyDefault,
+                                             stream));
 #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
     stream.sync();
 #else
diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh
index 246e3f7ed..b176f6092 100644
--- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh
+++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh
@@ -21,7 +21,7 @@
 #include <cuco/detail/open_addressing/kernels.cuh>
 #include <cuco/detail/storage/counter_storage.cuh>
 #include <cuco/detail/utility/cuda.hpp>
-#include <cuco/detail/utility/memcpy_async.cuh>
+#include <cuco/detail/utility/memcpy_async.hpp>
 #include <cuco/detail/utils.hpp>
 #include <cuco/extent.cuh>
 #include <cuco/operator.hpp>
@@ -883,8 +883,8 @@ class open_addressing_impl {
                                           stream.get()));
 
       size_type temp_count;
-      cuco::detail::memcpy_async(
-        &temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream);
+      CUCO_CUDA_TRY(cuco::detail::memcpy_async(
+        &temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream));
 #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
       stream.sync();
 #else
diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl
index 6246cdc02..a312f6f96 100644
--- a/include/cuco/detail/static_map.inl
+++ b/include/cuco/detail/static_map.inl
@@ -16,7 +16,7 @@
 
 #include <cuco/detail/bitwise_compare.cuh>
 #include <cuco/detail/error.hpp>
-#include <cuco/detail/utility/memcpy_async.cuh>
+#include <cuco/detail/utility/memcpy_async.hpp>
 #include <cuco/detail/utils.cuh>
 #include <cuco/detail/utils.hpp>
 
@@ -109,11 +109,11 @@ void static_map<Key, Value, Scope, Allocator>::insert(
 
   detail::insert<block_size, tile_size>
     <<<grid_size, block_size, 0, stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
-  cuco::detail::memcpy_async(&h_num_successes,
-                             num_successes_,
-                             sizeof(atomic_ctr_type),
-                             cudaMemcpyDeviceToHost,
-                             cuda::stream_ref{stream});
+  CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes,
+                                           num_successes_,
+                                           sizeof(atomic_ctr_type),
+                                           cudaMemcpyDeviceToHost,
+                                           cuda::stream_ref{stream}));
 
   CUCO_CUDA_TRY(cudaStreamSynchronize(stream));  // stream sync to ensure h_num_successes is updated
 
@@ -150,11 +150,11 @@ void static_map<Key, Value, Scope, Allocator>::insert_if(InputIt first,
 
   detail::insert_if_n<block_size, tile_size><<<grid_size, block_size, 0, stream>>>(
     first, num_keys, num_successes_, view, stencil, pred, hash, key_equal);
-  cuco::detail::memcpy_async(&h_num_successes,
-                             num_successes_,
-                             sizeof(atomic_ctr_type),
-                             cudaMemcpyDeviceToHost,
-                             cuda::stream_ref{stream});
+  CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes,
+                                           num_successes_,
+                                           sizeof(atomic_ctr_type),
+                                           cudaMemcpyDeviceToHost,
+                                           cuda::stream_ref{stream}));
   CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
 
   size_ += h_num_successes;
@@ -185,11 +185,11 @@ void static_map<Key, Value, Scope, Allocator>::erase(
 
   detail::erase<block_size, tile_size>
     <<<grid_size, block_size, 0, stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
-  cuco::detail::memcpy_async(&h_num_successes,
-                             num_successes_,
-                             sizeof(atomic_ctr_type),
-                             cudaMemcpyDeviceToHost,
-                             cuda::stream_ref{stream});
+  CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes,
+                                           num_successes_,
+                                           sizeof(atomic_ctr_type),
+                                           cudaMemcpyDeviceToHost,
+                                           cuda::stream_ref{stream}));
 
   CUCO_CUDA_TRY(cudaStreamSynchronize(stream));  // stream sync to ensure h_num_successes is updated
 
@@ -259,8 +259,8 @@ std::pair<KeyOut, ValueOut> static_map<Key, Value, Scope, Allocator>::retrieve_a
                         stream);
 
   std::size_t h_num_out;
-  cuco::detail::memcpy_async(
-    &h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, cuda::stream_ref{stream});
+  CUCO_CUDA_TRY(cuco::detail::memcpy_async(
+    &h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, cuda::stream_ref{stream}));
   CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
   temp_allocator.deallocate(
     reinterpret_cast<char*>(d_num_out), sizeof(std::size_t), cuda::stream_ref{stream});
diff --git a/include/cuco/detail/utility/memcpy_async.cuh b/include/cuco/detail/utility/memcpy_async.cuh
index 655a94438..fed6d6b3f 100644
--- a/include/cuco/detail/utility/memcpy_async.cuh
+++ b/include/cuco/detail/utility/memcpy_async.cuh
@@ -33,17 +33,15 @@ namespace cuco::detail {
  * @param count Number of bytes to copy
  * @param kind Memory copy direction
  * @param stream CUDA stream for the operation
+ * @return cudaError_t Error code from the memory copy operation
  */
-inline void memcpy_async(
+[[nodiscard]] inline cudaError_t memcpy_async(
   void* dst, void const* src, size_t count, cudaMemcpyKind kind, cuda::stream_ref stream)
 {
-  if (dst == nullptr || src == nullptr || count == 0) { return; }
+  if (dst == nullptr || src == nullptr || count == 0) { return cudaSuccess; }
 
 #if CUDART_VERSION >= 12080
-  if (stream.get() == 0) {
-    CUCO_CUDA_TRY(cudaMemcpyAsync(dst, src, count, kind, stream.get()));
-    return;
-  }
+  if (stream.get() == 0) { return cudaMemcpyAsync(dst, src, count, kind, stream.get()); }
 
   void* dsts[1]             = {dst};
   void* srcs[1]             = {const_cast<void*>(src)};
@@ -55,15 +53,14 @@ inline void memcpy_async(
   attrs[0].flags                = cudaMemcpyFlagPreferOverlapWithCompute;
 
 #if CUDART_VERSION >= 13000
-  CUCO_CUDA_TRY(cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, stream.get()));
+  return cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, stream.get());
 #else
   std::size_t fail_idx;
-  CUCO_CUDA_TRY(
-    cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, &fail_idx, stream.get()));
+  return cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, &fail_idx, stream.get());
 #endif  // CUDART_VERSION >= 13000
 #else
   // CUDA < 12.8 - use regular cudaMemcpyAsync
-  CUCO_CUDA_TRY(cudaMemcpyAsync(dst, src, count, kind, stream.get()));
+  return cudaMemcpyAsync(dst, src, count, kind, stream.get());
 #endif  // CUDART_VERSION >= 12080
 }
 }  // namespace cuco::detail

From fbe624940377558dc310b5d4bfae228607278cd4 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Sat, 6 Dec 2025 19:48:54 -0800
Subject: [PATCH 6/6] Use C++ header instead of CUDA header

---
 .../detail/utility/{memcpy_async.cuh => memcpy_async.hpp}    | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
 rename include/cuco/detail/utility/{memcpy_async.cuh => memcpy_async.hpp} (95%)

diff --git a/include/cuco/detail/utility/memcpy_async.cuh b/include/cuco/detail/utility/memcpy_async.hpp
similarity index 95%
rename from include/cuco/detail/utility/memcpy_async.cuh
rename to include/cuco/detail/utility/memcpy_async.hpp
index fed6d6b3f..887c94447 100644
--- a/include/cuco/detail/utility/memcpy_async.cuh
+++ b/include/cuco/detail/utility/memcpy_async.hpp
@@ -20,6 +20,8 @@
 
 #include <cuda/stream_ref>
 
+#include <cstddef>
+
 namespace cuco::detail {
 
 /**
@@ -41,7 +43,7 @@ namespace cuco::detail {
   if (dst == nullptr || src == nullptr || count == 0) { return cudaSuccess; }
 
 #if CUDART_VERSION >= 12080
-  if (stream.get() == 0) { return cudaMemcpyAsync(dst, src, count, kind, stream.get()); }
+  if (stream.get() == nullptr) { return cudaMemcpyAsync(dst, src, count, kind, stream.get()); }
 
   void* dsts[1]             = {dst};
   void* srcs[1]             = {const_cast<void*>(src)};
@@ -63,4 +65,5 @@ namespace cuco::detail {
   return cudaMemcpyAsync(dst, src, count, kind, stream.get());
 #endif  // CUDART_VERSION >= 12080
 }
+
 }  // namespace cuco::detail