NVIDIA · PointKernel · Dec 5, 2025 · Dec 5, 2025 · Dec 6, 2025 · Dec 6, 2025
@@ -19,6 +19,7 @@
 #include <cuco/detail/error.hpp>
 #include <cuco/detail/hyperloglog/finalizer.cuh>
 #include <cuco/detail/hyperloglog/kernels.cuh>
+#include <cuco/detail/utility/memcpy_async.hpp>
 #include <cuco/detail/utility/strong_type.cuh>
 #include <cuco/detail/utils.hpp>
 #include <cuco/hash_functions.cuh>
@@ -457,11 +458,11 @@ class hyperloglog_impl {
     std::vector<register_type> host_sketch(num_regs);
 
     // TODO check if storage is host accessible
-    CUCO_CUDA_TRY(cudaMemcpyAsync(host_sketch.data(),
-                                  this->sketch_.data(),
-                                  sizeof(register_type) * num_regs,
-                                  cudaMemcpyDefault,
-                                  stream.get()));
+    CUCO_CUDA_TRY(cuco::detail::memcpy_async(host_sketch.data(),
+                                             this->sketch_.data(),
+                                             sizeof(register_type) * num_regs,
+                                             cudaMemcpyDefault,
+                                             stream));
 #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
     stream.sync();
 #else

@@ -21,6 +21,7 @@
 #include <cuco/detail/open_addressing/kernels.cuh>
 #include <cuco/detail/storage/counter_storage.cuh>
 #include <cuco/detail/utility/cuda.hpp>
+#include <cuco/detail/utility/memcpy_async.hpp>
 #include <cuco/detail/utils.hpp>
 #include <cuco/extent.cuh>
 #include <cuco/operator.hpp>
@@ -882,8 +883,8 @@ class open_addressing_impl {
                                           stream.get()));
 
       size_type temp_count;
-      CUCO_CUDA_TRY(cudaMemcpyAsync(
-        &temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
+      CUCO_CUDA_TRY(cuco::detail::memcpy_async(
+        &temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream));
 #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
       stream.sync();
 #else

@@ -16,6 +16,7 @@
 
 #include <cuco/detail/bitwise_compare.cuh>
 #include <cuco/detail/error.hpp>
+#include <cuco/detail/utility/memcpy_async.hpp>
 #include <cuco/detail/utils.cuh>
 #include <cuco/detail/utils.hpp>
 
@@ -108,8 +109,11 @@ void static_map<Key, Value, Scope, Allocator>::insert(
 
   detail::insert<block_size, tile_size>
     <<<grid_size, block_size, 0, stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
-  CUCO_CUDA_TRY(cudaMemcpyAsync(
-    &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
+  CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes,
+                                           num_successes_,
+                                           sizeof(atomic_ctr_type),
+                                           cudaMemcpyDeviceToHost,
+                                           cuda::stream_ref{stream}));
 
   CUCO_CUDA_TRY(cudaStreamSynchronize(stream));  // stream sync to ensure h_num_successes is updated
 
@@ -146,8 +150,11 @@ void static_map<Key, Value, Scope, Allocator>::insert_if(InputIt first,
 
   detail::insert_if_n<block_size, tile_size><<<grid_size, block_size, 0, stream>>>(
     first, num_keys, num_successes_, view, stencil, pred, hash, key_equal);
-  CUCO_CUDA_TRY(cudaMemcpyAsync(
-    &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
+  CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes,
+                                           num_successes_,
+                                           sizeof(atomic_ctr_type),
+                                           cudaMemcpyDeviceToHost,
+                                           cuda::stream_ref{stream}));
   CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
 
   size_ += h_num_successes;
@@ -178,8 +185,11 @@ void static_map<Key, Value, Scope, Allocator>::erase(
 
   detail::erase<block_size, tile_size>
     <<<grid_size, block_size, 0, stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
-  CUCO_CUDA_TRY(cudaMemcpyAsync(
-    &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
+  CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes,
+                                           num_successes_,
+                                           sizeof(atomic_ctr_type),
+                                           cudaMemcpyDeviceToHost,
+                                           cuda::stream_ref{stream}));
 
   CUCO_CUDA_TRY(cudaStreamSynchronize(stream));  // stream sync to ensure h_num_successes is updated
 
@@ -249,8 +259,8 @@ std::pair<KeyOut, ValueOut> static_map<Key, Value, Scope, Allocator>::retrieve_a
                         stream);
 
   std::size_t h_num_out;
-  CUCO_CUDA_TRY(
-    cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, stream));
+  CUCO_CUDA_TRY(cuco::detail::memcpy_async(
+    &h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, cuda::stream_ref{stream}));
   CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
   temp_allocator.deallocate(
     reinterpret_cast<char*>(d_num_out), sizeof(std::size_t), cuda::stream_ref{stream});

@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuco/detail/error.hpp>
+
+#include <cuda/stream_ref>
+
+#include <cstddef>
+
+namespace cuco::detail {
+
+/**
+ * @brief Asynchronous memory copy utility using cudaMemcpyBatchAsync when possible
+ *
+ * Uses cudaMemcpyBatchAsync for CUDA 12.8+ with proper edge case handling.
+ * Falls back to cudaMemcpyAsync for older CUDA versions or edge cases.
+ *
+ * @param dst Destination memory address
+ * @param src Source memory address
+ * @param count Number of bytes to copy
+ * @param kind Memory copy direction
+ * @param stream CUDA stream for the operation
+ * @return cudaError_t Error code from the memory copy operation
+ */
+[[nodiscard]] inline cudaError_t memcpy_async(
+  void* dst, void const* src, size_t count, cudaMemcpyKind kind, cuda::stream_ref stream)
+{
+  if (dst == nullptr || src == nullptr || count == 0) { return cudaSuccess; }
+
+#if CUDART_VERSION >= 12080
+  if (stream.get() == nullptr) { return cudaMemcpyAsync(dst, src, count, kind, stream.get()); }
+
+  void* dsts[1]             = {dst};
+  void* srcs[1]             = {const_cast<void*>(src)};
+  std::size_t sizes[1]      = {count};
+  std::size_t attrs_idxs[1] = {0};
+
+  cudaMemcpyAttributes attrs[1] = {};
+  attrs[0].srcAccessOrder       = cudaMemcpySrcAccessOrderStream;
+  attrs[0].flags                = cudaMemcpyFlagPreferOverlapWithCompute;
+
+#if CUDART_VERSION >= 13000
+  return cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, stream.get());
+#else
+  std::size_t fail_idx;
+  return cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, &fail_idx, stream.get());
+#endif  // CUDART_VERSION >= 13000
+#else
+  // CUDA < 12.8 - use regular cudaMemcpyAsync
+  return cudaMemcpyAsync(dst, src, count, kind, stream.get());
+#endif  // CUDART_VERSION >= 12080
+}
+
+}  // namespace cuco::detail