diff --git a/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh b/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh index cd692e899..b30029a9f 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -457,11 +458,11 @@ class hyperloglog_impl { std::vector host_sketch(num_regs); // TODO check if storage is host accessible - CUCO_CUDA_TRY(cudaMemcpyAsync(host_sketch.data(), - this->sketch_.data(), - sizeof(register_type) * num_regs, - cudaMemcpyDefault, - stream.get())); + CUCO_CUDA_TRY(cuco::detail::memcpy_async(host_sketch.data(), + this->sketch_.data(), + sizeof(register_type) * num_regs, + cudaMemcpyDefault, + stream)); #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) stream.sync(); #else diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 9bc781e83..b176f6092 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -882,8 +883,8 @@ class open_addressing_impl { stream.get())); size_type temp_count; - CUCO_CUDA_TRY(cudaMemcpyAsync( - &temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get())); + CUCO_CUDA_TRY(cuco::detail::memcpy_async( + &temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream)); #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) stream.sync(); #else diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index 79bd5f08c..a312f6f96 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -108,8 +109,11 @@ void static_map::insert( detail::insert <<>>(first, num_keys, num_successes_, view, hash, key_equal); - CUCO_CUDA_TRY(cudaMemcpyAsync( - &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); + CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes, + num_successes_, + sizeof(atomic_ctr_type), + cudaMemcpyDeviceToHost, + cuda::stream_ref{stream})); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated @@ -146,8 +150,11 @@ void static_map::insert_if(InputIt first, detail::insert_if_n<<>>( first, num_keys, num_successes_, view, stencil, pred, hash, key_equal); - CUCO_CUDA_TRY(cudaMemcpyAsync( - &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); + CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes, + num_successes_, + sizeof(atomic_ctr_type), + cudaMemcpyDeviceToHost, + cuda::stream_ref{stream})); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); size_ += h_num_successes; @@ -178,8 +185,11 @@ void static_map::erase( detail::erase <<>>(first, num_keys, num_successes_, view, hash, key_equal); - CUCO_CUDA_TRY(cudaMemcpyAsync( - &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); + CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes, + num_successes_, + sizeof(atomic_ctr_type), + cudaMemcpyDeviceToHost, + cuda::stream_ref{stream})); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated @@ -249,8 +259,8 @@ std::pair static_map::retrieve_a stream); std::size_t h_num_out; - CUCO_CUDA_TRY( - cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, stream)); + CUCO_CUDA_TRY(cuco::detail::memcpy_async( + &h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, cuda::stream_ref{stream})); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); temp_allocator.deallocate( reinterpret_cast(d_num_out), sizeof(std::size_t), cuda::stream_ref{stream}); diff --git a/include/cuco/detail/utility/memcpy_async.hpp b/include/cuco/detail/utility/memcpy_async.hpp new file mode 100644 index 000000000..887c94447 --- /dev/null +++ b/include/cuco/detail/utility/memcpy_async.hpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +#include + +namespace cuco::detail { + +/** + * @brief Asynchronous memory copy utility using cudaMemcpyBatchAsync when possible + * + * Uses cudaMemcpyBatchAsync for CUDA 12.8+ with proper edge case handling. + * Falls back to cudaMemcpyAsync for older CUDA versions or edge cases. + * + * @param dst Destination memory address + * @param src Source memory address + * @param count Number of bytes to copy + * @param kind Memory copy direction + * @param stream CUDA stream for the operation + * @return cudaError_t Error code from the memory copy operation + */ +[[nodiscard]] inline cudaError_t memcpy_async( + void* dst, void const* src, size_t count, cudaMemcpyKind kind, cuda::stream_ref stream) +{ + if (dst == nullptr || src == nullptr || count == 0) { return cudaSuccess; } + +#if CUDART_VERSION >= 12080 + if (stream.get() == nullptr) { return cudaMemcpyAsync(dst, src, count, kind, stream.get()); } + + void* dsts[1] = {dst}; + void* srcs[1] = {const_cast(src)}; + std::size_t sizes[1] = {count}; + std::size_t attrs_idxs[1] = {0}; + + cudaMemcpyAttributes attrs[1] = {}; + attrs[0].srcAccessOrder = cudaMemcpySrcAccessOrderStream; + attrs[0].flags = cudaMemcpyFlagPreferOverlapWithCompute; + +#if CUDART_VERSION >= 13000 + return cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, stream.get()); +#else + std::size_t fail_idx; + return cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, &fail_idx, stream.get()); +#endif // CUDART_VERSION >= 13000 +#else + // CUDA < 12.8 - use regular cudaMemcpyAsync + return cudaMemcpyAsync(dst, src, count, kind, stream.get()); +#endif // CUDART_VERSION >= 12080 +} + +} // namespace cuco::detail