Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions include/cuco/detail/hyperloglog/hyperloglog_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <cuco/detail/error.hpp>
#include <cuco/detail/hyperloglog/finalizer.cuh>
#include <cuco/detail/hyperloglog/kernels.cuh>
#include <cuco/detail/utility/memcpy_async.hpp>
#include <cuco/detail/utility/strong_type.cuh>
#include <cuco/detail/utils.hpp>
#include <cuco/hash_functions.cuh>
Expand Down Expand Up @@ -457,11 +458,11 @@ class hyperloglog_impl {
std::vector<register_type> host_sketch(num_regs);

// TODO check if storage is host accessible
CUCO_CUDA_TRY(cudaMemcpyAsync(host_sketch.data(),
this->sketch_.data(),
sizeof(register_type) * num_regs,
cudaMemcpyDefault,
stream.get()));
CUCO_CUDA_TRY(cuco::detail::memcpy_async(host_sketch.data(),
this->sketch_.data(),
sizeof(register_type) * num_regs,
cudaMemcpyDefault,
stream));
#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
stream.sync();
#else
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <cuco/detail/open_addressing/kernels.cuh>
#include <cuco/detail/storage/counter_storage.cuh>
#include <cuco/detail/utility/cuda.hpp>
#include <cuco/detail/utility/memcpy_async.hpp>
#include <cuco/detail/utils.hpp>
#include <cuco/extent.cuh>
#include <cuco/operator.hpp>
Expand Down Expand Up @@ -882,8 +883,8 @@ class open_addressing_impl {
stream.get()));

size_type temp_count;
CUCO_CUDA_TRY(cudaMemcpyAsync(
&temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
CUCO_CUDA_TRY(cuco::detail::memcpy_async(
&temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream));
#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
stream.sync();
#else
Expand Down
26 changes: 18 additions & 8 deletions include/cuco/detail/static_map.inl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include <cuco/detail/bitwise_compare.cuh>
#include <cuco/detail/error.hpp>
#include <cuco/detail/utility/memcpy_async.hpp>
#include <cuco/detail/utils.cuh>
#include <cuco/detail/utils.hpp>

Expand Down Expand Up @@ -108,8 +109,11 @@ void static_map<Key, Value, Scope, Allocator>::insert(

detail::insert<block_size, tile_size>
<<<grid_size, block_size, 0, stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
CUCO_CUDA_TRY(cudaMemcpyAsync(
&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes,
num_successes_,
sizeof(atomic_ctr_type),
cudaMemcpyDeviceToHost,
cuda::stream_ref{stream}));

CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated

Expand Down Expand Up @@ -146,8 +150,11 @@ void static_map<Key, Value, Scope, Allocator>::insert_if(InputIt first,

detail::insert_if_n<block_size, tile_size><<<grid_size, block_size, 0, stream>>>(
first, num_keys, num_successes_, view, stencil, pred, hash, key_equal);
CUCO_CUDA_TRY(cudaMemcpyAsync(
&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes,
num_successes_,
sizeof(atomic_ctr_type),
cudaMemcpyDeviceToHost,
cuda::stream_ref{stream}));
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));

size_ += h_num_successes;
Expand Down Expand Up @@ -178,8 +185,11 @@ void static_map<Key, Value, Scope, Allocator>::erase(

detail::erase<block_size, tile_size>
<<<grid_size, block_size, 0, stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
CUCO_CUDA_TRY(cudaMemcpyAsync(
&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes,
num_successes_,
sizeof(atomic_ctr_type),
cudaMemcpyDeviceToHost,
cuda::stream_ref{stream}));

CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated

Expand Down Expand Up @@ -249,8 +259,8 @@ std::pair<KeyOut, ValueOut> static_map<Key, Value, Scope, Allocator>::retrieve_a
stream);

std::size_t h_num_out;
CUCO_CUDA_TRY(
cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, stream));
CUCO_CUDA_TRY(cuco::detail::memcpy_async(
&h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, cuda::stream_ref{stream}));
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
temp_allocator.deallocate(
reinterpret_cast<char*>(d_num_out), sizeof(std::size_t), cuda::stream_ref{stream});
Expand Down
69 changes: 69 additions & 0 deletions include/cuco/detail/utility/memcpy_async.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cuco/detail/error.hpp>

#include <cuda/stream_ref>

#include <cstddef>

namespace cuco::detail {

/**
* @brief Asynchronous memory copy utility using cudaMemcpyBatchAsync when possible
*
* Uses cudaMemcpyBatchAsync for CUDA 12.8+ with proper edge case handling.
* Falls back to cudaMemcpyAsync for older CUDA versions or edge cases.
*
* @param dst Destination memory address
* @param src Source memory address
* @param count Number of bytes to copy
* @param kind Memory copy direction
* @param stream CUDA stream for the operation
* @return cudaError_t Error code from the memory copy operation
*/
[[nodiscard]] inline cudaError_t memcpy_async(
void* dst, void const* src, size_t count, cudaMemcpyKind kind, cuda::stream_ref stream)
{
if (dst == nullptr || src == nullptr || count == 0) { return cudaSuccess; }

#if CUDART_VERSION >= 12080
if (stream.get() == nullptr) { return cudaMemcpyAsync(dst, src, count, kind, stream.get()); }

void* dsts[1] = {dst};
void* srcs[1] = {const_cast<void*>(src)};
std::size_t sizes[1] = {count};
std::size_t attrs_idxs[1] = {0};

cudaMemcpyAttributes attrs[1] = {};
attrs[0].srcAccessOrder = cudaMemcpySrcAccessOrderStream;
attrs[0].flags = cudaMemcpyFlagPreferOverlapWithCompute;

#if CUDART_VERSION >= 13000
return cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, stream.get());
#else
std::size_t fail_idx;
return cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, &fail_idx, stream.get());
#endif // CUDART_VERSION >= 13000
#else
// CUDA < 12.8 - use regular cudaMemcpyAsync
return cudaMemcpyAsync(dst, src, count, kind, stream.get());
#endif // CUDART_VERSION >= 12080
}

} // namespace cuco::detail
Loading