From ee5addf697f4c3aa5e2038c1ec93331d22025dcd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 5 Dec 2025 10:54:12 -0800 Subject: [PATCH 1/6] Repalce cudaMemcpyAsync with cudaMemcpyBatchAsync to get rid of a driver locking bug --- .../detail/hyperloglog/hyperloglog_impl.cuh | 11 ++-- .../open_addressing/open_addressing_impl.cuh | 5 +- include/cuco/detail/static_map.inl | 26 +++++--- include/cuco/detail/utility/memcpy_async.cuh | 60 +++++++++++++++++++ 4 files changed, 87 insertions(+), 15 deletions(-) create mode 100644 include/cuco/detail/utility/memcpy_async.cuh diff --git a/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh b/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh index 6c8bb2eff..e459ef6a7 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -420,11 +421,11 @@ class hyperloglog_impl { std::vector host_sketch(num_regs); // TODO check if storage is host accessible - CUCO_CUDA_TRY(cudaMemcpyAsync(host_sketch.data(), - this->sketch_.data(), - sizeof(register_type) * num_regs, - cudaMemcpyDefault, - stream.get())); + cuco::detail::memcpy_async(host_sketch.data(), + this->sketch_.data(), + sizeof(register_type) * num_regs, + cudaMemcpyDefault, + stream); #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) stream.sync(); #else diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 9bc781e83..246e3f7ed 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -882,8 +883,8 @@ class open_addressing_impl { stream.get())); size_type temp_count; - CUCO_CUDA_TRY(cudaMemcpyAsync( - &temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get())); + cuco::detail::memcpy_async( + &temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream); #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) stream.sync(); #else diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index 79bd5f08c..6246cdc02 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -108,8 +109,11 @@ void static_map::insert( detail::insert <<>>(first, num_keys, num_successes_, view, hash, key_equal); - CUCO_CUDA_TRY(cudaMemcpyAsync( - &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); + cuco::detail::memcpy_async(&h_num_successes, + num_successes_, + sizeof(atomic_ctr_type), + cudaMemcpyDeviceToHost, + cuda::stream_ref{stream}); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated @@ -146,8 +150,11 @@ void static_map::insert_if(InputIt first, detail::insert_if_n<<>>( first, num_keys, num_successes_, view, stencil, pred, hash, key_equal); - CUCO_CUDA_TRY(cudaMemcpyAsync( - &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); + cuco::detail::memcpy_async(&h_num_successes, + num_successes_, + sizeof(atomic_ctr_type), + cudaMemcpyDeviceToHost, + cuda::stream_ref{stream}); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); size_ += h_num_successes; @@ -178,8 +185,11 @@ void static_map::erase( detail::erase <<>>(first, num_keys, num_successes_, view, hash, key_equal); - CUCO_CUDA_TRY(cudaMemcpyAsync( - &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); + cuco::detail::memcpy_async(&h_num_successes, + num_successes_, + sizeof(atomic_ctr_type), + cudaMemcpyDeviceToHost, + cuda::stream_ref{stream}); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated @@ -249,8 +259,8 @@ std::pair static_map::retrieve_a stream); std::size_t h_num_out; - CUCO_CUDA_TRY( - cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, stream)); + cuco::detail::memcpy_async( + &h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, cuda::stream_ref{stream}); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); temp_allocator.deallocate( reinterpret_cast(d_num_out), sizeof(std::size_t), cuda::stream_ref{stream}); diff --git a/include/cuco/detail/utility/memcpy_async.cuh b/include/cuco/detail/utility/memcpy_async.cuh new file mode 100644 index 000000000..eabc2f620 --- /dev/null +++ b/include/cuco/detail/utility/memcpy_async.cuh @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cuco::detail { + +/** + * @brief Asynchronous memory copy utility that works around cudaMemcpyAsync bugs + * + * This function provides a drop-in replacement for cudaMemcpyAsync that uses + * cudaMemcpyBatchAsync internally to work around known issues with cudaMemcpyAsync. + * The function automatically handles the different API signatures between CUDA + * runtime versions. + * + * @param dst Destination memory address + * @param src Source memory address + * @param count Number of bytes to copy + * @param kind Type of memory copy (cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, etc.) + * @param stream CUDA stream for the asynchronous operation + */ +inline void memcpy_async( + void* dst, const void* src, size_t count, cudaMemcpyKind kind, cuda::stream_ref stream) +{ + // Use cudaMemcpyBatchAsync as a workaround for cudaMemcpyAsync bugs + void* dsts[1] = {dst}; + void* srcs[1] = {const_cast(src)}; + size_t sizes[1] = {count}; + cudaMemcpyAttributes attrs[1] = {{.srcAccessOrder = cudaMemcpySrcAccessOrderStream}}; + size_t attrsIdxs[1] = {0}; + +#if CUDART_VERSION >= 13000 + // CUDA 13.0+ API - no failIdx parameter + CUCO_CUDA_TRY(cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrsIdxs, 1, stream.get())); +#else + // CUDA 12.x API - requires failIdx parameter + size_t failIdx; + CUCO_CUDA_TRY( + cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrsIdxs, 1, &failIdx, stream.get())); +#endif +} + +} // namespace cuco::detail From f034a2662cffed9cf590b9452f08ac1567439a8d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 5 Dec 2025 11:02:36 -0800 Subject: [PATCH 2/6] Fix pre-12.8 compatibility --- include/cuco/detail/utility/memcpy_async.cuh | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/include/cuco/detail/utility/memcpy_async.cuh b/include/cuco/detail/utility/memcpy_async.cuh index eabc2f620..f64238eeb 100644 --- a/include/cuco/detail/utility/memcpy_async.cuh +++ b/include/cuco/detail/utility/memcpy_async.cuh @@ -26,9 +26,10 @@ namespace cuco::detail { * @brief Asynchronous memory copy utility that works around cudaMemcpyAsync bugs * * This function provides a drop-in replacement for cudaMemcpyAsync that uses - * cudaMemcpyBatchAsync internally to work around known issues with cudaMemcpyAsync. - * The function automatically handles the different API signatures between CUDA - * runtime versions. + * cudaMemcpyBatchAsync internally to work around known issues with cudaMemcpyAsync + * when available (CUDA 12.8+). For older CUDA versions, it falls back to the + * original cudaMemcpyAsync. The function automatically handles the different API + * signatures between CUDA runtime versions. * * @param dst Destination memory address * @param src Source memory address @@ -39,7 +40,8 @@ namespace cuco::detail { inline void memcpy_async( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cuda::stream_ref stream) { - // Use cudaMemcpyBatchAsync as a workaround for cudaMemcpyAsync bugs +#if CUDART_VERSION >= 12080 + // CUDA 12.8+ - Use cudaMemcpyBatchAsync as a workaround for cudaMemcpyAsync bugs void* dsts[1] = {dst}; void* srcs[1] = {const_cast(src)}; size_t sizes[1] = {count}; @@ -50,11 +52,17 @@ inline void memcpy_async( // CUDA 13.0+ API - no failIdx parameter CUCO_CUDA_TRY(cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrsIdxs, 1, stream.get())); #else - // CUDA 12.x API - requires failIdx parameter + // CUDA 12.8-12.x API - requires failIdx parameter size_t failIdx; CUCO_CUDA_TRY( cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrsIdxs, 1, &failIdx, stream.get())); #endif + +#else + // CUDA 12.0-12.7 - Fall back to original cudaMemcpyAsync + // Note: This may still have the original bugs that cudaMemcpyBatchAsync was designed to fix + CUCO_CUDA_TRY(cudaMemcpyAsync(dst, src, count, kind, stream.get())); +#endif } } // namespace cuco::detail From 17aa5505e6fbb1ce4da923de6ef066f74066db80 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 5 Dec 2025 16:46:44 -0800 Subject: [PATCH 3/6] Fix edge cases for cudaMemcpyBatchAsync --- include/cuco/detail/utility/memcpy_async.cuh | 55 +++++++++++--------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/include/cuco/detail/utility/memcpy_async.cuh b/include/cuco/detail/utility/memcpy_async.cuh index f64238eeb..d97406335 100644 --- a/include/cuco/detail/utility/memcpy_async.cuh +++ b/include/cuco/detail/utility/memcpy_async.cuh @@ -20,49 +20,52 @@ #include +#include + namespace cuco::detail { /** - * @brief Asynchronous memory copy utility that works around cudaMemcpyAsync bugs + * @brief Asynchronous memory copy utility using cudaMemcpyBatchAsync when possible * - * This function provides a drop-in replacement for cudaMemcpyAsync that uses - * cudaMemcpyBatchAsync internally to work around known issues with cudaMemcpyAsync - * when available (CUDA 12.8+). For older CUDA versions, it falls back to the - * original cudaMemcpyAsync. The function automatically handles the different API - * signatures between CUDA runtime versions. + * Uses cudaMemcpyBatchAsync for CUDA 12.8+ with proper edge case handling. + * Falls back to cudaMemcpyAsync for older CUDA versions or edge cases. * * @param dst Destination memory address * @param src Source memory address * @param count Number of bytes to copy - * @param kind Type of memory copy (cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, etc.) - * @param stream CUDA stream for the asynchronous operation + * @param kind Memory copy direction + * @param stream CUDA stream for the operation */ inline void memcpy_async( - void* dst, const void* src, size_t count, cudaMemcpyKind kind, cuda::stream_ref stream) + void* dst, void const* src, size_t count, cudaMemcpyKind kind, cuda::stream_ref stream) { + if (dst == nullptr || src == nullptr || count == 0) { return; } + #if CUDART_VERSION >= 12080 - // CUDA 12.8+ - Use cudaMemcpyBatchAsync as a workaround for cudaMemcpyAsync bugs - void* dsts[1] = {dst}; - void* srcs[1] = {const_cast(src)}; - size_t sizes[1] = {count}; - cudaMemcpyAttributes attrs[1] = {{.srcAccessOrder = cudaMemcpySrcAccessOrderStream}}; - size_t attrsIdxs[1] = {0}; + if (stream.get() == 0) { + CUCO_CUDA_TRY(cudaMemcpyAsync(dst, src, count, kind, stream.get())); + return; + } + + void* dsts[1] = {dst}; + void* srcs[1] = {const_cast(src)}; + std::size_t sizes[1] = {count}; + std::size_t attrs_idxs[1] = {0}; + + cudaMemcpyAttributes attrs[1] = {}; + attrs[0].srcAccessOrder = cudaMemcpySrcAccessOrderStream; + attrs[0].flags = cudaMemcpyFlagPreferOverlapWithCompute; #if CUDART_VERSION >= 13000 - // CUDA 13.0+ API - no failIdx parameter - CUCO_CUDA_TRY(cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrsIdxs, 1, stream.get())); + CUCO_CUDA_TRY(cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, stream.get())); #else - // CUDA 12.8-12.x API - requires failIdx parameter - size_t failIdx; + std::size_t fail_idx; CUCO_CUDA_TRY( - cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrsIdxs, 1, &failIdx, stream.get())); -#endif - + cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, &fail_idx, stream.get())); +#endif // CUDART_VERSION >= 13000 #else - // CUDA 12.0-12.7 - Fall back to original cudaMemcpyAsync - // Note: This may still have the original bugs that cudaMemcpyBatchAsync was designed to fix + // CUDA < 12.8 - use regular cudaMemcpyAsync CUCO_CUDA_TRY(cudaMemcpyAsync(dst, src, count, kind, stream.get())); -#endif +#endif // CUDART_VERSION >= 12080 } - } // namespace cuco::detail From d7354796173aca204f5e6b33e3c8c54e56195764 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 5 Dec 2025 16:48:05 -0800 Subject: [PATCH 4/6] Header cleanups --- include/cuco/detail/utility/memcpy_async.cuh | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/cuco/detail/utility/memcpy_async.cuh b/include/cuco/detail/utility/memcpy_async.cuh index d97406335..655a94438 100644 --- a/include/cuco/detail/utility/memcpy_async.cuh +++ b/include/cuco/detail/utility/memcpy_async.cuh @@ -20,8 +20,6 @@ #include -#include - namespace cuco::detail { /** From 56246c6344b3e470491f66b2e5002c4df2d88319 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sat, 6 Dec 2025 18:06:09 -0800 Subject: [PATCH 5/6] Update detail memcpy_async to return CUDA error --- .../detail/hyperloglog/hyperloglog_impl.cuh | 12 +++---- .../open_addressing/open_addressing_impl.cuh | 6 ++-- include/cuco/detail/static_map.inl | 36 +++++++++---------- include/cuco/detail/utility/memcpy_async.cuh | 17 ++++----- 4 files changed, 34 insertions(+), 37 deletions(-) diff --git a/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh b/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh index e459ef6a7..b4edf12fb 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include @@ -421,11 +421,11 @@ class hyperloglog_impl { std::vector host_sketch(num_regs); // TODO check if storage is host accessible - cuco::detail::memcpy_async(host_sketch.data(), - this->sketch_.data(), - sizeof(register_type) * num_regs, - cudaMemcpyDefault, - stream); + CUCO_CUDA_TRY(cuco::detail::memcpy_async(host_sketch.data(), + this->sketch_.data(), + sizeof(register_type) * num_regs, + cudaMemcpyDefault, + stream)); #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) stream.sync(); #else diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 246e3f7ed..b176f6092 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -883,8 +883,8 @@ class open_addressing_impl { stream.get())); size_type temp_count; - cuco::detail::memcpy_async( - &temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream); + CUCO_CUDA_TRY(cuco::detail::memcpy_async( + &temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream)); #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) stream.sync(); #else diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index 6246cdc02..a312f6f96 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include @@ -109,11 +109,11 @@ void static_map::insert( detail::insert <<>>(first, num_keys, num_successes_, view, hash, key_equal); - cuco::detail::memcpy_async(&h_num_successes, - num_successes_, - sizeof(atomic_ctr_type), - cudaMemcpyDeviceToHost, - cuda::stream_ref{stream}); + CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes, + num_successes_, + sizeof(atomic_ctr_type), + cudaMemcpyDeviceToHost, + cuda::stream_ref{stream})); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated @@ -150,11 +150,11 @@ void static_map::insert_if(InputIt first, detail::insert_if_n<<>>( first, num_keys, num_successes_, view, stencil, pred, hash, key_equal); - cuco::detail::memcpy_async(&h_num_successes, - num_successes_, - sizeof(atomic_ctr_type), - cudaMemcpyDeviceToHost, - cuda::stream_ref{stream}); + CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes, + num_successes_, + sizeof(atomic_ctr_type), + cudaMemcpyDeviceToHost, + cuda::stream_ref{stream})); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); size_ += h_num_successes; @@ -185,11 +185,11 @@ void static_map::erase( detail::erase <<>>(first, num_keys, num_successes_, view, hash, key_equal); - cuco::detail::memcpy_async(&h_num_successes, - num_successes_, - sizeof(atomic_ctr_type), - cudaMemcpyDeviceToHost, - cuda::stream_ref{stream}); + CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes, + num_successes_, + sizeof(atomic_ctr_type), + cudaMemcpyDeviceToHost, + cuda::stream_ref{stream})); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated @@ -259,8 +259,8 @@ std::pair static_map::retrieve_a stream); std::size_t h_num_out; - cuco::detail::memcpy_async( - &h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, cuda::stream_ref{stream}); + CUCO_CUDA_TRY(cuco::detail::memcpy_async( + &h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, cuda::stream_ref{stream})); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); temp_allocator.deallocate( reinterpret_cast(d_num_out), sizeof(std::size_t), cuda::stream_ref{stream}); diff --git a/include/cuco/detail/utility/memcpy_async.cuh b/include/cuco/detail/utility/memcpy_async.cuh index 655a94438..fed6d6b3f 100644 --- a/include/cuco/detail/utility/memcpy_async.cuh +++ b/include/cuco/detail/utility/memcpy_async.cuh @@ -33,17 +33,15 @@ namespace cuco::detail { * @param count Number of bytes to copy * @param kind Memory copy direction * @param stream CUDA stream for the operation + * @return cudaError_t Error code from the memory copy operation */ -inline void memcpy_async( +[[nodiscard]] inline cudaError_t memcpy_async( void* dst, void const* src, size_t count, cudaMemcpyKind kind, cuda::stream_ref stream) { - if (dst == nullptr || src == nullptr || count == 0) { return; } + if (dst == nullptr || src == nullptr || count == 0) { return cudaSuccess; } #if CUDART_VERSION >= 12080 - if (stream.get() == 0) { - CUCO_CUDA_TRY(cudaMemcpyAsync(dst, src, count, kind, stream.get())); - return; - } + if (stream.get() == 0) { return cudaMemcpyAsync(dst, src, count, kind, stream.get()); } void* dsts[1] = {dst}; void* srcs[1] = {const_cast(src)}; @@ -55,15 +53,14 @@ inline void memcpy_async( attrs[0].flags = cudaMemcpyFlagPreferOverlapWithCompute; #if CUDART_VERSION >= 13000 - CUCO_CUDA_TRY(cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, stream.get())); + return cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, stream.get()); #else std::size_t fail_idx; - CUCO_CUDA_TRY( - cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, &fail_idx, stream.get())); + return cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, &fail_idx, stream.get()); #endif // CUDART_VERSION >= 13000 #else // CUDA < 12.8 - use regular cudaMemcpyAsync - CUCO_CUDA_TRY(cudaMemcpyAsync(dst, src, count, kind, stream.get())); + return cudaMemcpyAsync(dst, src, count, kind, stream.get()); #endif // CUDART_VERSION >= 12080 } } // namespace cuco::detail From fbe624940377558dc310b5d4bfae228607278cd4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sat, 6 Dec 2025 19:48:54 -0800 Subject: [PATCH 6/6] Use C++ header instead of CUDA header --- .../detail/utility/{memcpy_async.cuh => memcpy_async.hpp} | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) rename include/cuco/detail/utility/{memcpy_async.cuh => memcpy_async.hpp} (95%) diff --git a/include/cuco/detail/utility/memcpy_async.cuh b/include/cuco/detail/utility/memcpy_async.hpp similarity index 95% rename from include/cuco/detail/utility/memcpy_async.cuh rename to include/cuco/detail/utility/memcpy_async.hpp index fed6d6b3f..887c94447 100644 --- a/include/cuco/detail/utility/memcpy_async.cuh +++ b/include/cuco/detail/utility/memcpy_async.hpp @@ -20,6 +20,8 @@ #include +#include + namespace cuco::detail { /** @@ -41,7 +43,7 @@ namespace cuco::detail { if (dst == nullptr || src == nullptr || count == 0) { return cudaSuccess; } #if CUDART_VERSION >= 12080 - if (stream.get() == 0) { return cudaMemcpyAsync(dst, src, count, kind, stream.get()); } + if (stream.get() == nullptr) { return cudaMemcpyAsync(dst, src, count, kind, stream.get()); } void* dsts[1] = {dst}; void* srcs[1] = {const_cast(src)}; @@ -63,4 +65,5 @@ namespace cuco::detail { return cudaMemcpyAsync(dst, src, count, kind, stream.get()); #endif // CUDART_VERSION >= 12080 } + } // namespace cuco::detail