include/cuco/detail/reduction_functor_impl.cuh (new file: 179 additions, 0 deletions)
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cuda/atomic>
#include <cuda/std/type_traits>

namespace cuco {
namespace detail {

/**
* @brief Base class of all reduction functors.
*
* @warning This class should not be used directly.
*
*/
class reduction_functor_base {
};

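/**
 * @brief Functor performing a `+` reduction via `fetch_add` on the given atomic slot.
 *
 * @tparam T The value type used for reduction
 * @tparam Enable SFINAE helper used to select specializations for particular value types
 */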
template <typename T, typename Enable = void>
struct reduce_add_impl {
template <cuda::thread_scope Scope>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
{
return lhs.fetch_add(rhs, cuda::memory_order_relaxed) + rhs;
}
};

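/**
 * @brief Functor performing a `min` reduction via `fetch_min` on the given atomic slot.
 *
 * @tparam T The value type used for reduction
 * @tparam Enable SFINAE helper used to select specializations for particular value types
 */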
template <typename T, typename Enable = void>
struct reduce_min_impl {
template <cuda::thread_scope Scope>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
{
return min(lhs.fetch_min(rhs, cuda::memory_order_relaxed), rhs);
}
};

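/**
 * @brief Functor performing a `max` reduction via `fetch_max` on the given atomic slot.
 *
 * @tparam T The value type used for reduction
 * @tparam Enable SFINAE helper used to select specializations for particular value types
 */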
template <typename T, typename Enable = void>
struct reduce_max_impl {
template <cuda::thread_scope Scope>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
{
return max(lhs.fetch_max(rhs, cuda::memory_order_relaxed), rhs);
}
};

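/**
 * @brief Functor counting the number of reduced values via `fetch_add`; the right-hand-side
 * value is ignored.
 *
 * @tparam T The value type used for reduction
 * @tparam Enable SFINAE helper used to select specializations for particular value types
 */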
template <typename T, typename Enable = void>
struct reduce_count_impl {
template <cuda::thread_scope Scope>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& /* rhs */) const noexcept
{
return lhs.fetch_add(1, cuda::memory_order_relaxed) + 1;
}
};

// WAR: remove the following specializations once libcu++ extends floating-point atomics support
// and fixes signed integer atomics:
// https://github.com/NVIDIA/libcudacxx/pull/286
template <typename T>
struct reduce_add_impl<T, typename cuda::std::enable_if_t<cuda::std::is_floating_point_v<T>>> {
template <cuda::thread_scope Scope,
cuda::std::enable_if_t<Scope == cuda::thread_scope_system, bool> = true>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
{
return atomicAdd_system(reinterpret_cast<T*>(&lhs), rhs) + rhs;
}

template <cuda::thread_scope Scope,
cuda::std::enable_if_t<Scope == cuda::thread_scope_device, bool> = true>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
{
return atomicAdd(reinterpret_cast<T*>(&lhs), rhs) + rhs;
}

template <
cuda::thread_scope Scope,
cuda::std::enable_if_t<Scope != cuda::thread_scope_system && Scope != cuda::thread_scope_device,
bool> = true>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
{
return atomicAdd_block(reinterpret_cast<T*>(&lhs), rhs) + rhs;
}
};

template <typename T>
struct reduce_min_impl<
T,
typename cuda::std::enable_if_t<cuda::std::is_integral_v<T> && cuda::std::is_signed_v<T>>> {
private:
using internal_type = typename cuda::std::conditional_t<sizeof(T) == 8, long long int, int>;

public:
template <cuda::thread_scope Scope,
cuda::std::enable_if_t<Scope == cuda::thread_scope_system, bool> = true>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
{
return min(atomicMin_system(reinterpret_cast<internal_type*>(&lhs), rhs),
static_cast<internal_type>(rhs));
}

template <cuda::thread_scope Scope,
cuda::std::enable_if_t<Scope == cuda::thread_scope_device, bool> = true>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
{
return min(atomicMin(reinterpret_cast<internal_type*>(&lhs), rhs),
static_cast<internal_type>(rhs));
}

template <
cuda::thread_scope Scope,
cuda::std::enable_if_t<Scope != cuda::thread_scope_system && Scope != cuda::thread_scope_device,
bool> = true>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
{
return min(atomicMin_block(reinterpret_cast<internal_type*>(&lhs), rhs),
static_cast<internal_type>(rhs));
}
};

template <typename T>
struct reduce_max_impl<
T,
typename cuda::std::enable_if_t<cuda::std::is_integral_v<T> && cuda::std::is_signed_v<T>>> {
private:
using internal_type = typename cuda::std::conditional_t<sizeof(T) == 8, long long int, int>;

public:
template <cuda::thread_scope Scope,
cuda::std::enable_if_t<Scope == cuda::thread_scope_system, bool> = true>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
{
return max(atomicMax_system(reinterpret_cast<internal_type*>(&lhs), rhs),
static_cast<internal_type>(rhs));
}

template <cuda::thread_scope Scope,
cuda::std::enable_if_t<Scope == cuda::thread_scope_device, bool> = true>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
{
return max(atomicMax(reinterpret_cast<internal_type*>(&lhs), rhs),
static_cast<internal_type>(rhs));
}

template <
cuda::thread_scope Scope,
cuda::std::enable_if_t<Scope != cuda::thread_scope_system && Scope != cuda::thread_scope_device,
bool> = true>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
{
return max(atomicMax_block(reinterpret_cast<internal_type*>(&lhs), rhs),
static_cast<internal_type>(rhs));
}
};

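// Floating-point `min`/`max` have no native atomic read-modify-write support, so the following
// specializations take plain values; `reduction_functor` detects this signature and synchronizes
// them externally via its compare-and-swap loop.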
template <typename T>
struct reduce_min_impl<T, typename cuda::std::enable_if_t<cuda::std::is_floating_point_v<T>>> {
__device__ T operator()(T lhs, T rhs) const noexcept { return min(lhs, rhs); }
};

template <typename T>
struct reduce_max_impl<T, typename cuda::std::enable_if_t<cuda::std::is_floating_point_v<T>>> {
__device__ T operator()(T lhs, T rhs) const noexcept { return max(lhs, rhs); }
};

} // namespace detail
} // namespace cuco
include/cuco/reduction_functors.cuh (new file: 212 additions, 0 deletions)
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cuco/detail/reduction_functor_impl.cuh>

#include <cuda/atomic>
#include <cuda/std/limits>
#include <cuda/std/type_traits>

namespace cuco {

/**
* @brief Wrapper for reduction identity value.
*
* @tparam T The underlying value type used for reduction
*/
template <typename T>
class identity_value {
public:
using type = T;
explicit constexpr identity_value(T const& identity) noexcept : identity_(identity) {}
constexpr T value() const noexcept { return identity_; }

private:
T identity_;
};

/**
* @brief Wrapper for a user-defined custom reduction operator.
*
* If the user-defined functor operates on plain values rather than on a `cuda::atomic`,
* synchronization is established externally via an atomic compare-and-swap loop.
*
* Example:
* \code{.cpp}
* template <typename T>
* struct custom_plus {
* __device__ T operator()(T const& lhs, T const& rhs) const noexcept {
* return lhs + rhs;
* }
* };
*
* template <typename T>
* struct custom_plus_sync {
* template <cuda::thread_scope Scope>
* __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept {
* return lhs.fetch_add(rhs) + rhs;
* }
* };
*
* int main() {
*   // define the identity value for the given reduction operation,
*   // i.e., op(identity, x) == x
* cuco::identity_value<int> identity{0};
*
* // synchronized via CAS loop
* auto f1 = cuco::reduction_functor<custom_plus<int>, int>(identity);
* // implicitly synchronized
* auto f2 = cuco::reduction_functor<custom_plus_sync<int>, int>(identity);
* }
* \endcode
*
* @tparam Func The user-defined reduction functor
* @tparam Value The value type used for reduction
*/
template <typename Func, typename Value>
class reduction_functor : detail::reduction_functor_base {
public:
using value_type = Value;

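/**
 * @brief Constructs a reduction functor from the reduction's identity value and an instance of
 * the user-defined functor.
 *
 * @param identity Identity value of the reduction, i.e., `op(identity, x) == x`
 * @param functor Instance of the user-defined reduction functor
 */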
reduction_functor(cuco::identity_value<Value> identity, Func functor = Func{}) noexcept
: identity_(identity), functor_(functor)
{
}

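/**
 * @brief Atomically combines the value held in `lhs` with `rhs` and returns the reduced result.
 *
 * @tparam Scope CUDA thread scope of the target atomic
 *
 * @param lhs Atomic slot holding the current aggregate
 * @param rhs Value to fold into the aggregate
 * @return The aggregate after the reduction has been applied
 */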
template <cuda::thread_scope Scope>
__device__ value_type operator()(cuda::atomic<value_type, Scope>& lhs,
value_type const& rhs) const noexcept
{
if constexpr (uses_external_sync()) {
value_type old = lhs.load(cuda::memory_order_relaxed);
value_type desired;

do {
desired = functor_(old, rhs);
} while (not lhs.compare_exchange_weak(
old, desired, cuda::memory_order_release, cuda::memory_order_relaxed));

return desired;
}
if constexpr (not uses_external_sync()) { return functor_(lhs, rhs); }
}

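/**
 * @brief Returns the identity value of the reduction, i.e., `op(identity, x) == x`.
 */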
__host__ __device__ value_type identity() const noexcept { return identity_.value(); }

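/**
 * @brief Indicates whether the functor is synchronized externally via a compare-and-swap loop
 * (`true`) or synchronizes internally, e.g., via atomic read-modify-write operations (`false`).
 */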
__host__ __device__ static constexpr bool uses_external_sync() noexcept
{
return not atomic_invocable_ or naive_invocable_;
}

private:
cuco::identity_value<value_type> identity_;
Func functor_;

static constexpr bool naive_invocable_ =
cuda::std::is_invocable_r_v<value_type, Func, value_type, value_type>;
static constexpr bool atomic_invocable_ =
cuda::std::is_invocable_r_v<value_type,
Func,
cuda::atomic<value_type, cuda::thread_scope_system>&,
value_type> or
cuda::std::is_invocable_r_v<value_type,
Func,
cuda::atomic<value_type, cuda::thread_scope_device>&,
value_type> or
cuda::std::is_invocable_r_v<value_type,
Func,
cuda::atomic<value_type, cuda::thread_scope_block>&,
value_type> or
cuda::std::is_invocable_r_v<value_type,
Func,
cuda::atomic<value_type, cuda::thread_scope_thread>&,
value_type>;
static constexpr bool atomic_const_invocable_ =
cuda::std::is_invocable_r_v<value_type,
Func,
cuda::atomic<value_type, cuda::thread_scope_system> const&,
value_type> or
cuda::std::is_invocable_r_v<value_type,
Func,
cuda::atomic<value_type, cuda::thread_scope_device> const&,
value_type> or
cuda::std::is_invocable_r_v<value_type,
Func,
cuda::atomic<value_type, cuda::thread_scope_block> const&,
value_type> or
cuda::std::is_invocable_r_v<value_type,
Func,
cuda::atomic<value_type, cuda::thread_scope_thread> const&,
value_type>;

static_assert((atomic_invocable_ and not atomic_const_invocable_) or naive_invocable_,
"Invalid operator signature. Valid signatures are "
"(T const&, T const&)->T and (cuda::atomic<T, Scope>&, T const&)->T.");
static_assert(!(__nv_is_extended_device_lambda_closure_type(Func) or
__nv_is_extended_host_device_lambda_closure_type(Func)),
"Extended __device__/__host__ __device__ lambdas are not supported."
" Use a named function object instead.");
};

/**
* @brief Synchronized `+` reduction functor.
*
* @tparam T The value type used for reduction
*/
template <typename T>
auto reduce_add()
{
return reduction_functor(identity_value<T>{0}, detail::reduce_add_impl<T>{});
}

/**
* @brief Synchronized `min` reduction functor.
*
* @tparam T The value type used for reduction
*/
template <typename T>
auto reduce_min()
{
return reduction_functor(identity_value{cuda::std::numeric_limits<T>::max()},
detail::reduce_min_impl<T>{});
}

/**
* @brief Synchronized `max` reduction functor.
*
* @tparam T The value type used for reduction
*/
template <typename T>
auto reduce_max()
{
return reduction_functor(identity_value{cuda::std::numeric_limits<T>::lowest()},
detail::reduce_max_impl<T>{});
}

/**
* @brief Synchronized `count` reduction functor.
*
* @tparam T The value type used for reduction
*/
template <typename T>
auto reduce_count()
{
return reduction_functor(identity_value<T>{0}, detail::reduce_count_impl<T>{});
}
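
// Example (an illustrative sketch only, not part of this header): the pre-defined functors are
// constructed on the host and passed to device code by value. The kernel `sum_kernel` and its
// launch configuration below are hypothetical.
//
//   template <typename Functor>
//   __global__ void sum_kernel(Functor functor,
//                              cuda::atomic<int, cuda::thread_scope_device>* sum,
//                              int const* values,
//                              std::size_t n)
//   {
//     auto const tid = blockDim.x * blockIdx.x + threadIdx.x;
//     if (tid < n) { functor(*sum, values[tid]); }  // relaxed-order atomic `+`
//   }
//
//   // host side:
//   // sum_kernel<<<grid, block>>>(cuco::reduce_add<int>(), sum, values, n);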

} // namespace cuco