From 836e77adf94182048c769d23a3256139ed4cd5dc Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 24 Jan 2024 00:58:40 +0000 Subject: [PATCH 01/78] First draft --- examples/CMakeLists.txt | 1 + .../host_bulk_example.cu | 61 + .../distinct_count_estimator.inl | 106 + .../distinct_count_estimator_ref.inl | 57 + include/cuco/detail/hyperloglog/finalizer.cuh | 79 + .../cuco/detail/hyperloglog/hyperloglog.cuh | 196 ++ .../detail/hyperloglog/hyperloglog_ref.cuh | 189 ++ include/cuco/detail/hyperloglog/kernels.cuh | 75 + include/cuco/detail/hyperloglog/storage.cuh | 24 + include/cuco/detail/hyperloglog/tuning.cuh | 2577 +++++++++++++++++ include/cuco/distinct_count_estimator.cuh | 94 + include/cuco/distinct_count_estimator_ref.cuh | 61 + 12 files changed, 3520 insertions(+) create mode 100644 examples/distinct_count_estimator/host_bulk_example.cu create mode 100644 include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl create mode 100644 include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl create mode 100644 include/cuco/detail/hyperloglog/finalizer.cuh create mode 100644 include/cuco/detail/hyperloglog/hyperloglog.cuh create mode 100644 include/cuco/detail/hyperloglog/hyperloglog_ref.cuh create mode 100644 include/cuco/detail/hyperloglog/kernels.cuh create mode 100644 include/cuco/detail/hyperloglog/storage.cuh create mode 100644 include/cuco/detail/hyperloglog/tuning.cuh create mode 100644 include/cuco/distinct_count_estimator.cuh create mode 100644 include/cuco/distinct_count_estimator_ref.cuh diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a3d0ae247..f6e753cf2 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -41,3 +41,4 @@ ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/sta ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu") 
ConfigureExample(STATIC_MAP_COUNT_BY_KEY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/count_by_key_example.cu") ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/host_bulk_example.cu") +ConfigureExample(DISTINCT_COUNT_ESTIMATOR_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/distinct_count_estimator/host_bulk_example.cu") diff --git a/examples/distinct_count_estimator/host_bulk_example.cu b/examples/distinct_count_estimator/host_bulk_example.cu new file mode 100644 index 000000000..18085e72f --- /dev/null +++ b/examples/distinct_count_estimator/host_bulk_example.cu @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#include
+
+#include
+#include
+
+#include
+#include
+
+// Example: feeds a device vector of distinct integers to a
+// cuco::distinct_count_estimator, then prints the true and estimated
+// cardinality, the relative error, and the measured throughput.
+// NOTE(review): the return codes of the cudaEvent* calls below are not checked.
+int main()
+{
+  using T = int;
+  std::size_t constexpr num_items = 1ull << 30; // 4GB (2^30 items, assuming a 4-byte T)
+
+  thrust::device_vector items(num_items);
+  // create a vector of distinct items
+  thrust::sequence(items.begin(), items.end(), 0);
+
+  // events bracket the add + estimate calls so both are included in the timing
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  cuco::distinct_count_estimator estimator;
+  cudaEventRecord(start);
+  // add all items to the estimator
+  estimator.add(items.begin(), items.end());
+  // after the estimator has seen all items, we can calculate the cardinality
+  std::size_t const estimated_cardinality = estimator.estimate();
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+
+  float milliseconds = 0;
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  // 1073741824 = 2^30 bytes, so the size is reported in binary gigabytes
+  float input_size_gb = num_items * sizeof(T) / 1073741824.0f;
+  float throughput = input_size_gb / (milliseconds / 1000.0f);
+
+  std::cout << "True cardinality:\t" << num_items << "\nEstimated cardinality:\t"
+            << estimated_cardinality << "\nRelative error:\t"
+            << abs(static_cast(num_items) - static_cast(estimated_cardinality)) /
+                 num_items
+            << "\nData size:\t" << input_size_gb << "GB"
+            << "\nElapsed time:\t" << milliseconds << "ms"
+            << "\nMemory throughput\t" << throughput << "GB/s" << std::endl;
+
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+}
\ No newline at end of file
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
new file mode 100644
index 000000000..7013bc956
--- /dev/null
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Out-of-line member definitions of cuco::distinct_count_estimator.
+// Every member below forwards to the implementation object held in `impl_`.
+namespace cuco {
+
+// Constructs the implementation object from the given scope, hash, allocator and stream.
+template
+constexpr distinct_count_estimator::distinct_count_estimator(
+  cuco::cuda_thread_scope scope,
+  Hash const& hash,
+  Allocator const& alloc,
+  cuco::cuda_stream_ref stream)
+  : impl_{std::make_unique(scope, hash, alloc, stream)}
+{
+}
+
+// Forwards to impl_->clear_async on `stream`.
+template
+void distinct_count_estimator::clear_async(
+  cuco::cuda_stream_ref stream) noexcept
+{
+  this->impl_->clear_async(stream);
+}
+
+// Forwards to impl_->clear on `stream`.
+template
+void distinct_count_estimator::clear(
+  cuco::cuda_stream_ref stream)
+{
+  this->impl_->clear(stream);
+}
+
+// Forwards the range [first, last) to impl_->add_async on `stream`.
+template
+template
+void distinct_count_estimator::add_async(
+  InputIt first, InputIt last, cuco::cuda_stream_ref stream) noexcept
+{
+  this->impl_->add_async(first, last, stream);
+}
+
+// Forwards the range [first, last) to impl_->add on `stream`.
+template
+template
+void distinct_count_estimator::add(
+  InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+{
+  this->impl_->add(first, last, stream);
+}
+
+// Forwards `other` to impl_->merge_async.
+// NOTE(review): `other` is the wrapper object, not its implementation; confirm that the
+// impl overloads accept it (the impl object of `other` may be what has to be passed).
+template
+template
+void distinct_count_estimator::merge_async(
+  distinct_count_estimator const& other,
+  cuco::cuda_stream_ref stream) noexcept
+{
+  this->impl_->merge_async(other, stream);
+}
+
+// Forwards `other` to impl_->merge.
+// NOTE(review): same question as for merge_async above - verify the argument type.
+template
+template
+void distinct_count_estimator::merge(
+  distinct_count_estimator const& other,
+  cuco::cuda_stream_ref stream)
+{
+  this->impl_->merge(other, stream);
+}
+
+// Forwards the non-owning reference `other` to impl_->merge_async.
+// NOTE(review): presumably ref_type must be convertible to the impl's ref type; verify.
+template
+template
+void distinct_count_estimator::merge_async(
+  ref_type const& other, cuco::cuda_stream_ref stream) noexcept
+{
+  this->impl_->merge_async(other, stream);
+}
+
+// Forwards the non-owning reference `other` to impl_->merge.
+template
+template
+void distinct_count_estimator::merge(
+  ref_type const& other, cuco::cuda_stream_ref stream)
+{
+  this->impl_->merge(other, stream);
+}
+
+// Returns whatever impl_->estimate computes on `stream`.
+template
+std::size_t distinct_count_estimator::estimate(
+  cuco::cuda_stream_ref stream) const
+{
+  return this->impl_->estimate(stream);
+}
+
+// Returns the reference object produced by impl_->ref().
+template
+typename distinct_count_estimator::ref_type<>
+distinct_count_estimator::ref() const noexcept
+{
+  return this->impl_->ref();
+}
+} // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
new file mode 100644
index 000000000..1359033d0
--- /dev/null
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Out-of-line member definitions of cuco::distinct_count_estimator_ref.
+// Every member below forwards to the implementation object `impl_`.
+namespace cuco {
+
+// Builds the implementation object from the register storage, scope and hash.
+template
+__host__ __device__ constexpr distinct_count_estimator_ref::
+  distinct_count_estimator_ref(storage_type& storage,
+                               cuco::cuda_thread_scope scope,
+                               Hash const& hash) noexcept
+  : impl_{storage, scope, hash}
+{
+}
+
+// Forwards to impl_.clear using the threads of `group`.
+template
+template
+__device__ void distinct_count_estimator_ref::clear(
+  CG const& group) noexcept
+{
+  this->impl_.clear(group);
+}
+
+// Forwards a single item to impl_.add.
+template
+__device__ void distinct_count_estimator_ref::add(T const& item) noexcept
+{
+  this->impl_.add(item);
+}
+
+// Forwards `other` to impl_.merge using the threads of `group`.
+// NOTE(review): `other` is the wrapper ref, not its impl; confirm impl_.merge accepts it.
+template
+template
+__device__ void distinct_count_estimator_ref::merge(
+  CG const& group,
+  distinct_count_estimator_ref const& other) noexcept
+{
+  this->impl_.merge(group, other);
+}
+
+// Returns the estimate computed by impl_.estimate for the calling thread block.
+template
+__device__ std::size_t distinct_count_estimator_ref::estimate(
+  cooperative_groups::thread_block const& group) const noexcept
+{
+  // The result must be returned explicitly: flowing off the end of a function
+  // declared to return std::size_t is undefined behavior.
+  return this->impl_.estimate(group);
+}
+} // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/detail/hyperloglog/finalizer.cuh b/include/cuco/detail/hyperloglog/finalizer.cuh
new file mode 100644
index 000000000..9f5c9a20d
--- /dev/null
+++ b/include/cuco/detail/hyperloglog/finalizer.cuh
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include
+
+#include
+
+namespace cuco::hyperloglog_ns::detail {
+// Converts the accumulated register statistics of a sketch with 2^Precision
+// registers into a rounded cardinality estimate.
+template
+class finalizer {
+  // this minimum number of registers is required by HLL++
+  static_assert(Precision >= 4, "Precision must be greater or equal to 4");
+
+ public:
+  // z: sum over all registers of 1 / 2^register (callers pass their accumulated sum)
+  // v: number of registers that are still zero
+  // Returns the estimate rounded to the nearest whole number.
+  __host__ __device__ static double constexpr finalize(double z, int v) noexcept
+  {
+    // raw estimate before any small-range handling
+    auto e = alpha_mm() / z;
+    // TODO remove test code
+    // printf("raw e: %lf\n", e);
+
+    if (v > 0) {
+      // Use linear counting for small cardinality estimates.
+      double const h = m * log(static_cast(m) / v);
+      // HLL++ is defined only when p < 19, otherwise we need to fallback to HLL.
+      // The threshold `2.5 * m` is from the original HLL algorithm.
+      // NOTE(review): as written, the `e <= 2.5 * m` test also applies when
+      // Precision < 19 - confirm that this is intended.
+      if ((Precision < 19 and h <= thresholds[Precision - 4]) or e <= 2.5 * m) {
+        e = h;
+      } else {
+        e = bias_corrected_estimate(e);
+      }
+    } else {
+      // no empty registers left, so linear counting cannot be applied
+      e = bias_corrected_estimate(e);
+    }
+
+    return cuda::std::round(e);
+  }
+
+ private:
+  // number of registers
+  static auto constexpr m = (1 << Precision);
+
+  // Returns the normalization constant multiplied by m^2; the constant depends on m.
+  __host__ __device__ static double constexpr alpha_mm() noexcept
+  {
+    if constexpr (m == 16) {
+      return 0.673 * m * m;
+    } else if constexpr (m == 32) {
+      return 0.697 * m * m;
+    } else if constexpr (m == 64) {
+      return 0.709 * m * m;
+    } else {
+      return (0.7213 / (1.0 + 1.079 / m)) * m * m;
+    }
+  }
+
+  // Subtracts bias(e) from estimates below 5 * m when Precision < 19;
+  // every other estimate is returned unchanged.
+  __host__ __device__ static double constexpr bias_corrected_estimate(double e) noexcept
+  {
+    if constexpr (Precision < 19) {
+      if (e < 5.0 * m) { return e - bias(e); }
+    }
+    return e;
+  }
+
+  // TODO implement HLL++ bias correction
+  // Placeholder: currently evaluates to `e * 0`, so no correction is applied.
+  __host__ __device__ static double constexpr bias(double e) noexcept { return e * 0; }
+};
+} // namespace cuco::hyperloglog_ns::detail
\ No newline at end of file
diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
new file mode 100644
index 000000000..bd3871261
--- /dev/null
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+namespace cuco::detail {
+// Owning host-side HyperLogLog sketch. It allocates a single `storage_type`
+// object through the rebound allocator and launches the clear/add/merge
+// kernels on the stream passed to each member function.
+template
+class hyperloglog {
+ public:
+  static constexpr auto thread_scope = Scope; ///< CUDA thread scope
+  static constexpr auto precision = Precision; ///< Precision the sketch was instantiated with
+
+  using allocator_type = Allocator; ///< Allocator type
+  using storage_type = detail::hyperloglog_storage; ///< Register storage type
+  using storage_allocator_type =
+    typename std::allocator_traits::template rebind_alloc; ///< Allocator rebound to the storage
+
+  template
+  using ref_type = hyperloglog_ref; ///< Non-owning reference type
+
+  // Allocates the register storage and enqueues a clear on `stream`.
+  // The constructor does not wait for the clear to finish.
+  constexpr hyperloglog(cuco::cuda_thread_scope,
+                        Hash const& hash,
+                        Allocator const& alloc,
+                        cuco::cuda_stream_ref stream)
+    : hash_{hash},
+      storage_allocator_{alloc},
+      storage_deleter_{storage_allocator_},
+      storage_{storage_allocator_.allocate(1ull), storage_deleter_}
+  {
+    this->clear_async(stream); // TODO async or sync?
+  }
+
+  hyperloglog(hyperloglog const&) = delete;
+  hyperloglog& operator=(hyperloglog const&) = delete;
+  hyperloglog(hyperloglog&&) = default;
+  hyperloglog& operator=(hyperloglog&&) = default;
+  ~hyperloglog() = default;
+
+  // Enqueues the clear kernel (one block of 1024 threads) on `stream`.
+  void clear_async(cuco::cuda_stream_ref stream) noexcept
+  {
+    auto constexpr block_size = 1024;
+    cuco::hyperloglog_ns::detail::clear<<<1, block_size, 0, stream>>>(this->ref());
+  }
+
+  // Same as clear_async, followed by a synchronization of `stream`.
+  void clear(cuco::cuda_stream_ref stream)
+  {
+    this->clear_async(stream);
+    stream.synchronize();
+  }
+
+  // Enqueues the add_shmem kernel for the items in [first, last).
+  // Returns without launching anything when the range is empty.
+  template
+  void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream) noexcept
+  {
+    auto const num_items = cuco::detail::distance(first, last); // TODO include
+    if (num_items == 0) { return; }
+
+    // TODO fallback to local memory registers in case they don't fit in shmem
+
+    int grid_size = 0;
+    int block_size = 0;
+    // TODO check cuda error?
+    cudaOccupancyMaxPotentialBlockSize(
+      &grid_size, &block_size, &cuco::hyperloglog_ns::detail::add_shmem>);
+
+    cuco::hyperloglog_ns::detail::add_shmem<<>>(
+      first, num_items, this->ref());
+  }
+
+  // Same as add_async, followed by a synchronization of `stream`.
+  template
+  void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+  {
+    this->add_async(first, last, stream);
+    stream.synchronize();
+  }
+
+  // Merges the registers of another owning sketch into this one (asynchronous).
+  template
+  void merge_async(hyperloglog const& other,
+                   cuco::cuda_stream_ref stream = {}) noexcept
+  {
+    this->merge_async(other.ref(), stream);
+  }
+
+  // Same as the merge_async overload above, followed by a synchronization of `stream`.
+  template
+  void merge(hyperloglog const& other,
+             cuco::cuda_stream_ref stream = {})
+  {
+    this->merge_async(other, stream);
+    stream.synchronize();
+  }
+
+  // Enqueues the merge kernel (one block of 1024 threads) that folds the
+  // registers referenced by `other` into this sketch.
+  template
+  void merge_async(ref_type const& other, cuco::cuda_stream_ref stream = {}) noexcept
+  {
+    auto constexpr block_size = 1024;
+    cuco::hyperloglog_ns::detail::merge<<<1, block_size, 0, stream>>>(other, this->ref());
+  }
+
+  // Same as the merge_async overload above, followed by a synchronization of `stream`.
+  template
+  void merge(ref_type const& other, cuco::cuda_stream_ref stream = {})
+  {
+    this->merge_async(other, stream);
+    stream.synchronize();
+  }
+
+  // Copies the registers to the host, accumulates the finalizer inputs there
+  // and returns the finalized estimate. Synchronizes `stream`.
+  [[nodiscard]] std::size_t estimate(cuco::cuda_stream_ref stream) const
+  {
+    // TODO this function currently copies the registers to the host and then finalizes the result;
+    // move computation to device? Edit: host computation is faster -.-
+    storage_type registers;
+    // TODO check if storage is host accessible
+    CUCO_CUDA_TRY(cudaMemcpyAsync(
+      &registers, this->storage_.get(), sizeof(storage_type), cudaMemcpyDeviceToHost, stream));
+    stream.synchronize();
+
+    using fp_type = typename ref_type<>::fp_type;
+    fp_type sum = 0;
+    int zeroes = 0;
+    // accumulate the sum of 2^-register and count the registers that are still 0
+    for (std::size_t i = 0; i < registers.size(); ++i) {
+      auto const reg = registers[i];
+      // Shift a 64-bit one: a register may hold a value above 30 when the hash is
+      // wider than 32 bits, and shifting a 32-bit int that far is undefined behavior.
+      sum += fp_type{1} / static_cast(1ull << reg);
+      zeroes += reg == 0;
+    }
+
+    // pass intermediate result to finalizer for bias correction, etc.
+    return cuco::hyperloglog_ns::detail::finalizer::finalize(sum, zeroes);
+  }
+
+  // Returns a non-owning reference to the register storage that shares this object's hash.
+  [[nodiscard]] ref_type<> ref() const noexcept
+  {
+    return ref_type<>{*(this->storage_.get()), {}, this->hash_};
+  }
+
+ private:
+  // Deleter that returns the storage to the allocator.
+  // It keeps its own copy of the allocator: a reference into the owning object
+  // would dangle once that object has been moved from and destroyed, and a
+  // reference member would also delete the defaulted move assignment above.
+  struct storage_deleter {
+    using pointer = typename storage_allocator_type::value_type*;
+
+    storage_deleter(storage_allocator_type const& a) : allocator{a} {}
+
+    storage_deleter(storage_deleter const&) = default;
+
+    void operator()(pointer ptr) { allocator.deallocate(ptr, 1); }
+
+    storage_allocator_type allocator;
+  };
+
+  Hash hash_; ///< Hash function applied to every added item
+  storage_allocator_type storage_allocator_; ///< Allocator used for the register storage
+  storage_deleter storage_deleter_; ///< Deleter handed to `storage_`
+  std::unique_ptr storage_; ///< Owning pointer to the register storage
+
+  template
+  friend class hyperloglog;
+};
+} // namespace cuco::detail
\ No newline at end of file
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
new file mode 100644
index 000000000..ba9333f95
--- /dev/null
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+namespace cuco::detail {
+// Non-owning device-side view of a HyperLogLog register array.
+// The `Scope` parameter selects which flavor of atomicMax is used for updates.
+template
+class hyperloglog_ref {
+ public:
+  using fp_type = float; ///< Floating point type used to accumulate the register sum
+  static constexpr auto thread_scope = Scope; ///< CUDA thread scope
+  static constexpr auto precision = Precision; ///< Precision the ref was instantiated with
+
+  using storage_type = hyperloglog_storage; ///< Register storage type
+  template
+  using with_scope = hyperloglog_ref; ///< Same ref type with a different thread scope
+
+  // Binds the ref to `storage` and stores a copy of `hash`.
+  __host__ __device__ constexpr hyperloglog_ref(storage_type& storage,
+                                                cuco::cuda_thread_scope = {},
+                                                Hash const& hash = {}) noexcept
+    : hash_{hash}, storage_{storage}
+  {
+  }
+
+  // Sets every register to zero; the threads of `group` share the work.
+  // The caller is responsible for synchronizing before the registers are used.
+  template
+  __device__ void clear(CG const& group) noexcept
+  {
+    // TODO consider int4-vectorized stores
+    for (int i = group.thread_rank(); i < this->storage_.size(); i += group.size()) {
+      this->storage_[i] = 0;
+    }
+  }
+
+  // Hashes `item` and raises the selected register to the observed rank.
+  __device__ void add(T const& item) noexcept
+  {
+    // static_assert NumBuckets is not too big
+    auto constexpr register_mask = (1 << Precision) - 1;
+    auto const h = this->hash_(item);
+    // the low `Precision` bits of the hash select the register
+    auto const reg = h & register_mask;
+    // or-ing in the mask bounds the leading-zero count by the index bits
+    auto const zeroes = cuda::std::countl_zero(h | register_mask) + 1; // __clz
+
+    if constexpr (Scope == cuda::thread_scope_thread) {
+      this->storage_[reg] = max(this->storage_[reg], zeroes);
+    } else if constexpr (Scope == cuda::thread_scope_block) {
+      atomicMax_block(&(this->storage_[reg]), zeroes);
+    } else if constexpr (Scope == cuda::thread_scope_device) {
+      atomicMax(&(this->storage_[reg]), zeroes);
+    } else if constexpr (Scope == cuda::thread_scope_system) {
+      atomicMax_system(&(this->storage_[reg]), zeroes);
+    } else {
+      static_assert(cuco::dependent_false, "Unsupported thread scope");
+    }
+  }
+
+  // Folds the registers of `other` into this ref by taking the element-wise
+  // maximum; the threads of `group` share the work.
+  template
+  __device__ void merge(CG const& group,
+                        hyperloglog_ref const& other) noexcept
+  {
+    // TODO consider int4-vectorized loads of the source registers
+    for (int i = group.thread_rank(); i < this->storage_.size(); i += group.size()) {
+      if constexpr (Scope == cuda::thread_scope_thread) {
+        this->storage_[i] = max(this->storage_[i], other.storage_[i]);
+      } else if constexpr (Scope == cuda::thread_scope_block) {
+        atomicMax_block(this->storage_.data() + i, other.storage_[i]);
+      } else if constexpr (Scope == cuda::thread_scope_device) {
+        atomicMax(this->storage_.data() + i, other.storage_[i]);
+      } else if constexpr (Scope == cuda::thread_scope_system) {
+        atomicMax_system(this->storage_.data() + i, other.storage_[i]);
+      } else {
+        static_assert(cuco::dependent_false, "Unsupported thread scope");
+      }
+    }
+  }
+
+  // Computes the estimate cooperatively with all threads of the block `group`
+  // and returns the same value to every thread.
+  // NOTE(review): the 32-wide tiles below assume a block size that is a
+  // multiple of 32 - TODO confirm for all callers.
+  [[nodiscard]] __device__ std::size_t estimate(
+    cooperative_groups::thread_block const& group) const noexcept
+  {
+    __shared__ cuda::atomic block_sum;
+    __shared__ cuda::atomic block_zeroes;
+    __shared__ std::size_t estimate;
+
+    // __shared__ variables are not initialized on entry, so the accumulators
+    // have to be reset before the reductions add to them
+    if (group.thread_rank() == 0) {
+      block_sum.store(0, cuda::std::memory_order_relaxed);
+      block_zeroes.store(0, cuda::std::memory_order_relaxed);
+    }
+    group.sync();
+
+    // a warp
+    auto const tile = cooperative_groups::tiled_partition<32>(group);
+
+    fp_type thread_sum = 0;
+    int thread_zeroes = 0;
+    for (int i = group.thread_rank(); i < this->storage_.size(); i += group.size()) {
+      auto const reg = this->storage_[i];
+      // Shift a 64-bit one: a register may hold a value above 30 when the hash is
+      // wider than 32 bits, and shifting a 32-bit int that far is undefined behavior.
+      thread_sum += fp_type{1} / static_cast(1ull << reg);
+      thread_zeroes += reg == 0;
+    }
+
+    // CG reduce Z and V
+    cooperative_groups::reduce_update_async(
+      tile, block_sum, thread_sum, cooperative_groups::plus());
+    cooperative_groups::reduce_update_async(
+      tile, block_zeroes, thread_zeroes, cooperative_groups::plus());
+    group.sync();
+
+    // a single thread finalizes and publishes the result through shared memory
+    if (group.thread_rank() == 0) {
+      auto const z = block_sum.load(cuda::std::memory_order_relaxed);
+      auto const v = block_zeroes.load(cuda::std::memory_order_relaxed);
+      estimate = cuco::hyperloglog_ns::detail::finalizer::finalize(z, v);
+    }
+    group.sync();
+
+    return estimate;
+  }
+
+ private:
+  Hash hash_; ///< Hash function applied to every added item
+  storage_type& storage_; // TODO is a reference the right choice here??
+
+  template
+  friend class hyperloglog_ref;
+};
+} // namespace cuco::detail
\ No newline at end of file
diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh
new file mode 100644
index 000000000..70064abcc
--- /dev/null
+++ b/include/cuco/detail/hyperloglog/kernels.cuh
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include
+#include
+
+#include
+
+#include
+
+namespace cuco::hyperloglog_ns::detail {
+
+// Clears the registers behind `ref`. Only the block with index 0 does any work.
+template
+__global__ void clear(RefType ref)
+{
+  auto const block = cooperative_groups::this_thread_block();
+  if (block.group_index().x == 0) { ref.clear(block); }
+}
+
+// Adds `n` items starting at `first` to the sketch behind `ref`.
+// Each block accumulates into a private register array in shared memory and
+// merges it into `ref` once at the end.
+template
+__global__ void add_shmem(InputIt first, cuco::detail::index_type n, RefType ref)
+{
+  using local_ref_type = typename RefType::with_scope;
+
+  __shared__ typename local_ref_type::storage_type local_storage;
+
+  auto const loop_stride = cuco::detail::grid_stride();
+  auto idx = cuco::detail::global_thread_id();
+  auto const block = cooperative_groups::this_thread_block();
+
+  local_ref_type local_ref(local_storage);
+  local_ref.clear(block);
+  // the block-local registers must be fully zeroed before any thread adds to them
+  block.sync();
+
+  while (idx < n) {
+    local_ref.add(*(first + idx));
+    idx += loop_stride;
+  }
+  // all adds of this block must have landed before the registers are merged
+  block.sync();
+
+  ref.merge(block, local_ref);
+}
+
+// Merges the registers behind `other_ref` into `ref`. Only the block with index 0 does any work.
+template
+__global__ void merge(OtherRefType other_ref, RefType ref)
+{
+  auto const block = cooperative_groups::this_thread_block();
+  if (block.group_index().x == 0) { ref.merge(block, other_ref); }
+}
+
+// TODO this kernel currently isn't being used
+// Computes the estimate in the block with index 0; its thread 0 writes the result to `cardinality`.
+template
+__global__ void estimate(std::size_t* cardinality, RefType ref)
+{
+  auto const block = cooperative_groups::this_thread_block();
+  if (block.group_index().x == 0) {
+    auto const estimate = ref.estimate(block);
+    if (block.thread_rank() == 0) { *cardinality = estimate; }
+  }
+}
+} // namespace cuco::hyperloglog_ns::detail
\ No newline at end of file
diff --git a/include/cuco/detail/hyperloglog/storage.cuh b/include/cuco/detail/hyperloglog/storage.cuh
new file mode 100644
index 000000000..195bdbe1c
--- /dev/null
+++ b/include/cuco/detail/hyperloglog/storage.cuh
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include
+
+namespace cuco::detail {
+// Register array of a sketch, aligned to the size of four ints.
+template
+struct alignas(sizeof(int) * 4) hyperloglog_storage
+  : public cuda::std::array {};
+} // namespace cuco::detail
diff --git a/include/cuco/detail/hyperloglog/tuning.cuh b/include/cuco/detail/hyperloglog/tuning.cuh
new file mode 100644
index 000000000..f49e43e24
--- /dev/null
+++ b/include/cuco/detail/hyperloglog/tuning.cuh
@@ -0,0 +1,2577 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include
+
+namespace cuco::hyperloglog_ns::detail {
+
+// TODO this will spawn one copy of each array in every TU :(
+// TODO use float instead of double?
+// TODO use __constant__?
+// Declaration prefix shared by all tuning arrays in this file. It can be
+// overridden by defining CUCO_HLL_TUNING_ARR_DECL before this header is included.
+#ifndef CUCO_HLL_TUNING_ARR_DECL
+#define CUCO_HLL_TUNING_ARR_DECL __device__ static cuda::std::array constexpr
+#endif
+
+// One threshold per supported precision, 15 entries in total. The finalizer
+// reads entry `Precision - 4` and compares the linear-counting estimate against it.
+CUCO_HLL_TUNING_ARR_DECL thresholds{10.0,
+                                    20.0,
+                                    40.0,
+                                    80.0,
+                                    220.0,
+                                    400.0,
+                                    900.0,
+                                    1800.0,
+                                    3100.0,
+                                    6500.0,
+                                    15500.0,
+                                    20000.0,
+                                    50000.0,
+                                    120000.0,
+                                    350000.0};
+
+// HLL++ uses an interpolation method over the raw estimated cardinality to select the optimal bias.
+// Parameters/interpolation points taken from +// https://docs.google.com/document/d/1gyjfMHy43U9OWBXxfaeG-3MjGzejW1dlpyMwEYAAWEI/mobilebasic +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p4{ + 11.0, 11.717, 12.207, 12.7896, 13.2882, 13.8204, 14.3772, 14.9342, 15.5202, 16.161, + 16.7722, 17.4636, 18.0396, 18.6766, 19.3566, 20.0454, 20.7936, 21.4856, 22.2666, 22.9946, + 23.766, 24.4692, 25.3638, 26.0764, 26.7864, 27.7602, 28.4814, 29.433, 30.2926, 31.0664, + 31.9996, 32.7956, 33.5366, 34.5894, 35.5738, 36.2698, 37.3682, 38.0544, 39.2342, 40.0108, + 40.7966, 41.9298, 42.8704, 43.6358, 44.5194, 45.773, 46.6772, 47.6174, 48.4888, 49.3304, + 50.2506, 51.4996, 52.3824, 53.3078, 54.3984, 55.5838, 56.6618, 57.2174, 58.3514, 59.0802, + 60.1482, 61.0376, 62.3598, 62.8078, 63.9744, 64.914, 65.781, 67.1806, 68.0594, 68.8446, + 69.7928, 70.8248, 71.8324, 72.8598, 73.6246, 74.7014, 75.393, 76.6708, 77.2394}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p5{ + 23.0, 23.1194, 23.8208, 24.2318, 24.77, 25.2436, 25.7774, 26.2848, 26.8224, + 27.3742, 27.9336, 28.503, 29.0494, 29.6292, 30.2124, 30.798, 31.367, 31.9728, + 32.5944, 33.217, 33.8438, 34.3696, 35.0956, 35.7044, 36.324, 37.0668, 37.6698, + 38.3644, 39.049, 39.6918, 40.4146, 41.082, 41.687, 42.5398, 43.2462, 43.857, + 44.6606, 45.4168, 46.1248, 46.9222, 47.6804, 48.447, 49.3454, 49.9594, 50.7636, + 51.5776, 52.331, 53.19, 53.9676, 54.7564, 55.5314, 56.4442, 57.3708, 57.9774, + 58.9624, 59.8796, 60.755, 61.472, 62.2076, 63.1024, 63.8908, 64.7338, 65.7728, + 66.629, 67.413, 68.3266, 69.1524, 70.2642, 71.1806, 72.0566, 72.9192, 73.7598, + 74.3516, 75.5802, 76.4386, 77.4916, 78.1524, 79.1892, 79.8414, 80.8798, 81.8376, + 82.4698, 83.7656, 84.331, 85.5914, 86.6012, 87.7016, 88.5582, 89.3394, 90.3544, + 91.4912, 92.308, 93.3552, 93.9746, 95.2052, 95.727, 97.1322, 98.3944, 98.7588, + 100.242, 101.1914, 102.2538, 102.8776, 103.6292, 105.1932, 105.9152, 107.0868, 107.6728, + 108.7144, 110.3114, 110.8716, 111.245, 112.7908, 113.7064, 
114.636, 115.7464, 116.1788, + 117.7464, 118.4896, 119.6166, 120.5082, 121.7798, 122.9028, 123.4426, 124.8854, 125.705, + 126.4652, 128.3464, 128.3462, 130.0398, 131.0342, 131.0042, 132.4766, 133.511, 134.7252, + 135.425, 136.5172, 138.0572, 138.6694, 139.3712, 140.8598, 141.4594, 142.554, 143.4006, + 144.7374, 146.1634, 146.8994, 147.605, 147.9304, 149.1636, 150.2468, 151.5876, 152.2096, + 153.7032, 154.7146, 155.807, 156.9228, 157.0372, 158.5852}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p6{ + 46.0, 46.1902, 47.271, 47.8358, 48.8142, 49.2854, 50.317, 51.354, 51.8924, + 52.9436, 53.4596, 54.5262, 55.6248, 56.1574, 57.2822, 57.837, 58.9636, 60.074, + 60.7042, 61.7976, 62.4772, 63.6564, 64.7942, 65.5004, 66.686, 67.291, 68.5672, + 69.8556, 70.4982, 71.8204, 72.4252, 73.7744, 75.0786, 75.8344, 77.0294, 77.8098, + 79.0794, 80.5732, 81.1878, 82.5648, 83.2902, 84.6784, 85.3352, 86.8946, 88.3712, + 89.0852, 90.499, 91.2686, 92.6844, 94.2234, 94.9732, 96.3356, 97.2286, 98.7262, + 100.3284, 101.1048, 102.5962, 103.3562, 105.1272, 106.4184, 107.4974, 109.0822, 109.856, + 111.48, 113.2834, 114.0208, 115.637, 116.5174, 118.0576, 119.7476, 120.427, 122.1326, + 123.2372, 125.2788, 126.6776, 127.7926, 129.1952, 129.9564, 131.6454, 133.87, 134.5428, + 136.2, 137.0294, 138.6278, 139.6782, 141.792, 143.3516, 144.2832, 146.0394, 147.0748, + 148.4912, 150.849, 151.696, 153.5404, 154.073, 156.3714, 157.7216, 158.7328, 160.4208, + 161.4184, 163.9424, 165.2772, 166.411, 168.1308, 168.769, 170.9258, 172.6828, 173.7502, + 175.706, 176.3886, 179.0186, 180.4518, 181.927, 183.4172, 184.4114, 186.033, 188.5124, + 189.5564, 191.6008, 192.4172, 193.8044, 194.997, 197.4548, 198.8948, 200.2346, 202.3086, + 203.1548, 204.8842, 206.6508, 206.6772, 209.7254, 210.4752, 212.7228, 214.6614, 215.1676, + 217.793, 218.0006, 219.9052, 221.66, 223.5588, 225.1636, 225.6882, 227.7126, 229.4502, + 231.1978, 232.9756, 233.1654, 236.727, 238.1974, 237.7474, 241.1346, 242.3048, 244.1948, + 245.3134, 246.879, 
249.1204, 249.853, 252.6792, 253.857, 254.4486, 257.2362, 257.9534, + 260.0286, 260.5632, 262.663, 264.723, 265.7566, 267.2566, 267.1624, 270.62, 272.8216, + 273.2166, 275.2056, 276.2202, 278.3726, 280.3344, 281.9284, 283.9728, 284.1924, 286.4872, + 287.587, 289.807, 291.1206, 292.769, 294.8708, 296.665, 297.1182, 299.4012, 300.6352, + 302.1354, 304.1756, 306.1606, 307.3462, 308.5214, 309.4134, 310.8352, 313.9684, 315.837, + 316.7796, 318.9858}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p7{ + 92.0, 93.4934, 94.9758, 96.4574, 97.9718, 99.4954, 101.5302, 103.0756, 104.6374, + 106.1782, 107.7888, 109.9522, 111.592, 113.2532, 114.9086, 116.5938, 118.9474, 120.6796, + 122.4394, 124.2176, 125.9768, 128.4214, 130.2528, 132.0102, 133.8658, 135.7278, 138.3044, + 140.1316, 142.093, 144.0032, 145.9092, 148.6306, 150.5294, 152.5756, 154.6508, 156.662, + 159.552, 161.3724, 163.617, 165.5754, 167.7872, 169.8444, 172.7988, 174.8606, 177.2118, + 179.3566, 181.4476, 184.5882, 186.6816, 189.0824, 191.0258, 193.6048, 196.4436, 198.7274, + 200.957, 203.147, 205.4364, 208.7592, 211.3386, 213.781, 215.8028, 218.656, 221.6544, + 223.996, 226.4718, 229.1544, 231.6098, 234.5956, 237.0616, 239.5758, 242.4878, 244.5244, + 248.2146, 250.724, 252.8722, 255.5198, 258.0414, 261.941, 264.9048, 266.87, 269.4304, + 272.028, 274.4708, 278.37, 281.0624, 283.4668, 286.5532, 289.4352, 293.2564, 295.2744, + 298.2118, 300.7472, 304.1456, 307.2928, 309.7504, 312.5528, 315.979, 318.2102, 322.1834, + 324.3494, 327.325, 330.6614, 332.903, 337.2544, 339.9042, 343.215, 345.2864, 348.0814, + 352.6764, 355.301, 357.139, 360.658, 363.1732, 366.5902, 369.9538, 373.0828, 375.922, + 378.9902, 382.7328, 386.4538, 388.1136, 391.2234, 394.0878, 396.708, 401.1556, 404.1852, + 406.6372, 409.6822, 412.7796, 416.6078, 418.4916, 422.131, 424.5376, 428.1988, 432.211, + 434.4502, 438.5282, 440.912, 444.0448, 447.7432, 450.8524, 453.7988, 456.7858, 458.8868, + 463.9886, 466.5064, 468.9124, 472.6616, 475.4682, 478.582, 
481.304, 485.2738, 488.6894, + 490.329, 496.106, 497.6908, 501.1374, 504.5322, 506.8848, 510.3324, 513.4512, 516.179, + 520.4412, 522.6066, 526.167, 528.7794, 533.379, 536.067, 538.46, 542.9116, 545.692, + 547.9546, 552.493, 555.2722, 557.335, 562.449, 564.2014, 569.0738, 571.0974, 574.8564, + 578.2996, 581.409, 583.9704, 585.8098, 589.6528, 594.5998, 595.958, 600.068, 603.3278, + 608.2016, 609.9632, 612.864, 615.43, 620.7794, 621.272, 625.8644, 629.206, 633.219, + 634.5154, 638.6102}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p8{ + 184.2152, 187.2454, 190.2096, 193.6652, 196.6312, 199.6822, 203.249, 206.3296, 210.0038, + 213.2074, 216.4612, 220.27, 223.5178, 227.4412, 230.8032, 234.1634, 238.1688, 241.6074, + 245.6946, 249.2664, 252.8228, 257.0432, 260.6824, 264.9464, 268.6268, 272.2626, 276.8376, + 280.4034, 284.8956, 288.8522, 292.7638, 297.3552, 301.3556, 305.7526, 309.9292, 313.8954, + 318.8198, 322.7668, 327.298, 331.6688, 335.9466, 340.9746, 345.1672, 349.3474, 354.3028, + 358.8912, 364.114, 368.4646, 372.9744, 378.4092, 382.6022, 387.843, 392.5684, 397.1652, + 402.5426, 407.4152, 412.5388, 417.3592, 422.1366, 427.486, 432.3918, 437.5076, 442.509, + 447.3834, 453.3498, 458.0668, 463.7346, 469.1228, 473.4528, 479.7, 484.644, 491.0518, + 495.5774, 500.9068, 506.432, 512.1666, 517.434, 522.6644, 527.4894, 533.6312, 538.3804, + 544.292, 550.5496, 556.0234, 562.8206, 566.6146, 572.4188, 579.117, 583.6762, 590.6576, + 595.7864, 601.509, 607.5334, 612.9204, 619.772, 624.2924, 630.8654, 636.1836, 642.745, + 649.1316, 655.0386, 660.0136, 666.6342, 671.6196, 678.1866, 684.4282, 689.3324, 695.4794, + 702.5038, 708.129, 713.528, 720.3204, 726.463, 732.7928, 739.123, 744.7418, 751.2192, + 756.5102, 762.6066, 769.0184, 775.2224, 781.4014, 787.7618, 794.1436, 798.6506, 805.6378, + 811.766, 819.7514, 824.5776, 828.7322, 837.8048, 843.6302, 849.9336, 854.4798, 861.3388, + 867.9894, 873.8196, 880.3136, 886.2308, 892.4588, 899.0816, 905.4076, 912.0064, 917.3878, + 923.619, 
929.998, 937.3482, 943.9506, 947.991, 955.1144, 962.203, 968.8222, 975.7324, + 981.7826, 988.7666, 994.2648, 1000.3128, 1007.4082, 1013.7536, 1020.3376, 1026.7156, 1031.7478, + 1037.4292, 1045.393, 1051.2278, 1058.3434, 1062.8726, 1071.884, 1076.806, 1082.9176, 1089.1678, + 1095.5032, 1102.525, 1107.2264, 1115.315, 1120.93, 1127.252, 1134.1496, 1139.0408, 1147.5448, + 1153.3296, 1158.1974, 1166.5262, 1174.3328, 1175.657, 1184.4222, 1190.9172, 1197.1292, 1204.4606, + 1210.4578, 1218.8728, 1225.3336, 1226.6592, 1236.5768, 1241.363, 1249.4074, 1254.6566, 1260.8014, + 1266.5454, 1274.5192}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p9{ + 369.0, 374.8294, 381.2452, 387.6698, 394.1464, 400.2024, 406.8782, 413.6598, 420.462, + 427.2826, 433.7102, 440.7416, 447.9366, 455.1046, 462.285, 469.0668, 476.306, 483.8448, + 491.301, 498.9886, 506.2422, 513.8138, 521.7074, 529.7428, 537.8402, 545.1664, 553.3534, + 561.594, 569.6886, 577.7876, 585.65, 594.228, 602.8036, 611.1666, 620.0818, 628.0824, + 637.2574, 646.302, 655.1644, 664.0056, 672.3802, 681.7192, 690.5234, 700.2084, 708.831, + 718.485, 728.1112, 737.4764, 746.76, 756.3368, 766.5538, 775.5058, 785.2646, 795.5902, + 804.3818, 814.8998, 824.9532, 835.2062, 845.2798, 854.4728, 864.9582, 875.3292, 886.171, + 896.781, 906.5716, 916.7048, 927.5322, 937.875, 949.3972, 958.3464, 969.7274, 980.2834, + 992.1444, 1003.4264, 1013.0166, 1024.018, 1035.0438, 1046.34, 1057.6856, 1068.9836, 1079.0312, + 1091.677, 1102.3188, 1113.4846, 1124.4424, 1135.739, 1147.1488, 1158.9202, 1169.406, 1181.5342, + 1193.2834, 1203.8954, 1216.3286, 1226.2146, 1239.6684, 1251.9946, 1262.123, 1275.4338, 1285.7378, + 1296.076, 1308.9692, 1320.4964, 1333.0998, 1343.9864, 1357.7754, 1368.3208, 1380.4838, 1392.7388, + 1406.0758, 1416.9098, 1428.9728, 1440.9228, 1453.9292, 1462.617, 1476.05, 1490.2996, 1500.6128, + 1513.7392, 1524.5174, 1536.6322, 1548.2584, 1562.3766, 1572.423, 1587.1232, 1596.5164, 1610.5938, + 1622.5972, 1633.1222, 1647.7674, 1658.5044, 
1671.57, 1683.7044, 1695.4142, 1708.7102, 1720.6094, + 1732.6522, 1747.841, 1756.4072, 1769.9786, 1782.3276, 1797.5216, 1808.3186, 1819.0694, 1834.354, + 1844.575, 1856.2808, 1871.1288, 1880.7852, 1893.9622, 1906.3418, 1920.6548, 1932.9302, 1945.8584, + 1955.473, 1968.8248, 1980.6446, 1995.9598, 2008.349, 2019.8556, 2033.0334, 2044.0206, 2059.3956, + 2069.9174, 2082.6084, 2093.7036, 2106.6108, 2118.9124, 2132.301, 2144.7628, 2159.8422, 2171.0212, + 2183.101, 2193.5112, 2208.052, 2221.3194, 2233.3282, 2247.295, 2257.7222, 2273.342, 2286.5638, + 2299.6786, 2310.8114, 2322.3312, 2335.516, 2349.874, 2363.5968, 2373.865, 2387.1918, 2401.8328, + 2414.8496, 2424.544, 2436.7592, 2447.1682, 2464.1958, 2474.3438, 2489.0006, 2497.4526, 2513.6586, + 2527.19, 2540.7028, 2553.768}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p10{ + 738.1256, 750.4234, 763.1064, 775.4732, 788.4636, 801.0644, 814.488, 827.9654, 841.0832, + 854.7864, 868.1992, 882.2176, 896.5228, 910.1716, 924.7752, 938.899, 953.6126, 968.6492, + 982.9474, 998.5214, 1013.1064, 1028.6364, 1044.2468, 1059.4588, 1075.3832, 1091.0584, 1106.8606, + 1123.3868, 1139.5062, 1156.1862, 1172.463, 1189.339, 1206.1936, 1223.1292, 1240.1854, 1257.2908, + 1275.3324, 1292.8518, 1310.5204, 1328.4854, 1345.9318, 1364.552, 1381.4658, 1400.4256, 1419.849, + 1438.152, 1456.8956, 1474.8792, 1494.118, 1513.62, 1532.5132, 1551.9322, 1570.7726, 1590.6086, + 1610.5332, 1630.5918, 1650.4294, 1669.7662, 1690.4106, 1710.7338, 1730.9012, 1750.4486, 1770.1556, + 1791.6338, 1812.7312, 1833.6264, 1853.9526, 1874.8742, 1896.8326, 1918.1966, 1939.5594, 1961.07, + 1983.037, 2003.1804, 2026.071, 2047.4884, 2070.0848, 2091.2944, 2114.333, 2135.9626, 2158.2902, + 2181.0814, 2202.0334, 2224.4832, 2246.39, 2269.7202, 2292.1714, 2314.2358, 2338.9346, 2360.891, + 2384.0264, 2408.3834, 2430.1544, 2454.8684, 2476.9896, 2501.4368, 2522.8702, 2548.0408, 2570.6738, + 2593.5208, 2617.0158, 2640.2302, 2664.0962, 2687.4986, 2714.2588, 2735.3914, 2759.6244, 
2781.8378, + 2808.0072, 2830.6516, 2856.2454, 2877.2136, 2903.4546, 2926.785, 2951.2294, 2976.468, 3000.867, + 3023.6508, 3049.91, 3073.5984, 3098.162, 3121.5564, 3146.2328, 3170.9484, 3195.5902, 3221.3346, + 3242.7032, 3271.6112, 3296.5546, 3317.7376, 3345.072, 3369.9518, 3394.326, 3418.1818, 3444.6926, + 3469.086, 3494.2754, 3517.8698, 3544.248, 3565.3768, 3588.7234, 3616.979, 3643.7504, 3668.6812, + 3695.72, 3719.7392, 3742.6224, 3770.4456, 3795.6602, 3819.9058, 3844.002, 3869.517, 3895.6824, + 3920.8622, 3947.1364, 3973.985, 3995.4772, 4021.62, 4046.628, 4074.65, 4096.2256, 4121.831, + 4146.6406, 4173.276, 4195.0744, 4223.9696, 4251.3708, 4272.9966, 4300.8046, 4326.302, 4353.1248, + 4374.312, 4403.0322, 4426.819, 4450.0598, 4478.5206, 4504.8116, 4528.8928, 4553.9584, 4578.8712, + 4603.8384, 4632.3872, 4655.5128, 4675.821, 4704.6222, 4731.9862, 4755.4174, 4781.2628, 4804.332, + 4832.3048, 4862.8752, 4883.4148, 4906.9544, 4935.3516, 4954.3532, 4984.0248, 5011.217, 5035.3258, + 5057.3672, 5084.1828}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p11{ + 1477.0, 1501.6014, 1526.5802, 1551.7942, 1577.3042, 1603.2062, 1629.8402, 1656.2292, + 1682.9462, 1709.9926, 1737.3026, 1765.4252, 1793.0578, 1821.6092, 1849.626, 1878.5568, + 1908.527, 1937.5154, 1967.1874, 1997.3878, 2027.37, 2058.1972, 2089.5728, 2120.1012, + 2151.9668, 2183.292, 2216.0772, 2247.8578, 2280.6562, 2313.041, 2345.714, 2380.3112, + 2414.1806, 2447.9854, 2481.656, 2516.346, 2551.5154, 2586.8378, 2621.7448, 2656.6722, + 2693.5722, 2729.1462, 2765.4124, 2802.8728, 2838.898, 2876.408, 2913.4926, 2951.4938, + 2989.6776, 3026.282, 3065.7704, 3104.1012, 3143.7388, 3181.6876, 3221.1872, 3261.5048, + 3300.0214, 3339.806, 3381.409, 3421.4144, 3461.4294, 3502.2286, 3544.651, 3586.6156, + 3627.337, 3670.083, 3711.1538, 3753.5094, 3797.01, 3838.6686, 3882.1678, 3922.8116, + 3967.9978, 4009.9204, 4054.3286, 4097.5706, 4140.6014, 4185.544, 4229.5976, 4274.583, + 4316.9438, 4361.672, 4406.2786, 4451.8628, 4496.1834, 
4543.505, 4589.1816, 4632.5188, + 4678.2294, 4724.8908, 4769.0194, 4817.052, 4861.4588, 4910.1596, 4956.4344, 5002.5238, + 5048.13, 5093.6374, 5142.8162, 5187.7894, 5237.3984, 5285.6078, 5331.0858, 5379.1036, + 5428.6258, 5474.6018, 5522.7618, 5571.5822, 5618.59, 5667.9992, 5714.88, 5763.454, + 5808.6982, 5860.3644, 5910.2914, 5953.571, 6005.9232, 6055.1914, 6104.5882, 6154.5702, + 6199.7036, 6251.1764, 6298.7596, 6350.0302, 6398.061, 6448.4694, 6495.933, 6548.0474, + 6597.7166, 6646.9416, 6695.9208, 6742.6328, 6793.5276, 6842.1934, 6894.2372, 6945.3864, + 6996.9228, 7044.2372, 7094.1374, 7142.2272, 7192.2942, 7238.8338, 7288.9006, 7344.0908, + 7394.8544, 7443.5176, 7490.4148, 7542.9314, 7595.6738, 7641.9878, 7694.3688, 7743.0448, + 7797.522, 7845.53, 7899.594, 7950.3132, 7996.455, 8050.9442, 8092.9114, 8153.1374, + 8197.4472, 8252.8278, 8301.8728, 8348.6776, 8401.4698, 8453.551, 8504.6598, 8553.8944, + 8604.1276, 8657.6514, 8710.3062, 8758.908, 8807.8706, 8862.1702, 8910.4668, 8960.77, + 9007.2766, 9063.164, 9121.0534, 9164.1354, 9218.1594, 9267.767, 9319.0594, 9372.155, + 9419.7126, 9474.3722, 9520.1338, 9572.368, 9622.7702, 9675.8448, 9726.5396, 9778.7378, + 9827.6554, 9878.1922, 9928.7782, 9978.3984, 10026.578, 10076.5626, 10137.1618, 10177.5244, + 10229.9176}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p12{ + 2954.0, 3003.4782, 3053.3568, 3104.3666, 3155.324, 3206.9598, 3259.648, 3312.539, + 3366.1474, 3420.2576, 3474.8376, 3530.6076, 3586.451, 3643.38, 3700.4104, 3757.5638, + 3815.9676, 3875.193, 3934.838, 3994.8548, 4055.018, 4117.1742, 4178.4482, 4241.1294, + 4304.4776, 4367.4044, 4431.8724, 4496.3732, 4561.4304, 4627.5326, 4693.949, 4761.5532, + 4828.7256, 4897.6182, 4965.5186, 5034.4528, 5104.865, 5174.7164, 5244.6828, 5316.6708, + 5387.8312, 5459.9036, 5532.476, 5604.8652, 5679.6718, 5753.757, 5830.2072, 5905.2828, + 5980.0434, 6056.6264, 6134.3192, 6211.5746, 6290.0816, 6367.1176, 6447.9796, 6526.5576, + 6606.1858, 6686.9144, 6766.1142, 6847.0818, 
6927.9664, 7010.9096, 7091.0816, 7175.3962, + 7260.3454, 7344.018, 7426.4214, 7511.3106, 7596.0686, 7679.8094, 7765.818, 7852.4248, + 7936.834, 8022.363, 8109.5066, 8200.4554, 8288.5832, 8373.366, 8463.4808, 8549.7682, + 8642.0522, 8728.3288, 8820.9528, 8907.727, 9001.0794, 9091.2522, 9179.988, 9269.852, + 9362.6394, 9453.642, 9546.9024, 9640.6616, 9732.6622, 9824.3254, 9917.7484, 10007.9392, + 10106.7508, 10196.2152, 10289.8114, 10383.5494, 10482.3064, 10576.8734, 10668.7872, 10764.7156, + 10862.0196, 10952.793, 11049.9748, 11146.0702, 11241.4492, 11339.2772, 11434.2336, 11530.741, + 11627.6136, 11726.311, 11821.5964, 11918.837, 12015.3724, 12113.0162, 12213.0424, 12306.9804, + 12408.4518, 12504.8968, 12604.586, 12700.9332, 12798.705, 12898.5142, 12997.0488, 13094.788, + 13198.475, 13292.7764, 13392.9698, 13486.8574, 13590.1616, 13686.5838, 13783.6264, 13887.2638, + 13992.0978, 14081.0844, 14189.9956, 14280.0912, 14382.4956, 14486.4384, 14588.1082, 14686.2392, + 14782.276, 14888.0284, 14985.1864, 15088.8596, 15187.0998, 15285.027, 15383.6694, 15495.8266, + 15591.3736, 15694.2008, 15790.3246, 15898.4116, 15997.4522, 16095.5014, 16198.8514, 16291.7492, + 16402.6424, 16499.1266, 16606.2436, 16697.7186, 16796.3946, 16902.3376, 17005.7672, 17100.814, + 17206.8282, 17305.8262, 17416.0744, 17508.4092, 17617.0178, 17715.4554, 17816.758, 17920.1748, + 18012.9236, 18119.7984, 18223.2248, 18324.2482, 18426.6276, 18525.0932, 18629.8976, 18733.2588, + 18831.0466, 18940.1366, 19032.2696, 19131.729, 19243.4864, 19349.6932, 19442.866, 19547.9448, + 19653.2798, 19754.4034, 19854.0692, 19965.1224, 20065.1774, 20158.2212, 20253.353, 20366.3264, + 20463.22}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p13{ + 5908.5052, 6007.2672, 6107.347, 6208.5794, 6311.2622, 6414.5514, 6519.3376, 6625.6952, + 6732.5988, 6841.3552, 6950.5972, 7061.3082, 7173.5646, 7287.109, 7401.8216, 7516.4344, + 7633.3802, 7751.2962, 7870.3784, 7990.292, 8110.79, 8233.4574, 8356.6036, 8482.2712, + 8607.7708, 
8735.099, 8863.1858, 8993.4746, 9123.8496, 9255.6794, 9388.5448, 9522.7516, + 9657.3106, 9792.6094, 9930.5642, 10068.794, 10206.7256, 10347.81, 10490.3196, 10632.0778, + 10775.9916, 10920.4662, 11066.124, 11213.073, 11358.0362, 11508.1006, 11659.1716, 11808.7514, + 11959.4884, 12112.1314, 12265.037, 12420.3756, 12578.933, 12734.311, 12890.0006, 13047.2144, + 13207.3096, 13368.5144, 13528.024, 13689.847, 13852.7528, 14018.3168, 14180.5372, 14346.9668, + 14513.5074, 14677.867, 14846.2186, 15017.4186, 15184.9716, 15356.339, 15529.2972, 15697.3578, + 15871.8686, 16042.187, 16216.4094, 16389.4188, 16565.9126, 16742.3272, 16919.0042, 17094.7592, + 17273.965, 17451.8342, 17634.4254, 17810.5984, 17988.9242, 18171.051, 18354.7938, 18539.466, + 18721.0408, 18904.9972, 19081.867, 19271.9118, 19451.8694, 19637.9816, 19821.2922, 20013.1292, + 20199.3858, 20387.8726, 20572.9514, 20770.7764, 20955.1714, 21144.751, 21329.9952, 21520.709, + 21712.7016, 21906.3868, 22096.2626, 22286.0524, 22475.051, 22665.5098, 22862.8492, 23055.5294, + 23249.6138, 23437.848, 23636.273, 23826.093, 24020.3296, 24213.3896, 24411.7392, 24602.9614, + 24805.7952, 24998.1552, 25193.9588, 25389.0166, 25585.8392, 25780.6976, 25981.2728, 26175.977, + 26376.5252, 26570.1964, 26773.387, 26962.9812, 27163.0586, 27368.164, 27565.0534, 27758.7428, + 27961.1276, 28163.2324, 28362.3816, 28565.7668, 28758.644, 28956.9768, 29163.4722, 29354.7026, + 29561.1186, 29767.9948, 29959.9986, 30164.0492, 30366.9818, 30562.5338, 30762.9928, 30976.1592, + 31166.274, 31376.722, 31570.3734, 31770.809, 31974.8934, 32179.5286, 32387.5442, 32582.3504, + 32794.076, 32989.9528, 33191.842, 33392.4684, 33595.659, 33801.8672, 34000.3414, 34200.0922, + 34402.6792, 34610.0638, 34804.0084, 35011.13, 35218.669, 35418.6634, 35619.0792, 35830.6534, + 36028.4966, 36229.7902, 36438.6422, 36630.7764, 36833.3102, 37048.6728, 37247.3916, 37453.5904, + 37669.3614, 37854.5526, 38059.305, 38268.0936, 38470.2516, 38674.7064, 38876.167, 39068.3794, + 
39281.9144, 39492.8566, 39684.8628, 39898.4108, 40093.1836, 40297.6858, 40489.7086, 40717.2424}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p14{ + 11817.475, 12015.0046, 12215.3792, 12417.7504, 12623.1814, 12830.0086, 13040.0072, 13252.503, + 13466.178, 13683.2738, 13902.0344, 14123.9798, 14347.394, 14573.7784, 14802.6894, 15033.6824, + 15266.9134, 15502.8624, 15741.4944, 15980.7956, 16223.8916, 16468.6316, 16715.733, 16965.5726, + 17217.204, 17470.666, 17727.8516, 17986.7886, 18247.6902, 18510.9632, 18775.304, 19044.7486, + 19314.4408, 19587.202, 19862.2576, 20135.924, 20417.0324, 20697.9788, 20979.6112, 21265.0274, + 21550.723, 21841.6906, 22132.162, 22428.1406, 22722.127, 23020.5606, 23319.7394, 23620.4014, + 23925.2728, 24226.9224, 24535.581, 24845.505, 25155.9618, 25470.3828, 25785.9702, 26103.7764, + 26420.4132, 26742.0186, 27062.8852, 27388.415, 27714.6024, 28042.296, 28365.4494, 28701.1526, + 29031.8008, 29364.2156, 29704.497, 30037.1458, 30380.111, 30723.8168, 31059.5114, 31404.9498, + 31751.6752, 32095.2686, 32444.7792, 32794.767, 33145.204, 33498.4226, 33847.6502, 34209.006, + 34560.849, 34919.4838, 35274.9778, 35635.1322, 35996.3266, 36359.1394, 36722.8266, 37082.8516, + 37447.7354, 37815.9606, 38191.0692, 38559.4106, 38924.8112, 39294.6726, 39663.973, 40042.261, + 40416.2036, 40779.2036, 41161.6436, 41540.9014, 41921.1998, 42294.7698, 42678.5264, 43061.3464, + 43432.375, 43818.432, 44198.6598, 44583.0138, 44970.4794, 45353.924, 45729.858, 46118.2224, + 46511.5724, 46900.7386, 47280.6964, 47668.1472, 48055.6796, 48446.9436, 48838.7146, 49217.7296, + 49613.7796, 50010.7508, 50410.0208, 50793.7886, 51190.2456, 51583.1882, 51971.0796, 52376.5338, + 52763.319, 53165.5534, 53556.5594, 53948.2702, 54346.352, 54748.7914, 55138.577, 55543.4824, + 55941.1748, 56333.7746, 56745.1552, 57142.7944, 57545.2236, 57935.9956, 58348.5268, 58737.5474, + 59158.5962, 59542.6896, 59958.8004, 60349.3788, 60755.0212, 61147.6144, 61548.194, 61946.0696, + 62348.6042, 62763.603, 
63162.781, 63560.635, 63974.3482, 64366.4908, 64771.5876, 65176.7346, + 65597.3916, 65995.915, 66394.0384, 66822.9396, 67203.6336, 67612.2032, 68019.0078, 68420.0388, + 68821.22, 69235.8388, 69640.0724, 70055.155, 70466.357, 70863.4266, 71276.2482, 71677.0306, + 72080.2006, 72493.0214, 72893.5952, 73314.5856, 73714.9852, 74125.3022, 74521.2122, 74933.6814, + 75341.5904, 75743.0244, 76166.0278, 76572.1322, 76973.1028, 77381.6284, 77800.6092, 78189.328, + 78607.0962, 79012.2508, 79407.8358, 79825.725, 80238.701, 80646.891, 81035.6436, 81460.0448, + 81876.3884}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p15{ + 23635.0036, 24030.8034, 24431.4744, 24837.1524, 25246.7928, 25661.326, 26081.3532, + 26505.2806, 26933.9892, 27367.7098, 27805.318, 28248.799, 28696.4382, 29148.8244, + 29605.5138, 30066.8668, 30534.2344, 31006.32, 31480.778, 31962.2418, 32447.3324, + 32938.0232, 33432.731, 33930.728, 34433.9896, 34944.1402, 35457.5588, 35974.5958, + 36497.3296, 37021.9096, 37554.326, 38088.0826, 38628.8816, 39171.3192, 39723.2326, + 40274.5554, 40832.3142, 41390.613, 41959.5908, 42532.5466, 43102.0344, 43683.5072, + 44266.694, 44851.2822, 45440.7862, 46038.0586, 46640.3164, 47241.064, 47846.155, + 48454.7396, 49076.9168, 49692.542, 50317.4778, 50939.65, 51572.5596, 52210.2906, + 52843.7396, 53481.3996, 54127.236, 54770.406, 55422.6598, 56078.7958, 56736.7174, + 57397.6784, 58064.5784, 58730.308, 59404.9784, 60077.0864, 60751.9158, 61444.1386, + 62115.817, 62808.7742, 63501.4774, 64187.5454, 64883.6622, 65582.7468, 66274.5318, + 66976.9276, 67688.7764, 68402.138, 69109.6274, 69822.9706, 70543.6108, 71265.5202, + 71983.3848, 72708.4656, 73433.384, 74158.4664, 74896.4868, 75620.9564, 76362.1434, + 77098.3204, 77835.7662, 78582.6114, 79323.9902, 80067.8658, 80814.9246, 81567.0136, + 82310.8536, 83061.9952, 83821.4096, 84580.8608, 85335.547, 86092.5802, 86851.6506, + 87612.311, 88381.2016, 89146.3296, 89907.8974, 90676.846, 91451.4152, 92224.5518, + 92995.8686, 93763.5066, 
94551.2796, 95315.1944, 96096.1806, 96881.0918, 97665.679, + 98442.68, 99229.3002, 100011.0994, 100790.6386, 101580.1564, 102377.7484, 103152.1392, + 103944.2712, 104730.216, 105528.6336, 106324.9398, 107117.6706, 107890.3988, 108695.2266, + 109485.238, 110294.7876, 111075.0958, 111878.0496, 112695.2864, 113464.5486, 114270.0474, + 115068.608, 115884.3626, 116673.2588, 117483.3716, 118275.097, 119085.4092, 119879.2808, + 120687.5868, 121499.9944, 122284.916, 123095.9254, 123912.5038, 124709.0454, 125503.7182, + 126323.259, 127138.9412, 127943.8294, 128755.646, 129556.5354, 130375.3298, 131161.4734, + 131971.1962, 132787.5458, 133588.1056, 134431.351, 135220.2906, 136023.398, 136846.6558, + 137667.0004, 138463.663, 139283.7154, 140074.6146, 140901.3072, 141721.8548, 142543.2322, + 143356.1096, 144173.7412, 144973.0948, 145794.3162, 146609.5714, 147420.003, 148237.9784, + 149050.5696, 149854.761, 150663.1966, 151494.0754, 152313.1416, 153112.6902, 153935.7206, + 154746.9262, 155559.547, 156401.9746, 157228.7036, 158008.7254, 158820.75, 159646.9184, + 160470.4458, 161279.5348, 162093.3114, 162918.542, 163729.2842}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p16{ + 47271.0, 48062.3584, 48862.7074, 49673.152, 50492.8416, 51322.9514, 52161.03, + 53009.407, 53867.6348, 54734.206, 55610.5144, 56496.2096, 57390.795, 58297.268, + 59210.6448, 60134.665, 61068.0248, 62010.4472, 62962.5204, 63923.5742, 64895.0194, + 65876.4182, 66862.6136, 67862.6968, 68868.8908, 69882.8544, 70911.271, 71944.0924, + 72990.0326, 74040.692, 75100.6336, 76174.7826, 77252.5998, 78340.2974, 79438.2572, + 80545.4976, 81657.2796, 82784.6336, 83915.515, 85059.7362, 86205.9368, 87364.4424, + 88530.3358, 89707.3744, 90885.9638, 92080.197, 93275.5738, 94479.391, 95695.918, + 96919.2236, 98148.4602, 99382.3474, 100625.6974, 101878.0284, 103141.6278, 104409.4588, + 105686.2882, 106967.5402, 108261.6032, 109548.1578, 110852.0728, 112162.231, 113479.0072, + 114806.2626, 116137.9072, 117469.5048, 118813.5186, 
120165.4876, 121516.2556, 122875.766, + 124250.5444, 125621.2222, 127003.2352, 128387.848, 129775.2644, 131181.7776, 132577.3086, + 133979.9458, 135394.1132, 136800.9078, 138233.217, 139668.5308, 141085.212, 142535.2122, + 143969.0684, 145420.2872, 146878.1542, 148332.7572, 149800.3202, 151269.66, 152743.6104, + 154213.0948, 155690.288, 157169.4246, 158672.1756, 160160.059, 161650.6854, 163145.7772, + 164645.6726, 166159.1952, 167682.1578, 169177.3328, 170700.0118, 172228.8964, 173732.6664, + 175265.5556, 176787.799, 178317.111, 179856.6914, 181400.865, 182943.4612, 184486.742, + 186033.4698, 187583.7886, 189148.1868, 190688.4526, 192250.1926, 193810.9042, 195354.2972, + 196938.7682, 198493.5898, 200079.2824, 201618.912, 203205.5492, 204765.5798, 206356.1124, + 207929.3064, 209498.7196, 211086.229, 212675.1324, 214256.7892, 215826.2392, 217412.8474, + 218995.6724, 220618.6038, 222207.1166, 223781.0364, 225387.4332, 227005.7928, 228590.4336, + 230217.8738, 231805.1054, 233408.9, 234995.3432, 236601.4956, 238190.7904, 239817.2548, + 241411.2832, 243002.4066, 244640.1884, 246255.3128, 247849.3508, 249479.9734, 251106.8822, + 252705.027, 254332.9242, 255935.129, 257526.9014, 259154.772, 260777.625, 262390.253, + 264004.4906, 265643.59, 267255.4076, 268873.426, 270470.7252, 272106.4804, 273722.4456, + 275337.794, 276945.7038, 278592.9154, 280204.3726, 281841.1606, 283489.171, 285130.1716, + 286735.3362, 288364.7164, 289961.1814, 291595.5524, 293285.683, 294899.6668, 296499.3434, + 298128.0462, 299761.8946, 301394.2424, 302997.6748, 304615.1478, 306269.7724, 307886.114, + 309543.1028, 311153.2862, 312782.8546, 314421.2008, 316033.2438, 317692.9636, 319305.2648, + 320948.7406, 322566.3364, 324228.4224, 325847.1542}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p17{ + 94542.0, 96125.811, 97728.019, 99348.558, 100987.9705, 102646.7565, 104324.5125, + 106021.7435, 107736.7865, 109469.272, 111223.9465, 112995.219, 114787.432, 116593.152, + 118422.71, 120267.2345, 122134.6765, 
124020.937, 125927.2705, 127851.255, 129788.9485, + 131751.016, 133726.8225, 135722.592, 137736.789, 139770.568, 141821.518, 143891.343, + 145982.1415, 148095.387, 150207.526, 152355.649, 154515.6415, 156696.05, 158887.7575, + 161098.159, 163329.852, 165569.053, 167837.4005, 170121.6165, 172420.4595, 174732.6265, + 177062.77, 179412.502, 181774.035, 184151.939, 186551.6895, 188965.691, 191402.8095, + 193857.949, 196305.0775, 198774.6715, 201271.2585, 203764.78, 206299.3695, 208818.1365, + 211373.115, 213946.7465, 216532.076, 219105.541, 221714.5375, 224337.5135, 226977.5125, + 229613.0655, 232270.2685, 234952.2065, 237645.3555, 240331.1925, 243034.517, 245756.0725, + 248517.6865, 251232.737, 254011.3955, 256785.995, 259556.44, 262368.335, 265156.911, + 267965.266, 270785.583, 273616.0495, 276487.4835, 279346.639, 282202.509, 285074.3885, + 287942.2855, 290856.018, 293774.0345, 296678.5145, 299603.6355, 302552.6575, 305492.9785, + 308466.8605, 311392.581, 314347.538, 317319.4295, 320285.9785, 323301.7325, 326298.3235, + 329301.3105, 332301.987, 335309.791, 338370.762, 341382.923, 344431.1265, 347464.1545, + 350507.28, 353619.2345, 356631.2005, 359685.203, 362776.7845, 365886.488, 368958.2255, + 372060.6825, 375165.4335, 378237.935, 381328.311, 384430.5225, 387576.425, 390683.242, + 393839.648, 396977.8425, 400101.9805, 403271.296, 406409.8425, 409529.5485, 412678.7, + 415847.423, 419020.8035, 422157.081, 425337.749, 428479.6165, 431700.902, 434893.1915, + 438049.582, 441210.5415, 444379.2545, 447577.356, 450741.931, 453959.548, 457137.0935, + 460329.846, 463537.4815, 466732.3345, 469960.5615, 473164.681, 476347.6345, 479496.173, + 482813.1645, 486025.6995, 489249.4885, 492460.1945, 495675.8805, 498908.0075, 502131.802, + 505374.3855, 508550.9915, 511806.7305, 515026.776, 518217.0005, 521523.9855, 524705.9855, + 527950.997, 531210.0265, 534472.497, 537750.7315, 540926.922, 544207.094, 547429.4345, + 550666.3745, 553975.3475, 557150.7185, 560399.6165, 563662.697, 
566916.7395, 570146.1215, + 573447.425, 576689.6245, 579874.5745, 583202.337, 586503.0255, 589715.635, 592910.161, + 596214.3885, 599488.035, 602740.92, 605983.0685, 609248.67, 612491.3605, 615787.912, + 619107.5245, 622307.9555, 625577.333, 628840.4385, 632085.2155, 635317.6135, 638691.7195, + 641887.467, 645139.9405, 648441.546, 651666.252, 654941.845}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p18{ + 189084.0, 192250.913, 195456.774, 198696.946, 201977.762, 205294.444, 208651.754, + 212042.099, 215472.269, 218941.91, 222443.912, 225996.845, 229568.199, 233193.568, + 236844.457, 240543.233, 244279.475, 248044.27, 251854.588, 255693.2, 259583.619, + 263494.621, 267445.385, 271454.061, 275468.769, 279549.456, 283646.446, 287788.198, + 291966.099, 296181.164, 300431.469, 304718.618, 309024.004, 313393.508, 317760.803, + 322209.731, 326675.061, 331160.627, 335654.47, 340241.442, 344841.833, 349467.132, + 354130.629, 358819.432, 363574.626, 368296.587, 373118.482, 377914.93, 382782.301, + 387680.669, 392601.981, 397544.323, 402529.115, 407546.018, 412593.658, 417638.657, + 422762.865, 427886.169, 433017.167, 438213.273, 443441.254, 448692.421, 453937.533, + 459239.049, 464529.569, 469910.083, 475274.03, 480684.473, 486070.26, 491515.237, + 496995.651, 502476.617, 507973.609, 513497.19, 519083.233, 524726.509, 530305.505, + 535945.728, 541584.404, 547274.055, 552967.236, 558667.862, 564360.216, 570128.148, + 575965.08, 581701.952, 587532.523, 593361.144, 599246.128, 605033.418, 610958.779, + 616837.117, 622772.818, 628672.04, 634675.369, 640574.831, 646585.739, 652574.547, + 658611.217, 664642.684, 670713.914, 676737.681, 682797.313, 688837.897, 694917.874, + 701009.882, 707173.648, 713257.254, 719415.392, 725636.761, 731710.697, 737906.209, + 744103.074, 750313.39, 756504.185, 762712.579, 768876.985, 775167.859, 781359.0, + 787615.959, 793863.597, 800245.477, 806464.582, 812785.294, 819005.925, 825403.057, + 831676.197, 837936.284, 844266.968, 850642.711, 
856959.756, 863322.774, 869699.931, + 876102.478, 882355.787, 888694.463, 895159.952, 901536.143, 907872.631, 914293.672, + 920615.14, 927130.974, 933409.404, 939922.178, 946331.47, 952745.93, 959209.264, + 965590.224, 972077.284, 978501.961, 984953.19, 991413.271, 997817.479, 1004222.658, + 1010725.676, 1017177.138, 1023612.529, 1030098.236, 1036493.719, 1043112.207, 1049537.036, + 1056008.096, 1062476.184, 1068942.337, 1075524.95, 1081932.864, 1088426.025, 1094776.005, + 1101327.448, 1107901.673, 1114423.639, 1120884.602, 1127324.923, 1133794.24, 1140328.886, + 1146849.376, 1153346.682, 1159836.502, 1166478.703, 1172953.304, 1179391.502, 1185950.982, + 1192544.052, 1198913.41, 1205430.994, 1212015.525, 1218674.042, 1225121.683, 1231551.101, + 1238126.379, 1244673.795, 1251260.649, 1257697.86, 1264320.983, 1270736.319, 1277274.694, + 1283804.95, 1290211.514, 1296858.568, 1303455.691}; + +// Meta array storing interpolation points for estimates for Precision=4..18 +__device__ static cuda::std::array constexpr raw_estimate_data{raw_estimate_data_p4.data(), + raw_estimate_data_p5.data(), + raw_estimate_data_p6.data(), + raw_estimate_data_p7.data(), + raw_estimate_data_p8.data(), + raw_estimate_data_p9.data(), + raw_estimate_data_p10.data(), + raw_estimate_data_p11.data(), + raw_estimate_data_p12.data(), + raw_estimate_data_p13.data(), + raw_estimate_data_p14.data(), + raw_estimate_data_p15.data(), + raw_estimate_data_p16.data(), + raw_estimate_data_p17.data(), + raw_estimate_data_p18.data()}; + +CUCO_HLL_TUNING_ARR_DECL bias_data_p4{10.0, + 9.717, + 9.207, + 8.7896, + 8.2882, + 7.8204, + 7.3772, + 6.9342, + 6.5202, + 6.161, + 5.7722, + 5.4636, + 5.0396, + 4.6766, + 4.3566, + 4.0454, + 3.7936, + 3.4856, + 3.2666, + 2.9946, + 2.766, + 2.4692, + 2.3638, + 2.0764, + 1.7864, + 1.7602, + 1.4814, + 1.433, + 1.2926, + 1.0664, + 0.999600000000001, + 0.7956, + 0.5366, + 0.589399999999998, + 0.573799999999999, + 0.269799999999996, + 0.368200000000002, + 0.0544000000000011, + 
0.234200000000001, + 0.0108000000000033, + -0.203400000000002, + -0.0701999999999998, + -0.129600000000003, + -0.364199999999997, + -0.480600000000003, + -0.226999999999997, + -0.322800000000001, + -0.382599999999996, + -0.511200000000002, + -0.669600000000003, + -0.749400000000001, + -0.500399999999999, + -0.617600000000003, + -0.6922, + -0.601599999999998, + -0.416200000000003, + -0.338200000000001, + -0.782600000000002, + -0.648600000000002, + -0.919800000000002, + -0.851799999999997, + -0.962400000000002, + -0.6402, + -1.1922, + -1.0256, + -1.086, + -1.21899999999999, + -0.819400000000002, + -0.940600000000003, + -1.1554, + -1.2072, + -1.1752, + -1.16759999999999, + -1.14019999999999, + -1.3754, + -1.29859999999999, + -1.607, + -1.3292, + -1.7606}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p5{22.0, + 21.1194, + 20.8208, + 20.2318, + 19.77, + 19.2436, + 18.7774, + 18.2848, + 17.8224, + 17.3742, + 16.9336, + 16.503, + 16.0494, + 15.6292, + 15.2124, + 14.798, + 14.367, + 13.9728, + 13.5944, + 13.217, + 12.8438, + 12.3696, + 12.0956, + 11.7044, + 11.324, + 11.0668, + 10.6698, + 10.3644, + 10.049, + 9.6918, + 9.4146, + 9.082, + 8.687, + 8.5398, + 8.2462, + 7.857, + 7.6606, + 7.4168, + 7.1248, + 6.9222, + 6.6804, + 6.447, + 6.3454, + 5.9594, + 5.7636, + 5.5776, + 5.331, + 5.19, + 4.9676, + 4.7564, + 4.5314, + 4.4442, + 4.3708, + 3.9774, + 3.9624, + 3.8796, + 3.755, + 3.472, + 3.2076, + 3.1024, + 2.8908, + 2.7338, + 2.7728, + 2.629, + 2.413, + 2.3266, + 2.1524, + 2.2642, + 2.1806, + 2.0566, + 1.9192, + 1.7598, + 1.3516, + 1.5802, + 1.43859999999999, + 1.49160000000001, + 1.1524, + 1.1892, + 0.841399999999993, + 0.879800000000003, + 0.837599999999995, + 0.469800000000006, + 0.765600000000006, + 0.331000000000003, + 0.591399999999993, + 0.601200000000006, + 0.701599999999999, + 0.558199999999999, + 0.339399999999998, + 0.354399999999998, + 0.491200000000006, + 0.308000000000007, + 0.355199999999996, + -0.0254000000000048, + 0.205200000000005, + -0.272999999999996, + 
0.132199999999997, + 0.394400000000005, + -0.241200000000006, + 0.242000000000004, + 0.191400000000002, + 0.253799999999998, + -0.122399999999999, + -0.370800000000003, + 0.193200000000004, + -0.0848000000000013, + 0.0867999999999967, + -0.327200000000005, + -0.285600000000002, + 0.311400000000006, + -0.128399999999999, + -0.754999999999995, + -0.209199999999996, + -0.293599999999998, + -0.364000000000004, + -0.253600000000006, + -0.821200000000005, + -0.253600000000006, + -0.510400000000004, + -0.383399999999995, + -0.491799999999998, + -0.220200000000006, + -0.0972000000000008, + -0.557400000000001, + -0.114599999999996, + -0.295000000000002, + -0.534800000000004, + 0.346399999999988, + -0.65379999999999, + 0.0398000000000138, + 0.0341999999999985, + -0.995800000000003, + -0.523400000000009, + -0.489000000000004, + -0.274799999999999, + -0.574999999999989, + -0.482799999999997, + 0.0571999999999946, + -0.330600000000004, + -0.628800000000012, + -0.140199999999993, + -0.540600000000012, + -0.445999999999998, + -0.599400000000003, + -0.262599999999992, + 0.163399999999996, + -0.100599999999986, + -0.39500000000001, + -1.06960000000001, + -0.836399999999998, + -0.753199999999993, + -0.412399999999991, + -0.790400000000005, + -0.29679999999999, + -0.28540000000001, + -0.193000000000012, + -0.0772000000000048, + -0.962799999999987, + -0.414800000000014}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p6{45.0, + 44.1902, + 43.271, + 42.8358, + 41.8142, + 41.2854, + 40.317, + 39.354, + 38.8924, + 37.9436, + 37.4596, + 36.5262, + 35.6248, + 35.1574, + 34.2822, + 33.837, + 32.9636, + 32.074, + 31.7042, + 30.7976, + 30.4772, + 29.6564, + 28.7942, + 28.5004, + 27.686, + 27.291, + 26.5672, + 25.8556, + 25.4982, + 24.8204, + 24.4252, + 23.7744, + 23.0786, + 22.8344, + 22.0294, + 21.8098, + 21.0794, + 20.5732, + 20.1878, + 19.5648, + 19.2902, + 18.6784, + 18.3352, + 17.8946, + 17.3712, + 17.0852, + 16.499, + 16.2686, + 15.6844, + 15.2234, + 14.9732, + 14.3356, + 14.2286, + 13.7262, + 
13.3284, + 13.1048, + 12.5962, + 12.3562, + 12.1272, + 11.4184, + 11.4974, + 11.0822, + 10.856, + 10.48, + 10.2834, + 10.0208, + 9.637, + 9.51739999999999, + 9.05759999999999, + 8.74760000000001, + 8.42700000000001, + 8.1326, + 8.2372, + 8.2788, + 7.6776, + 7.79259999999999, + 7.1952, + 6.9564, + 6.6454, + 6.87, + 6.5428, + 6.19999999999999, + 6.02940000000001, + 5.62780000000001, + 5.6782, + 5.792, + 5.35159999999999, + 5.28319999999999, + 5.0394, + 5.07480000000001, + 4.49119999999999, + 4.84899999999999, + 4.696, + 4.54040000000001, + 4.07300000000001, + 4.37139999999999, + 3.7216, + 3.7328, + 3.42080000000001, + 3.41839999999999, + 3.94239999999999, + 3.27719999999999, + 3.411, + 3.13079999999999, + 2.76900000000001, + 2.92580000000001, + 2.68279999999999, + 2.75020000000001, + 2.70599999999999, + 2.3886, + 3.01859999999999, + 2.45179999999999, + 2.92699999999999, + 2.41720000000001, + 2.41139999999999, + 2.03299999999999, + 2.51240000000001, + 2.5564, + 2.60079999999999, + 2.41720000000001, + 1.80439999999999, + 1.99700000000001, + 2.45480000000001, + 1.8948, + 2.2346, + 2.30860000000001, + 2.15479999999999, + 1.88419999999999, + 1.6508, + 0.677199999999999, + 1.72540000000001, + 1.4752, + 1.72280000000001, + 1.66139999999999, + 1.16759999999999, + 1.79300000000001, + 1.00059999999999, + 0.905200000000008, + 0.659999999999997, + 1.55879999999999, + 1.1636, + 0.688199999999995, + 0.712600000000009, + 0.450199999999995, + 1.1978, + 0.975599999999986, + 0.165400000000005, + 1.727, + 1.19739999999999, + -0.252600000000001, + 1.13460000000001, + 1.3048, + 1.19479999999999, + 0.313400000000001, + 0.878999999999991, + 1.12039999999999, + 0.853000000000009, + 1.67920000000001, + 0.856999999999999, + 0.448599999999999, + 1.2362, + 0.953399999999988, + 1.02859999999998, + 0.563199999999995, + 0.663000000000011, + 0.723000000000013, + 0.756599999999992, + 0.256599999999992, + -0.837600000000009, + 0.620000000000005, + 0.821599999999989, + 0.216600000000028, + 
0.205600000000004, + 0.220199999999977, + 0.372599999999977, + 0.334400000000016, + 0.928400000000011, + 0.972800000000007, + 0.192400000000021, + 0.487199999999973, + -0.413000000000011, + 0.807000000000016, + 0.120600000000024, + 0.769000000000005, + 0.870799999999974, + 0.66500000000002, + 0.118200000000002, + 0.401200000000017, + 0.635199999999998, + 0.135400000000004, + 0.175599999999974, + 1.16059999999999, + 0.34620000000001, + 0.521400000000028, + -0.586599999999976, + -1.16480000000001, + 0.968399999999974, + 0.836999999999989, + 0.779600000000016, + 0.985799999999983}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p7{91.0, + 89.4934, + 87.9758, + 86.4574, + 84.9718, + 83.4954, + 81.5302, + 80.0756, + 78.6374, + 77.1782, + 75.7888, + 73.9522, + 72.592, + 71.2532, + 69.9086, + 68.5938, + 66.9474, + 65.6796, + 64.4394, + 63.2176, + 61.9768, + 60.4214, + 59.2528, + 58.0102, + 56.8658, + 55.7278, + 54.3044, + 53.1316, + 52.093, + 51.0032, + 49.9092, + 48.6306, + 47.5294, + 46.5756, + 45.6508, + 44.662, + 43.552, + 42.3724, + 41.617, + 40.5754, + 39.7872, + 38.8444, + 37.7988, + 36.8606, + 36.2118, + 35.3566, + 34.4476, + 33.5882, + 32.6816, + 32.0824, + 31.0258, + 30.6048, + 29.4436, + 28.7274, + 27.957, + 27.147, + 26.4364, + 25.7592, + 25.3386, + 24.781, + 23.8028, + 23.656, + 22.6544, + 21.996, + 21.4718, + 21.1544, + 20.6098, + 19.5956, + 19.0616, + 18.5758, + 18.4878, + 17.5244, + 17.2146, + 16.724, + 15.8722, + 15.5198, + 15.0414, + 14.941, + 14.9048, + 13.87, + 13.4304, + 13.028, + 12.4708, + 12.37, + 12.0624, + 11.4668, + 11.5532, + 11.4352, + 11.2564, + 10.2744, + 10.2118, + 9.74720000000002, + 10.1456, + 9.2928, + 8.75040000000001, + 8.55279999999999, + 8.97899999999998, + 8.21019999999999, + 8.18340000000001, + 7.3494, + 7.32499999999999, + 7.66140000000001, + 6.90300000000002, + 7.25439999999998, + 6.9042, + 7.21499999999997, + 6.28640000000001, + 6.08139999999997, + 6.6764, + 6.30099999999999, + 5.13900000000001, + 5.65800000000002, + 5.17320000000001, + 
4.59019999999998, + 4.9538, + 5.08280000000002, + 4.92200000000003, + 4.99020000000002, + 4.7328, + 5.4538, + 4.11360000000002, + 4.22340000000003, + 4.08780000000002, + 3.70800000000003, + 4.15559999999999, + 4.18520000000001, + 3.63720000000001, + 3.68220000000002, + 3.77960000000002, + 3.6078, + 2.49160000000001, + 3.13099999999997, + 2.5376, + 3.19880000000001, + 3.21100000000001, + 2.4502, + 3.52820000000003, + 2.91199999999998, + 3.04480000000001, + 2.7432, + 2.85239999999999, + 2.79880000000003, + 2.78579999999999, + 1.88679999999999, + 2.98860000000002, + 2.50639999999999, + 1.91239999999999, + 2.66160000000002, + 2.46820000000002, + 1.58199999999999, + 1.30399999999997, + 2.27379999999999, + 2.68939999999998, + 1.32900000000001, + 3.10599999999999, + 1.69080000000002, + 2.13740000000001, + 2.53219999999999, + 1.88479999999998, + 1.33240000000001, + 1.45119999999997, + 1.17899999999997, + 2.44119999999998, + 1.60659999999996, + 2.16700000000003, + 0.77940000000001, + 2.37900000000002, + 2.06700000000001, + 1.46000000000004, + 2.91160000000002, + 1.69200000000001, + 0.954600000000028, + 2.49300000000005, + 2.2722, + 1.33500000000004, + 2.44899999999996, + 1.20140000000004, + 3.07380000000001, + 2.09739999999999, + 2.85640000000001, + 2.29960000000005, + 2.40899999999999, + 1.97040000000004, + 0.809799999999996, + 1.65279999999996, + 2.59979999999996, + 0.95799999999997, + 2.06799999999998, + 2.32780000000002, + 4.20159999999998, + 1.96320000000003, + 1.86400000000003, + 1.42999999999995, + 3.77940000000001, + 1.27200000000005, + 1.86440000000005, + 2.20600000000002, + 3.21900000000005, + 1.5154, + 2.61019999999996}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p8{183.2152, + 180.2454, + 177.2096, + 173.6652, + 170.6312, + 167.6822, + 164.249, + 161.3296, + 158.0038, + 155.2074, + 152.4612, + 149.27, + 146.5178, + 143.4412, + 140.8032, + 138.1634, + 135.1688, + 132.6074, + 129.6946, + 127.2664, + 124.8228, + 122.0432, + 119.6824, + 116.9464, + 114.6268, + 112.2626, + 
109.8376, + 107.4034, + 104.8956, + 102.8522, + 100.7638, + 98.3552, + 96.3556, + 93.7526, + 91.9292, + 89.8954, + 87.8198, + 85.7668, + 83.298, + 81.6688, + 79.9466, + 77.9746, + 76.1672, + 74.3474, + 72.3028, + 70.8912, + 69.114, + 67.4646, + 65.9744, + 64.4092, + 62.6022, + 60.843, + 59.5684, + 58.1652, + 56.5426, + 55.4152, + 53.5388, + 52.3592, + 51.1366, + 49.486, + 48.3918, + 46.5076, + 45.509, + 44.3834, + 43.3498, + 42.0668, + 40.7346, + 40.1228, + 38.4528, + 37.7, + 36.644, + 36.0518, + 34.5774, + 33.9068, + 32.432, + 32.1666, + 30.434, + 29.6644, + 28.4894, + 27.6312, + 26.3804, + 26.292, + 25.5496000000001, + 25.0234, + 24.8206, + 22.6146, + 22.4188, + 22.117, + 20.6762, + 20.6576, + 19.7864, + 19.509, + 18.5334, + 17.9204, + 17.772, + 16.2924, + 16.8654, + 15.1836, + 15.745, + 15.1316, + 15.0386, + 14.0136, + 13.6342, + 12.6196, + 12.1866, + 12.4281999999999, + 11.3324, + 10.4794000000001, + 11.5038, + 10.129, + 9.52800000000002, + 10.3203999999999, + 9.46299999999997, + 9.79280000000006, + 9.12300000000005, + 8.74180000000001, + 9.2192, + 7.51020000000005, + 7.60659999999996, + 7.01840000000004, + 7.22239999999999, + 7.40139999999997, + 6.76179999999999, + 7.14359999999999, + 5.65060000000005, + 5.63779999999997, + 5.76599999999996, + 6.75139999999999, + 5.57759999999996, + 3.73220000000003, + 5.8048, + 5.63019999999995, + 4.93359999999996, + 3.47979999999995, + 4.33879999999999, + 3.98940000000005, + 3.81960000000004, + 3.31359999999995, + 3.23080000000004, + 3.4588, + 3.08159999999998, + 3.4076, + 3.00639999999999, + 2.38779999999997, + 2.61900000000003, + 1.99800000000005, + 3.34820000000002, + 2.95060000000001, + 0.990999999999985, + 2.11440000000005, + 2.20299999999997, + 2.82219999999995, + 2.73239999999998, + 2.7826, + 3.76660000000004, + 2.26480000000004, + 2.31280000000004, + 2.40819999999997, + 2.75360000000001, + 3.33759999999995, + 2.71559999999999, + 1.7478000000001, + 1.42920000000004, + 2.39300000000003, + 2.22779999999989, + 
2.34339999999997, + 0.87259999999992, + 3.88400000000001, + 1.80600000000004, + 1.91759999999999, + 1.16779999999994, + 1.50320000000011, + 2.52500000000009, + 0.226400000000012, + 2.31500000000005, + 0.930000000000064, + 1.25199999999995, + 2.14959999999996, + 0.0407999999999902, + 2.5447999999999, + 1.32960000000003, + 0.197400000000016, + 2.52620000000002, + 3.33279999999991, + -1.34300000000007, + 0.422199999999975, + 0.917200000000093, + 1.12920000000008, + 1.46060000000011, + 1.45779999999991, + 2.8728000000001, + 3.33359999999993, + -1.34079999999994, + 1.57680000000005, + 0.363000000000056, + 1.40740000000005, + 0.656600000000026, + 0.801400000000058, + -0.454600000000028, + 1.51919999999996}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p9{368.0, + 361.8294, + 355.2452, + 348.6698, + 342.1464, + 336.2024, + 329.8782, + 323.6598, + 317.462, + 311.2826, + 305.7102, + 299.7416, + 293.9366, + 288.1046, + 282.285, + 277.0668, + 271.306, + 265.8448, + 260.301, + 254.9886, + 250.2422, + 244.8138, + 239.7074, + 234.7428, + 229.8402, + 225.1664, + 220.3534, + 215.594, + 210.6886, + 205.7876, + 201.65, + 197.228, + 192.8036, + 188.1666, + 184.0818, + 180.0824, + 176.2574, + 172.302, + 168.1644, + 164.0056, + 160.3802, + 156.7192, + 152.5234, + 149.2084, + 145.831, + 142.485, + 139.1112, + 135.4764, + 131.76, + 129.3368, + 126.5538, + 122.5058, + 119.2646, + 116.5902, + 113.3818, + 110.8998, + 107.9532, + 105.2062, + 102.2798, + 99.4728, + 96.9582, + 94.3292, + 92.171, + 89.7809999999999, + 87.5716, + 84.7048, + 82.5322, + 79.875, + 78.3972, + 75.3464, + 73.7274, + 71.2834, + 70.1444, + 68.4263999999999, + 66.0166, + 64.018, + 62.0437999999999, + 60.3399999999999, + 58.6856, + 57.9836, + 55.0311999999999, + 54.6769999999999, + 52.3188, + 51.4846, + 49.4423999999999, + 47.739, + 46.1487999999999, + 44.9202, + 43.4059999999999, + 42.5342000000001, + 41.2834, + 38.8954000000001, + 38.3286000000001, + 36.2146, + 36.6684, + 35.9946, + 33.123, + 33.4338, + 31.7378000000001, + 
29.076, + 28.9692, + 27.4964, + 27.0998, + 25.9864, + 26.7754, + 24.3208, + 23.4838, + 22.7388000000001, + 24.0758000000001, + 21.9097999999999, + 20.9728, + 19.9228000000001, + 19.9292, + 16.617, + 17.05, + 18.2996000000001, + 15.6128000000001, + 15.7392, + 14.5174, + 13.6322, + 12.2583999999999, + 13.3766000000001, + 11.423, + 13.1232, + 9.51639999999998, + 10.5938000000001, + 9.59719999999993, + 8.12220000000002, + 9.76739999999995, + 7.50440000000003, + 7.56999999999994, + 6.70440000000008, + 6.41419999999994, + 6.71019999999999, + 5.60940000000005, + 4.65219999999999, + 6.84099999999989, + 3.4072000000001, + 3.97859999999991, + 3.32760000000007, + 5.52160000000003, + 3.31860000000006, + 2.06940000000009, + 4.35400000000004, + 1.57500000000005, + 0.280799999999999, + 2.12879999999996, + -0.214799999999968, + -0.0378000000000611, + -0.658200000000079, + 0.654800000000023, + -0.0697999999999865, + 0.858400000000074, + -2.52700000000004, + -2.1751999999999, + -3.35539999999992, + -1.04019999999991, + -0.651000000000067, + -2.14439999999991, + -1.96659999999997, + -3.97939999999994, + -0.604400000000169, + -3.08260000000018, + -3.39159999999993, + -5.29640000000018, + -5.38920000000007, + -5.08759999999984, + -4.69900000000007, + -5.23720000000003, + -3.15779999999995, + -4.97879999999986, + -4.89899999999989, + -7.48880000000008, + -5.94799999999987, + -5.68060000000014, + -6.67180000000008, + -4.70499999999993, + -7.27779999999984, + -4.6579999999999, + -4.4362000000001, + -4.32139999999981, + -5.18859999999995, + -6.66879999999992, + -6.48399999999992, + -5.1260000000002, + -4.4032000000002, + -6.13500000000022, + -5.80819999999994, + -4.16719999999987, + -4.15039999999999, + -7.45600000000013, + -7.24080000000004, + -9.83179999999993, + -5.80420000000004, + -8.6561999999999, + -6.99940000000015, + -10.5473999999999, + -7.34139999999979, + -6.80999999999995, + -6.29719999999998, + -6.23199999999997}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p10{737.1256, + 724.4234, 
+ 711.1064, + 698.4732, + 685.4636, + 673.0644, + 660.488, + 647.9654, + 636.0832, + 623.7864, + 612.1992, + 600.2176, + 588.5228, + 577.1716, + 565.7752, + 554.899, + 543.6126, + 532.6492, + 521.9474, + 511.5214, + 501.1064, + 490.6364, + 480.2468, + 470.4588, + 460.3832, + 451.0584, + 440.8606, + 431.3868, + 422.5062, + 413.1862, + 404.463, + 395.339, + 386.1936, + 378.1292, + 369.1854, + 361.2908, + 353.3324, + 344.8518, + 337.5204, + 329.4854, + 321.9318, + 314.552, + 306.4658, + 299.4256, + 292.849, + 286.152, + 278.8956, + 271.8792, + 265.118, + 258.62, + 252.5132, + 245.9322, + 239.7726, + 233.6086, + 227.5332, + 222.5918, + 216.4294, + 210.7662, + 205.4106, + 199.7338, + 194.9012, + 188.4486, + 183.1556, + 178.6338, + 173.7312, + 169.6264, + 163.9526, + 159.8742, + 155.8326, + 151.1966, + 147.5594, + 143.07, + 140.037, + 134.1804, + 131.071, + 127.4884, + 124.0848, + 120.2944, + 117.333, + 112.9626, + 110.2902, + 107.0814, + 103.0334, + 99.4832000000001, + 96.3899999999999, + 93.7202000000002, + 90.1714000000002, + 87.2357999999999, + 85.9346, + 82.8910000000001, + 80.0264000000002, + 78.3834000000002, + 75.1543999999999, + 73.8683999999998, + 70.9895999999999, + 69.4367999999999, + 64.8701999999998, + 65.0408000000002, + 61.6738, + 59.5207999999998, + 57.0158000000001, + 54.2302, + 53.0962, + 50.4985999999999, + 52.2588000000001, + 47.3914, + 45.6244000000002, + 42.8377999999998, + 43.0072, + 40.6516000000001, + 40.2453999999998, + 35.2136, + 36.4546, + 33.7849999999999, + 33.2294000000002, + 32.4679999999998, + 30.8670000000002, + 28.6507999999999, + 28.9099999999999, + 27.5983999999999, + 26.1619999999998, + 24.5563999999999, + 23.2328000000002, + 21.9484000000002, + 21.5902000000001, + 21.3346000000001, + 17.7031999999999, + 20.6111999999998, + 19.5545999999999, + 15.7375999999999, + 17.0720000000001, + 16.9517999999998, + 15.326, + 13.1817999999998, + 14.6925999999999, + 13.0859999999998, + 13.2754, + 10.8697999999999, + 11.248, + 7.3768, + 
4.72339999999986, + 7.97899999999981, + 8.7503999999999, + 7.68119999999999, + 9.7199999999998, + 7.73919999999998, + 5.6224000000002, + 7.44560000000001, + 6.6601999999998, + 5.9058, + 4.00199999999995, + 4.51699999999983, + 4.68240000000014, + 3.86220000000003, + 5.13639999999987, + 5.98500000000013, + 2.47719999999981, + 2.61999999999989, + 1.62800000000016, + 4.65000000000009, + 0.225599999999758, + 0.831000000000131, + -0.359400000000278, + 1.27599999999984, + -2.92559999999958, + -0.0303999999996449, + 2.37079999999969, + -2.0033999999996, + 0.804600000000391, + 0.30199999999968, + 1.1247999999996, + -2.6880000000001, + 0.0321999999996478, + -1.18099999999959, + -3.9402, + -1.47940000000017, + -0.188400000000001, + -2.10720000000038, + -2.04159999999956, + -3.12880000000041, + -4.16160000000036, + -0.612799999999879, + -3.48719999999958, + -8.17900000000009, + -5.37780000000021, + -4.01379999999972, + -5.58259999999973, + -5.73719999999958, + -7.66799999999967, + -5.69520000000011, + -1.1247999999996, + -5.58520000000044, + -8.04560000000038, + -4.64840000000004, + -11.6468000000004, + -7.97519999999986, + -5.78300000000036, + -7.67420000000038, + -10.6328000000003, + -9.81720000000041}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p11{1476.0, + 1449.6014, + 1423.5802, + 1397.7942, + 1372.3042, + 1347.2062, + 1321.8402, + 1297.2292, + 1272.9462, + 1248.9926, + 1225.3026, + 1201.4252, + 1178.0578, + 1155.6092, + 1132.626, + 1110.5568, + 1088.527, + 1066.5154, + 1045.1874, + 1024.3878, + 1003.37, + 982.1972, + 962.5728, + 942.1012, + 922.9668, + 903.292, + 884.0772, + 864.8578, + 846.6562, + 828.041, + 809.714, + 792.3112, + 775.1806, + 757.9854, + 740.656, + 724.346, + 707.5154, + 691.8378, + 675.7448, + 659.6722, + 645.5722, + 630.1462, + 614.4124, + 600.8728, + 585.898, + 572.408, + 558.4926, + 544.4938, + 531.6776, + 517.282, + 505.7704, + 493.1012, + 480.7388, + 467.6876, + 456.1872, + 445.5048, + 433.0214, + 420.806, + 411.409, + 400.4144, + 389.4294, + 379.2286, 
+ 369.651, + 360.6156, + 350.337, + 342.083, + 332.1538, + 322.5094, + 315.01, + 305.6686, + 298.1678, + 287.8116, + 280.9978, + 271.9204, + 265.3286, + 257.5706, + 249.6014, + 242.544, + 235.5976, + 229.583, + 220.9438, + 214.672, + 208.2786, + 201.8628, + 195.1834, + 191.505, + 186.1816, + 178.5188, + 172.2294, + 167.8908, + 161.0194, + 158.052, + 151.4588, + 148.1596, + 143.4344, + 138.5238, + 133.13, + 127.6374, + 124.8162, + 118.7894, + 117.3984, + 114.6078, + 109.0858, + 105.1036, + 103.6258, + 98.6018000000004, + 95.7618000000002, + 93.5821999999998, + 88.5900000000001, + 86.9992000000002, + 82.8800000000001, + 80.4539999999997, + 74.6981999999998, + 74.3644000000004, + 73.2914000000001, + 65.5709999999999, + 66.9232000000002, + 65.1913999999997, + 62.5882000000001, + 61.5702000000001, + 55.7035999999998, + 56.1764000000003, + 52.7596000000003, + 53.0302000000001, + 49.0609999999997, + 48.4694, + 44.933, + 46.0474000000004, + 44.7165999999997, + 41.9416000000001, + 39.9207999999999, + 35.6328000000003, + 35.5276000000003, + 33.1934000000001, + 33.2371999999996, + 33.3864000000003, + 33.9228000000003, + 30.2371999999996, + 29.1373999999996, + 25.2272000000003, + 24.2942000000003, + 19.8338000000003, + 18.9005999999999, + 23.0907999999999, + 21.8544000000002, + 19.5176000000001, + 15.4147999999996, + 16.9314000000004, + 18.6737999999996, + 12.9877999999999, + 14.3688000000002, + 12.0447999999997, + 15.5219999999999, + 12.5299999999997, + 14.5940000000001, + 14.3131999999996, + 9.45499999999993, + 12.9441999999999, + 3.91139999999996, + 13.1373999999996, + 5.44720000000052, + 9.82779999999912, + 7.87279999999919, + 3.67760000000089, + 5.46980000000076, + 5.55099999999948, + 5.65979999999945, + 3.89439999999922, + 3.1275999999998, + 5.65140000000065, + 6.3062000000009, + 3.90799999999945, + 1.87060000000019, + 5.17020000000048, + 2.46680000000015, + 0.770000000000437, + -3.72340000000077, + 1.16400000000067, + 8.05340000000069, + 0.135399999999208, + 
2.15940000000046, + 0.766999999999825, + 1.0594000000001, + 3.15500000000065, + -0.287399999999252, + 2.37219999999979, + -2.86620000000039, + -1.63199999999961, + -2.22979999999916, + -0.15519999999924, + -1.46039999999994, + -0.262199999999211, + -2.34460000000036, + -2.8078000000005, + -3.22179999999935, + -5.60159999999996, + -8.42200000000048, + -9.43740000000071, + 0.161799999999857, + -10.4755999999998, + -10.0823999999993}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p12{2953.0, + 2900.4782, + 2848.3568, + 2796.3666, + 2745.324, + 2694.9598, + 2644.648, + 2595.539, + 2546.1474, + 2498.2576, + 2450.8376, + 2403.6076, + 2357.451, + 2311.38, + 2266.4104, + 2221.5638, + 2176.9676, + 2134.193, + 2090.838, + 2048.8548, + 2007.018, + 1966.1742, + 1925.4482, + 1885.1294, + 1846.4776, + 1807.4044, + 1768.8724, + 1731.3732, + 1693.4304, + 1657.5326, + 1621.949, + 1586.5532, + 1551.7256, + 1517.6182, + 1483.5186, + 1450.4528, + 1417.865, + 1385.7164, + 1352.6828, + 1322.6708, + 1291.8312, + 1260.9036, + 1231.476, + 1201.8652, + 1173.6718, + 1145.757, + 1119.2072, + 1092.2828, + 1065.0434, + 1038.6264, + 1014.3192, + 988.5746, + 965.0816, + 940.1176, + 917.9796, + 894.5576, + 871.1858, + 849.9144, + 827.1142, + 805.0818, + 783.9664, + 763.9096, + 742.0816, + 724.3962, + 706.3454, + 688.018, + 667.4214, + 650.3106, + 633.0686, + 613.8094, + 597.818, + 581.4248, + 563.834, + 547.363, + 531.5066, + 520.455400000001, + 505.583199999999, + 488.366, + 476.480799999999, + 459.7682, + 450.0522, + 434.328799999999, + 423.952799999999, + 408.727000000001, + 399.079400000001, + 387.252200000001, + 373.987999999999, + 360.852000000001, + 351.6394, + 339.642, + 330.902400000001, + 322.661599999999, + 311.662200000001, + 301.3254, + 291.7484, + 279.939200000001, + 276.7508, + 263.215200000001, + 254.811400000001, + 245.5494, + 242.306399999999, + 234.8734, + 223.787200000001, + 217.7156, + 212.0196, + 200.793, + 195.9748, + 189.0702, + 182.449199999999, + 177.2772, + 170.2336, + 164.741, + 
158.613600000001, + 155.311, + 147.5964, + 142.837, + 137.3724, + 132.0162, + 130.0424, + 121.9804, + 120.451800000001, + 114.8968, + 111.585999999999, + 105.933199999999, + 101.705, + 98.5141999999996, + 95.0488000000005, + 89.7880000000005, + 91.4750000000004, + 83.7764000000006, + 80.9698000000008, + 72.8574000000008, + 73.1615999999995, + 67.5838000000003, + 62.6263999999992, + 63.2638000000006, + 66.0977999999996, + 52.0843999999997, + 58.9956000000002, + 47.0912000000008, + 46.4956000000002, + 48.4383999999991, + 47.1082000000006, + 43.2392, + 37.2759999999998, + 40.0283999999992, + 35.1864000000005, + 35.8595999999998, + 32.0998, + 28.027, + 23.6694000000007, + 33.8266000000003, + 26.3736000000008, + 27.2008000000005, + 21.3245999999999, + 26.4115999999995, + 23.4521999999997, + 19.5013999999992, + 19.8513999999996, + 10.7492000000002, + 18.6424000000006, + 13.1265999999996, + 18.2436000000016, + 6.71860000000015, + 3.39459999999963, + 6.33759999999893, + 7.76719999999841, + 0.813999999998487, + 3.82819999999992, + 0.826199999999517, + 8.07440000000133, + -1.59080000000176, + 5.01780000000144, + 0.455399999998917, + -0.24199999999837, + 0.174800000000687, + -9.07640000000174, + -4.20160000000033, + -3.77520000000004, + -4.75179999999818, + -5.3724000000002, + -8.90680000000066, + -6.10239999999976, + -5.74120000000039, + -9.95339999999851, + -3.86339999999836, + -13.7304000000004, + -16.2710000000006, + -7.51359999999841, + -3.30679999999847, + -13.1339999999982, + -10.0551999999989, + -6.72019999999975, + -8.59660000000076, + -10.9307999999983, + -1.8775999999998, + -4.82259999999951, + -13.7788, + -21.6470000000008, + -10.6735999999983, + -15.7799999999988}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p13{5907.5052, 5802.2672, + 5697.347, 5593.5794, + 5491.2622, 5390.5514, + 5290.3376, 5191.6952, + 5093.5988, 4997.3552, + 4902.5972, 4808.3082, + 4715.5646, 4624.109, + 4533.8216, 4444.4344, + 4356.3802, 4269.2962, + 4183.3784, 4098.292, + 4014.79, 3932.4574, + 
3850.6036, 3771.2712, + 3691.7708, 3615.099, + 3538.1858, 3463.4746, + 3388.8496, 3315.6794, + 3244.5448, 3173.7516, + 3103.3106, 3033.6094, + 2966.5642, 2900.794, + 2833.7256, 2769.81, + 2707.3196, 2644.0778, + 2583.9916, 2523.4662, + 2464.124, 2406.073, + 2347.0362, 2292.1006, + 2238.1716, 2182.7514, + 2128.4884, 2077.1314, + 2025.037, 1975.3756, + 1928.933, 1879.311, + 1831.0006, 1783.2144, + 1738.3096, 1694.5144, + 1649.024, 1606.847, + 1564.7528, 1525.3168, + 1482.5372, 1443.9668, + 1406.5074, 1365.867, + 1329.2186, 1295.4186, + 1257.9716, 1225.339, + 1193.2972, 1156.3578, + 1125.8686, 1091.187, + 1061.4094, 1029.4188, + 1000.9126, 972.3272, + 944.004199999999, 915.7592, + 889.965, 862.834200000001, + 840.4254, 812.598399999999, + 785.924200000001, 763.050999999999, + 741.793799999999, 721.466, + 699.040799999999, 677.997200000002, + 649.866999999998, 634.911800000002, + 609.8694, 591.981599999999, + 570.2922, 557.129199999999, + 538.3858, 521.872599999999, + 502.951400000002, 495.776399999999, + 475.171399999999, 459.751, + 439.995200000001, 426.708999999999, + 413.7016, 402.3868, + 387.262599999998, 372.0524, + 357.050999999999, 342.5098, + 334.849200000001, 322.529399999999, + 311.613799999999, 295.848000000002, + 289.273000000001, 274.093000000001, + 263.329600000001, 251.389599999999, + 245.7392, 231.9614, + 229.7952, 217.155200000001, + 208.9588, 199.016599999999, + 190.839199999999, 180.6976, + 176.272799999999, 166.976999999999, + 162.5252, 151.196400000001, + 149.386999999999, 133.981199999998, + 130.0586, 130.164000000001, + 122.053400000001, 110.7428, + 108.1276, 106.232400000001, + 100.381600000001, 98.7668000000012, + 86.6440000000002, 79.9768000000004, + 82.4722000000002, 68.7026000000005, + 70.1186000000016, 71.9948000000004, + 58.998599999999, 59.0492000000013, + 56.9818000000014, 47.5338000000011, + 42.9928, 51.1591999999982, + 37.2740000000013, 42.7220000000016, + 31.3734000000004, 26.8090000000011, + 25.8934000000008, 26.5286000000015, + 
29.5442000000003, 19.3503999999994, + 26.0760000000009, 17.9527999999991, + 14.8419999999969, 10.4683999999979, + 8.65899999999965, 9.86720000000059, + 4.34139999999752, -0.907800000000861, + -3.32080000000133, -0.936199999996461, + -11.9916000000012, -8.87000000000262, + -6.33099999999831, -11.3366000000024, + -15.9207999999999, -9.34659999999712, + -15.5034000000014, -19.2097999999969, + -15.357799999998, -28.2235999999975, + -30.6898000000001, -19.3271999999997, + -25.6083999999973, -24.409599999999, + -13.6385999999984, -33.4473999999973, + -32.6949999999997, -28.9063999999998, + -31.7483999999968, -32.2935999999972, + -35.8329999999987, -47.620600000002, + -39.0855999999985, -33.1434000000008, + -46.1371999999974, -37.5892000000022, + -46.8164000000033, -47.3142000000007, + -60.2914000000019, -37.7575999999972}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p14{ + 11816.475, 11605.0046, 11395.3792, 11188.7504, 10984.1814, + 10782.0086, 10582.0072, 10384.503, 10189.178, 9996.2738, + 9806.0344, 9617.9798, 9431.394, 9248.7784, 9067.6894, + 8889.6824, 8712.9134, 8538.8624, 8368.4944, 8197.7956, + 8031.8916, 7866.6316, 7703.733, 7544.5726, 7386.204, + 7230.666, 7077.8516, 6926.7886, 6778.6902, 6631.9632, + 6487.304, 6346.7486, 6206.4408, 6070.202, 5935.2576, + 5799.924, 5671.0324, 5541.9788, 5414.6112, 5290.0274, + 5166.723, 5047.6906, 4929.162, 4815.1406, 4699.127, + 4588.5606, 4477.7394, 4369.4014, 4264.2728, 4155.9224, + 4055.581, 3955.505, 3856.9618, 3761.3828, 3666.9702, + 3575.7764, 3482.4132, 3395.0186, 3305.8852, 3221.415, + 3138.6024, 3056.296, 2970.4494, 2896.1526, 2816.8008, + 2740.2156, 2670.497, 2594.1458, 2527.111, 2460.8168, + 2387.5114, 2322.9498, 2260.6752, 2194.2686, 2133.7792, + 2074.767, 2015.204, 1959.4226, 1898.6502, 1850.006, + 1792.849, 1741.4838, 1687.9778, 1638.1322, 1589.3266, + 1543.1394, 1496.8266, 1447.8516, 1402.7354, 1361.9606, + 1327.0692, 1285.4106, 1241.8112, 1201.6726, 1161.973, + 1130.261, 1094.2036, 1048.2036, 1020.6436, 
990.901400000002, + 961.199800000002, 924.769800000002, 899.526400000002, 872.346400000002, 834.375, + 810.432000000001, 780.659800000001, 756.013800000001, 733.479399999997, 707.923999999999, + 673.858, 652.222399999999, 636.572399999997, 615.738599999997, 586.696400000001, + 564.147199999999, 541.679600000003, 523.943599999999, 505.714599999999, 475.729599999999, + 461.779600000002, 449.750800000002, 439.020799999998, 412.7886, 400.245600000002, + 383.188199999997, 362.079599999997, 357.533799999997, 334.319000000003, 327.553399999997, + 308.559399999998, 291.270199999999, 279.351999999999, 271.791400000002, 252.576999999997, + 247.482400000001, 236.174800000001, 218.774599999997, 220.155200000001, 208.794399999999, + 201.223599999998, 182.995600000002, 185.5268, 164.547400000003, 176.5962, + 150.689599999998, 157.8004, 138.378799999999, 134.021200000003, 117.614399999999, + 108.194000000003, 97.0696000000025, 89.6042000000016, 95.6030000000028, 84.7810000000027, + 72.635000000002, 77.3482000000004, 59.4907999999996, 55.5875999999989, 50.7346000000034, + 61.3916000000027, 50.9149999999936, 39.0384000000049, 58.9395999999979, 29.633600000001, + 28.2032000000036, 26.0078000000067, 17.0387999999948, 9.22000000000116, 13.8387999999977, + 8.07240000000456, 14.1549999999988, 15.3570000000036, 3.42660000000615, 6.24820000000182, + -2.96940000000177, -8.79940000000352, -5.97860000000219, -14.4048000000039, -3.4143999999942, + -13.0148000000045, -11.6977999999945, -25.7878000000055, -22.3185999999987, -24.409599999999, + -31.9756000000052, -18.9722000000038, -22.8678000000073, -30.8972000000067, -32.3715999999986, + -22.3907999999938, -43.6720000000059, -35.9038, -39.7492000000057, -54.1641999999993, + -45.2749999999942, -42.2989999999991, -44.1089999999967, -64.3564000000042, -49.9551999999967, + -42.6116000000038}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p15{ + 23634.0036, 23210.8034, 22792.4744, 22379.1524, + 21969.7928, 21565.326, 21165.3532, 20770.2806, + 20379.9892, 
19994.7098, 19613.318, 19236.799, + 18865.4382, 18498.8244, 18136.5138, 17778.8668, + 17426.2344, 17079.32, 16734.778, 16397.2418, + 16063.3324, 15734.0232, 15409.731, 15088.728, + 14772.9896, 14464.1402, 14157.5588, 13855.5958, + 13559.3296, 13264.9096, 12978.326, 12692.0826, + 12413.8816, 12137.3192, 11870.2326, 11602.5554, + 11340.3142, 11079.613, 10829.5908, 10583.5466, + 10334.0344, 10095.5072, 9859.694, 9625.2822, + 9395.7862, 9174.0586, 8957.3164, 8738.064, + 8524.155, 8313.7396, 8116.9168, 7913.542, + 7718.4778, 7521.65, 7335.5596, 7154.2906, + 6968.7396, 6786.3996, 6613.236, 6437.406, + 6270.6598, 6107.7958, 5945.7174, 5787.6784, + 5635.5784, 5482.308, 5337.9784, 5190.0864, + 5045.9158, 4919.1386, 4771.817, 4645.7742, + 4518.4774, 4385.5454, 4262.6622, 4142.74679999999, + 4015.5318, 3897.9276, 3790.7764, 3685.13800000001, + 3573.6274, 3467.9706, 3368.61079999999, 3271.5202, + 3170.3848, 3076.4656, 2982.38400000001, 2888.4664, + 2806.4868, 2711.9564, 2634.1434, 2551.3204, + 2469.7662, 2396.61139999999, 2318.9902, 2243.8658, + 2171.9246, 2105.01360000001, 2028.8536, 1960.9952, + 1901.4096, 1841.86079999999, 1777.54700000001, 1714.5802, + 1654.65059999999, 1596.311, 1546.2016, 1492.3296, + 1433.8974, 1383.84600000001, 1339.4152, 1293.5518, + 1245.8686, 1193.50659999999, 1162.27959999999, 1107.19439999999, + 1069.18060000001, 1035.09179999999, 999.679000000004, 957.679999999993, + 925.300199999998, 888.099400000006, 848.638600000006, 818.156400000007, + 796.748399999997, 752.139200000005, 725.271200000003, 692.216, + 671.633600000001, 647.939799999993, 621.670599999998, 575.398799999995, + 561.226599999995, 532.237999999998, 521.787599999996, 483.095799999996, + 467.049599999998, 465.286399999997, 415.548599999995, 401.047399999996, + 380.607999999993, 377.362599999993, 347.258799999996, 338.371599999999, + 310.096999999994, 301.409199999995, 276.280799999993, 265.586800000005, + 258.994399999996, 223.915999999997, 215.925399999993, 213.503800000006, + 
191.045400000003, 166.718200000003, 166.259000000005, 162.941200000001, + 148.829400000002, 141.645999999993, 123.535399999993, 122.329800000007, + 89.473399999988, 80.1962000000058, 77.5457999999926, 59.1056000000099, + 83.3509999999951, 52.2906000000075, 36.3979999999865, 40.6558000000077, + 42.0003999999899, 19.6630000000005, 19.7153999999864, -8.38539999999921, + -0.692799999989802, 0.854800000000978, 3.23219999999856, -3.89040000000386, + -5.25880000001052, -24.9052000000083, -22.6837999999989, -26.4286000000138, + -34.997000000003, -37.0216000000073, -43.430400000012, -58.2390000000014, + -68.8034000000043, -56.9245999999985, -57.8583999999973, -77.3097999999882, + -73.2793999999994, -81.0738000000129, -87.4530000000086, -65.0254000000132, + -57.296399999992, -96.2746000000043, -103.25, -96.081600000005, + -91.5542000000132, -102.465200000006, -107.688599999994, -101.458000000013, + -109.715800000005}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p16{ + 47270.0, 46423.3584, 45585.7074, 44757.152, 43938.8416, + 43130.9514, 42330.03, 41540.407, 40759.6348, 39988.206, + 39226.5144, 38473.2096, 37729.795, 36997.268, 36272.6448, + 35558.665, 34853.0248, 34157.4472, 33470.5204, 32793.5742, + 32127.0194, 31469.4182, 30817.6136, 30178.6968, 29546.8908, + 28922.8544, 28312.271, 27707.0924, 27114.0326, 26526.692, + 25948.6336, 25383.7826, 24823.5998, 24272.2974, 23732.2572, + 23201.4976, 22674.2796, 22163.6336, 21656.515, 21161.7362, + 20669.9368, 20189.4424, 19717.3358, 19256.3744, 18795.9638, + 18352.197, 17908.5738, 17474.391, 17052.918, 16637.2236, + 16228.4602, 15823.3474, 15428.6974, 15043.0284, 14667.6278, + 14297.4588, 13935.2882, 13578.5402, 13234.6032, 12882.1578, + 12548.0728, 12219.231, 11898.0072, 11587.2626, 11279.9072, + 10973.5048, 10678.5186, 10392.4876, 10105.2556, 9825.766, + 9562.5444, 9294.2222, 9038.2352, 8784.848, 8533.2644, + 8301.7776, 8058.30859999999, 7822.94579999999, 7599.11319999999, 7366.90779999999, + 7161.217, 6957.53080000001, 6736.212, 
6548.21220000001, 6343.06839999999, + 6156.28719999999, 5975.15419999999, 5791.75719999999, 5621.32019999999, 5451.66, + 5287.61040000001, 5118.09479999999, 4957.288, 4798.4246, 4662.17559999999, + 4512.05900000001, 4364.68539999999, 4220.77720000001, 4082.67259999999, 3957.19519999999, + 3842.15779999999, 3699.3328, 3583.01180000001, 3473.8964, 3338.66639999999, + 3233.55559999999, 3117.799, 3008.111, 2909.69140000001, 2814.86499999999, + 2719.46119999999, 2624.742, 2532.46979999999, 2444.7886, 2370.1868, + 2272.45259999999, 2196.19260000001, 2117.90419999999, 2023.2972, 1969.76819999999, + 1885.58979999999, 1833.2824, 1733.91200000001, 1682.54920000001, 1604.57980000001, + 1556.11240000001, 1491.3064, 1421.71960000001, 1371.22899999999, 1322.1324, + 1264.7892, 1196.23920000001, 1143.8474, 1088.67240000001, 1073.60380000001, + 1023.11660000001, 959.036400000012, 927.433199999999, 906.792799999996, 853.433599999989, + 841.873800000001, 791.1054, 756.899999999994, 704.343200000003, 672.495599999995, + 622.790399999998, 611.254799999995, 567.283200000005, 519.406599999988, 519.188400000014, + 495.312800000014, 451.350799999986, 443.973399999988, 431.882199999993, 392.027000000002, + 380.924200000009, 345.128999999986, 298.901400000002, 287.771999999997, 272.625, + 247.253000000026, 222.490600000019, 223.590000000026, 196.407599999977, 176.425999999978, + 134.725199999986, 132.4804, 110.445599999977, 86.7939999999944, 56.7038000000175, + 64.915399999998, 38.3726000000024, 37.1606000000029, 46.170999999973, 49.1716000000015, + 15.3362000000197, 6.71639999997569, -34.8185999999987, -39.4476000000141, 12.6830000000191, + -12.3331999999937, -50.6565999999875, -59.9538000000175, -65.1054000000004, -70.7576000000117, + -106.325200000021, -126.852200000023, -110.227599999984, -132.885999999999, -113.897200000007, + -142.713800000027, -151.145399999979, -150.799200000009, -177.756200000003, -156.036399999983, + -182.735199999996, -177.259399999981, -198.663600000029, 
-174.577600000019, -193.84580000001}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p17{ + 94541.0, 92848.811, 91174.019, 89517.558, 87879.9705, + 86262.7565, 84663.5125, 83083.7435, 81521.7865, 79977.272, + 78455.9465, 76950.219, 75465.432, 73994.152, 72546.71, + 71115.2345, 69705.6765, 68314.937, 66944.2705, 65591.255, + 64252.9485, 62938.016, 61636.8225, 60355.592, 59092.789, + 57850.568, 56624.518, 55417.343, 54231.1415, 53067.387, + 51903.526, 50774.649, 49657.6415, 48561.05, 47475.7575, + 46410.159, 45364.852, 44327.053, 43318.4005, 42325.6165, + 41348.4595, 40383.6265, 39436.77, 38509.502, 37594.035, + 36695.939, 35818.6895, 34955.691, 34115.8095, 33293.949, + 32465.0775, 31657.6715, 30877.2585, 30093.78, 29351.3695, + 28594.1365, 27872.115, 27168.7465, 26477.076, 25774.541, + 25106.5375, 24452.5135, 23815.5125, 23174.0655, 22555.2685, + 21960.2065, 21376.3555, 20785.1925, 20211.517, 19657.0725, + 19141.6865, 18579.737, 18081.3955, 17578.995, 17073.44, + 16608.335, 16119.911, 15651.266, 15194.583, 14749.0495, + 14343.4835, 13925.639, 13504.509, 13099.3885, 12691.2855, + 12328.018, 11969.0345, 11596.5145, 11245.6355, 10917.6575, + 10580.9785, 10277.8605, 9926.58100000001, 9605.538, 9300.42950000003, + 8989.97850000003, 8728.73249999998, 8448.3235, 8175.31050000002, 7898.98700000002, + 7629.79100000003, 7413.76199999999, 7149.92300000001, 6921.12650000001, 6677.1545, + 6443.28000000003, 6278.23450000002, 6014.20049999998, 5791.20299999998, 5605.78450000001, + 5438.48800000001, 5234.2255, 5059.6825, 4887.43349999998, 4682.935, + 4496.31099999999, 4322.52250000002, 4191.42499999999, 4021.24200000003, 3900.64799999999, + 3762.84250000003, 3609.98050000001, 3502.29599999997, 3363.84250000003, 3206.54849999998, + 3079.70000000001, 2971.42300000001, 2867.80349999998, 2727.08100000001, 2630.74900000001, + 2496.6165, 2440.902, 2356.19150000002, 2235.58199999999, 2120.54149999999, + 2012.25449999998, 1933.35600000003, 1820.93099999998, 1761.54800000001, 1663.09350000002, + 
1578.84600000002, 1509.48149999999, 1427.3345, 1379.56150000001, 1306.68099999998, + 1212.63449999999, 1084.17300000001, 1124.16450000001, 1060.69949999999, 1007.48849999998, + 941.194499999983, 879.880500000028, 836.007500000007, 782.802000000025, 748.385499999975, + 647.991500000004, 626.730500000005, 570.776000000013, 484.000500000024, 513.98550000001, + 418.985499999952, 386.996999999974, 370.026500000036, 355.496999999974, 356.731499999994, + 255.92200000002, 259.094000000041, 205.434499999974, 165.374500000034, 197.347500000033, + 95.718499999959, 67.6165000000037, 54.6970000000438, 31.7395000000251, -15.8784999999916, + 8.42500000004657, -26.3754999999655, -118.425500000012, -66.6629999999423, -42.9745000000112, + -107.364999999991, -189.839000000036, -162.611499999999, -164.964999999967, -189.079999999958, + -223.931499999948, -235.329999999958, -269.639500000048, -249.087999999989, -206.475499999942, + -283.04449999996, -290.667000000016, -304.561499999953, -336.784499999951, -380.386500000022, + -283.280499999993, -364.533000000054, -389.059499999974, -364.454000000027, -415.748000000021, + -417.155000000028}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p18{189083.0, + 185696.913, + 182348.774, + 179035.946, + 175762.762, + 172526.444, + 169329.754, + 166166.099, + 163043.269, + 159958.91, + 156907.912, + 153906.845, + 150924.199, + 147996.568, + 145093.457, + 142239.233, + 139421.475, + 136632.27, + 133889.588, + 131174.2, + 128511.619, + 125868.621, + 123265.385, + 120721.061, + 118181.769, + 115709.456, + 113252.446, + 110840.198, + 108465.099, + 106126.164, + 103823.469, + 101556.618, + 99308.004, + 97124.508, + 94937.803, + 92833.731, + 90745.061, + 88677.627, + 86617.47, + 84650.442, + 82697.833, + 80769.132, + 78879.629, + 77014.432, + 75215.626, + 73384.587, + 71652.482, + 69895.93, + 68209.301, + 66553.669, + 64921.981, + 63310.323, + 61742.115, + 60205.018, + 58698.658, + 57190.657, + 55760.865, + 54331.169, + 52908.167, + 51550.273, + 50225.254, + 
48922.421, + 47614.533, + 46362.049, + 45098.569, + 43926.083, + 42736.03, + 41593.473, + 40425.26, + 39316.237, + 38243.651, + 37170.617, + 36114.609, + 35084.19, + 34117.233, + 33206.509, + 32231.505, + 31318.728, + 30403.404, + 29540.0550000001, + 28679.236, + 27825.862, + 26965.216, + 26179.148, + 25462.08, + 24645.952, + 23922.523, + 23198.144, + 22529.128, + 21762.4179999999, + 21134.779, + 20459.117, + 19840.818, + 19187.04, + 18636.3689999999, + 17982.831, + 17439.7389999999, + 16874.547, + 16358.2169999999, + 15835.684, + 15352.914, + 14823.681, + 14329.313, + 13816.897, + 13342.874, + 12880.882, + 12491.648, + 12021.254, + 11625.392, + 11293.7610000001, + 10813.697, + 10456.209, + 10099.074, + 9755.39000000001, + 9393.18500000006, + 9047.57900000003, + 8657.98499999999, + 8395.85900000005, + 8033.0, + 7736.95900000003, + 7430.59699999995, + 7258.47699999996, + 6924.58200000005, + 6691.29399999999, + 6357.92500000005, + 6202.05700000003, + 5921.19700000004, + 5628.28399999999, + 5404.96799999999, + 5226.71100000001, + 4990.75600000005, + 4799.77399999998, + 4622.93099999998, + 4472.478, + 4171.78700000001, + 3957.46299999999, + 3868.95200000005, + 3691.14300000004, + 3474.63100000005, + 3341.67200000002, + 3109.14000000001, + 3071.97400000005, + 2796.40399999998, + 2756.17799999996, + 2611.46999999997, + 2471.93000000005, + 2382.26399999997, + 2209.22400000005, + 2142.28399999999, + 2013.96100000001, + 1911.18999999994, + 1818.27099999995, + 1668.47900000005, + 1519.65800000005, + 1469.67599999998, + 1367.13800000004, + 1248.52899999998, + 1181.23600000003, + 1022.71900000004, + 1088.20700000005, + 959.03600000008, + 876.095999999903, + 791.183999999892, + 703.337000000058, + 731.949999999953, + 586.86400000006, + 526.024999999907, + 323.004999999888, + 320.448000000091, + 340.672999999952, + 309.638999999966, + 216.601999999955, + 102.922999999952, + 19.2399999999907, + -0.114000000059605, + -32.6240000000689, + -89.3179999999702, + -153.497999999905, + 
-64.2970000000205, + -143.695999999996, + -259.497999999905, + -253.017999999924, + -213.948000000091, + -397.590000000084, + -434.006000000052, + -403.475000000093, + -297.958000000101, + -404.317000000039, + -528.898999999976, + -506.621000000043, + -513.205000000075, + -479.351000000024, + -596.139999999898, + -527.016999999993, + -664.681000000099, + -680.306000000099, + -704.050000000047, + -850.486000000034, + -757.43200000003, + -713.308999999892}; + +// Meta array storing interpolation points for biases for Precision=4..18 +__device__ static cuda::std::array constexpr bias_data{bias_data_p4.data(), + bias_data_p5.data(), + bias_data_p6.data(), + bias_data_p7.data(), + bias_data_p8.data(), + bias_data_p9.data(), + bias_data_p10.data(), + bias_data_p11.data(), + bias_data_p12.data(), + bias_data_p13.data(), + bias_data_p14.data(), + bias_data_p15.data(), + bias_data_p16.data(), + bias_data_p17.data(), + bias_data_p18.data()}; + +} // namespace cuco::hyperloglog_ns::detail \ No newline at end of file diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh new file mode 100644 index 000000000..16c7b46d3 --- /dev/null +++ b/include/cuco/distinct_count_estimator.cuh @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cuco { +template , + class Allocator = cuco::cuda_allocator> +class distinct_count_estimator { + using impl_type = detail::hyperloglog; + + public: + static constexpr auto thread_scope = impl_type::thread_scope; ///< CUDA thread scope + static constexpr auto precision = impl_type::precision; + + using allocator_type = typename impl_type::allocator_type; ///< Allocator type + using storage_type = typename impl_type::storage_type; + + template + using ref_type = cuco::distinct_count_estimator_ref; + + // TODO enable CTAD + constexpr distinct_count_estimator(cuco::cuda_thread_scope scope = {}, + Hash const& hash = {}, + Allocator const& alloc = {}, + cuco::cuda_stream_ref stream = {}); + + distinct_count_estimator(distinct_count_estimator const&) = delete; + distinct_count_estimator& operator=(distinct_count_estimator const&) = delete; + distinct_count_estimator(distinct_count_estimator&&) = default; + distinct_count_estimator& operator=(distinct_count_estimator&&) = default; + ~distinct_count_estimator() = default; + + void clear_async(cuco::cuda_stream_ref stream = {}) noexcept; + + void clear(cuco::cuda_stream_ref stream = {}); + + template + void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {}) noexcept; + + template + void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {}); + + template + void merge_async( + distinct_count_estimator const& other, + cuco::cuda_stream_ref stream = {}) noexcept; + + template + void merge(distinct_count_estimator const& other, + cuco::cuda_stream_ref stream = {}); + + template + void merge_async(ref_type const& other, cuco::cuda_stream_ref stream = {}) noexcept; + + template + void merge(ref_type const& other, cuco::cuda_stream_ref stream = {}); + + [[nodiscard]] std::size_t estimate(cuco::cuda_stream_ref stream = {}) const; + + [[nodiscard]] ref_type<> ref() const 
noexcept; + + private: + std::unique_ptr impl_; +}; +} // namespace cuco + +#include \ No newline at end of file diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh new file mode 100644 index 000000000..5787e3f47 --- /dev/null +++ b/include/cuco/distinct_count_estimator_ref.cuh @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include + +namespace cuco { +template +class distinct_count_estimator_ref { + using impl_type = detail::hyperloglog_ref; + + public: + static constexpr auto thread_scope = impl_type::thread_scope; ///< CUDA thread scope + static constexpr auto precision = impl_type::precision; + + using storage_type = typename impl_type::storage_type; + template + using with_scope = distinct_count_estimator_ref; + + // TODO let storage_type be inferred? 
+ __host__ __device__ constexpr distinct_count_estimator_ref( + storage_type& storage, + cuco::cuda_thread_scope scope = {}, + Hash const& hash = {}) noexcept; + + template + __device__ void clear(CG const& group) noexcept; + + __device__ void add(T const& item) noexcept; + + template + __device__ void merge( + CG const& group, + distinct_count_estimator_ref const& other) noexcept; + + [[nodiscard]] __device__ std::size_t estimate( + cooperative_groups::thread_block const& group) const noexcept; + + private: + impl_type impl_; +}; +} // namespace cuco + +#include \ No newline at end of file From 6718560ffab53e5e9dac7d20f7bd31a4f7b1dcff Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 24 Jan 2024 01:00:46 +0000 Subject: [PATCH 02/78] Code style --- include/cuco/detail/hyperloglog/hyperloglog.cuh | 6 +++--- include/cuco/detail/hyperloglog/storage.cuh | 3 ++- include/cuco/distinct_count_estimator.cuh | 6 +++--- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh index bd3871261..3d0dd6f29 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -55,11 +55,11 @@ class hyperloglog { this->clear_async(stream); // TODO async or sync? 
} - hyperloglog(hyperloglog const&) = delete; + hyperloglog(hyperloglog const&) = delete; hyperloglog& operator=(hyperloglog const&) = delete; hyperloglog(hyperloglog&&) = default; - hyperloglog& operator=(hyperloglog&&) = default; - ~hyperloglog() = default; + hyperloglog& operator=(hyperloglog&&) = default; + ~hyperloglog() = default; void clear_async(cuco::cuda_stream_ref stream) noexcept { diff --git a/include/cuco/detail/hyperloglog/storage.cuh b/include/cuco/detail/hyperloglog/storage.cuh index 195bdbe1c..effdc076a 100644 --- a/include/cuco/detail/hyperloglog/storage.cuh +++ b/include/cuco/detail/hyperloglog/storage.cuh @@ -20,5 +20,6 @@ namespace cuco::detail { template struct alignas(sizeof(int) * 4) hyperloglog_storage - : public cuda::std::array {}; + : public cuda::std::array { +}; } // namespace cuco::detail diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh index 16c7b46d3..1d5dde49d 100644 --- a/include/cuco/distinct_count_estimator.cuh +++ b/include/cuco/distinct_count_estimator.cuh @@ -51,11 +51,11 @@ class distinct_count_estimator { Allocator const& alloc = {}, cuco::cuda_stream_ref stream = {}); - distinct_count_estimator(distinct_count_estimator const&) = delete; + distinct_count_estimator(distinct_count_estimator const&) = delete; distinct_count_estimator& operator=(distinct_count_estimator const&) = delete; distinct_count_estimator(distinct_count_estimator&&) = default; - distinct_count_estimator& operator=(distinct_count_estimator&&) = default; - ~distinct_count_estimator() = default; + distinct_count_estimator& operator=(distinct_count_estimator&&) = default; + ~distinct_count_estimator() = default; void clear_async(cuco::cuda_stream_ref stream = {}) noexcept; From c59744e40a8b8654d87eb30fcb4ba9f4e99aed01 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 24 Jan 2024 01:02:30 +0000 Subject: [PATCH 03/78] Resolve merge conflicts --- 
include/cuco/detail/hyperloglog/kernels.cuh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh index 70064abcc..e84f49e40 100644 --- a/include/cuco/detail/hyperloglog/kernels.cuh +++ b/include/cuco/detail/hyperloglog/kernels.cuh @@ -23,16 +23,17 @@ #include namespace cuco::hyperloglog_ns::detail { +CUCO_SUPPRESS_KERNEL_WARNINGS template -__global__ void clear(RefType ref) +CUCO_KERNEL void clear(RefType ref) { auto const block = cooperative_groups::this_thread_block(); if (block.group_index().x == 0) { ref.clear(block); } } template -__global__ void add_shmem(InputIt first, cuco::detail::index_type n, RefType ref) +CUCO_KERNEL void add_shmem(InputIt first, cuco::detail::index_type n, RefType ref) { using local_ref_type = typename RefType::with_scope; @@ -56,7 +57,7 @@ __global__ void add_shmem(InputIt first, cuco::detail::index_type n, RefType ref } template -__global__ void merge(OtherRefType other_ref, RefType ref) +CUCO_KERNEL void merge(OtherRefType other_ref, RefType ref) { auto const block = cooperative_groups::this_thread_block(); if (block.group_index().x == 0) { ref.merge(block, other_ref); } @@ -64,7 +65,7 @@ __global__ void merge(OtherRefType other_ref, RefType ref) // TODO this kernel currently isn't being used template -__global__ void estimate(std::size_t* cardinality, RefType ref) +CUCO_KERNEL void estimate(std::size_t* cardinality, RefType ref) { auto const block = cooperative_groups::this_thread_block(); if (block.group_index().x == 0) { From b7533a0eeb309392740890f885ebf3d63e356b8c Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 24 Jan 2024 22:16:24 +0000 Subject: [PATCH 04/78] Initialize shmem atomics through placement new --- include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git 
a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index ba9333f95..ce66036eb 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -144,10 +144,9 @@ class hyperloglog_ref { __shared__ cuda::atomic block_zeroes; __shared__ std::size_t estimate; - // TODO is this needed? if (group.thread_rank() == 0) { - block_sum.store(0, cuda::std::memory_order_relaxed); - block_zeroes.store(0, cuda::std::memory_order_relaxed); + new (&block_sum) decltype(block_sum){0}; + new (&block_zeroes) decltype(block_zeroes){0}; } group.sync(); From f4bdac282ef415f672297db6353e510b4cf7d853 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 24 Jan 2024 22:20:09 +0000 Subject: [PATCH 05/78] Improve naming --- include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index ce66036eb..11fad856b 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -150,9 +150,6 @@ class hyperloglog_ref { } group.sync(); - // a warp - auto const tile = cooperative_groups::tiled_partition<32>(group); - fp_type thread_sum = 0; int thread_zeroes = 0; for (int i = group.thread_rank(); i < this->storage_.size(); i += group.size()) { @@ -161,11 +158,12 @@ class hyperloglog_ref { thread_zeroes += reg == 0; } - // CG reduce Z and V + // warp reduce Z and V + auto const warp = cooperative_groups::tiled_partition<32>(group); cooperative_groups::reduce_update_async( - tile, block_sum, thread_sum, cooperative_groups::plus()); + warp, block_sum, thread_sum, cooperative_groups::plus()); cooperative_groups::reduce_update_async( - tile, block_zeroes, thread_zeroes, cooperative_groups::plus()); + warp, 
block_zeroes, thread_zeroes, cooperative_groups::plus()); group.sync(); if (group.thread_rank() == 0) { From cea2afb36f1942f925501041463adda95348ae0f Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 24 Jan 2024 23:25:31 +0000 Subject: [PATCH 06/78] Move some functionality to storage class --- .../cuco/detail/hyperloglog/hyperloglog.cuh | 14 +-- .../detail/hyperloglog/hyperloglog_ref.cuh | 75 +------------ include/cuco/detail/hyperloglog/storage.cuh | 102 +++++++++++++++++- 3 files changed, 111 insertions(+), 80 deletions(-) diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh index 3d0dd6f29..1b9e0be15 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -35,14 +35,14 @@ class hyperloglog { static constexpr auto thread_scope = Scope; ///< CUDA thread scope static constexpr auto precision = Precision; + template + using ref_type = hyperloglog_ref; + using allocator_type = Allocator; ///< Allocator type - using storage_type = detail::hyperloglog_storage; + using storage_type = typename ref_type<>::storage_type; using storage_allocator_type = typename std::allocator_traits::template rebind_alloc; - template - using ref_type = hyperloglog_ref; - constexpr hyperloglog(cuco::cuda_thread_scope, Hash const& hash, Allocator const& alloc, @@ -55,11 +55,11 @@ class hyperloglog { this->clear_async(stream); // TODO async or sync? 
} - hyperloglog(hyperloglog const&) = delete; + hyperloglog(hyperloglog const&) = delete; hyperloglog& operator=(hyperloglog const&) = delete; hyperloglog(hyperloglog&&) = default; - hyperloglog& operator=(hyperloglog&&) = default; - ~hyperloglog() = default; + hyperloglog& operator=(hyperloglog&&) = default; + ~hyperloglog() = default; void clear_async(cuco::cuda_stream_ref stream) noexcept { diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index 11fad856b..5994748c5 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -35,7 +35,7 @@ class hyperloglog_ref { static constexpr auto thread_scope = Scope; ///< CUDA thread scope static constexpr auto precision = Precision; - using storage_type = hyperloglog_storage; + using storage_type = hyperloglog_dense_registers; template using with_scope = hyperloglog_ref; @@ -49,17 +49,7 @@ class hyperloglog_ref { template __device__ void clear(CG const& group) noexcept { - for (int i = group.thread_rank(); i < this->storage_.size(); i += group.size()) { - this->storage_[i] = 0; - } - - // TODO remove test code - // int4 constexpr empty{0, 0, 0, 0}; - // auto vec4 = reinterpret_cast(this->storage_.data()); - // // #pragma unroll 2 - // for (int i = group.thread_rank(); i < (this->storage_.size() / 4); i += group.size()) { - // vec4[i] = empty; - // } + this->storage_.clear(group); } __device__ void add(T const& item) noexcept @@ -70,71 +60,14 @@ class hyperloglog_ref { auto const reg = h & register_mask; auto const zeroes = cuda::std::countl_zero(h | register_mask) + 1; // __clz - if constexpr (Scope == cuda::thread_scope_thread) { - this->storage_[reg] = max(this->storage_[reg], zeroes); - } else if constexpr (Scope == cuda::thread_scope_block) { - atomicMax_block(&(this->storage_[reg]), zeroes); - } else if constexpr (Scope == cuda::thread_scope_device) { - 
atomicMax(&(this->storage_[reg]), zeroes); - } else if constexpr (Scope == cuda::thread_scope_system) { - atomicMax_system(&(this->storage_[reg]), zeroes); - } else { - static_assert(cuco::dependent_false, "Unsupported thread scope"); - } + this->storage_.update_max(reg, zeroes); } template __device__ void merge(CG const& group, hyperloglog_ref const& other) noexcept { - for (int i = group.thread_rank(); i < this->storage_.size(); i += group.size()) { - if constexpr (Scope == cuda::thread_scope_thread) { - this->storage_[i] = max(this->storage_[i], other.storage_[i]); - } else if constexpr (Scope == cuda::thread_scope_block) { - atomicMax_block(this->storage_.data() + i, other.storage_[i]); - } else if constexpr (Scope == cuda::thread_scope_device) { - atomicMax(this->storage_.data() + i, other.storage_[i]); - } else if constexpr (Scope == cuda::thread_scope_system) { - atomicMax_system(this->storage_.data() + i, other.storage_[i]); - } else { - static_assert(cuco::dependent_false, "Unsupported thread scope"); - } - } - - // TODO remove test code - /* - auto vec4 = reinterpret_cast(other.storage_.data()); - // #pragma unroll 2 - for (int i = group.thread_rank(); i < (this->storage_.size() / 4); i += group.size()) { - auto const items = vec4[i]; - if constexpr (Scope == cuda::thread_scope_thread) { - auto max_vec4 = reinterpret_cast(this->storage_.data()); - auto max_items = max_vec4[i]; - max_items.x = max(max_items.x, items.x); - max_items.y = max(max_items.y, items.y); - max_items.z = max(max_items.z, items.z); - max_items.w = max(max_items.w, items.w); - max_vec4[i] = max_items; - } else if constexpr (Scope == cuda::thread_scope_block) { - atomicMax_block(this->storage_.data() + (i * 4 + 0), items.x); - atomicMax_block(this->storage_.data() + (i * 4 + 1), items.y); - atomicMax_block(this->storage_.data() + (i * 4 + 2), items.z); - atomicMax_block(this->storage_.data() + (i * 4 + 3), items.w); - } else if constexpr (Scope == cuda::thread_scope_device) { - 
atomicMax(this->storage_.data() + (i * 4 + 0), items.x); - atomicMax(this->storage_.data() + (i * 4 + 1), items.y); - atomicMax(this->storage_.data() + (i * 4 + 2), items.z); - atomicMax(this->storage_.data() + (i * 4 + 3), items.w); - } else if constexpr (Scope == cuda::thread_scope_system) { - atomicMax_system(this->storage_.data() + (i * 4 + 0), items.x); - atomicMax_system(this->storage_.data() + (i * 4 + 1), items.y); - atomicMax_system(this->storage_.data() + (i * 4 + 2), items.z); - atomicMax_system(this->storage_.data() + (i * 4 + 3), items.w); - } else { - static_assert(cuco::dependent_false, "Unsupported thread scope"); - } - } - */ + this->storage_.merge(group, other.storage_); } [[nodiscard]] __device__ std::size_t estimate( diff --git a/include/cuco/detail/hyperloglog/storage.cuh b/include/cuco/detail/hyperloglog/storage.cuh index effdc076a..a1117fdfd 100644 --- a/include/cuco/detail/hyperloglog/storage.cuh +++ b/include/cuco/detail/hyperloglog/storage.cuh @@ -15,11 +15,109 @@ */ #pragma once +#include +#include + +#include #include namespace cuco::detail { + template -struct alignas(sizeof(int) * 4) hyperloglog_storage - : public cuda::std::array { +class hyperloglog_dense_registers { + public: + template + __device__ void constexpr clear(CG const& group) noexcept + { + for (int i = group.thread_rank(); i < this->registers_.size(); i += group.size()) { + this->registers_[i] = 0; + } + + // TODO remove test code + // int4 constexpr empty{0, 0, 0, 0}; + // auto vec4 = reinterpret_cast(this->storage_.data()); + // // #pragma unroll 2 + // for (int i = group.thread_rank(); i < (this->storage_.size() / 4); i += group.size()) { + // vec4[i] = empty; + // } + } + + __host__ __device__ constexpr int& operator[](std::size_t i) noexcept + { + return this->registers_[i]; + } + + __host__ __device__ constexpr int operator[](std::size_t i) const noexcept + { + return this->registers_[i]; + } + + __host__ __device__ constexpr std::size_t size() const noexcept + { + 
return this->registers_.size(); + } + + template + __device__ constexpr void update_max(std::size_t i, int value) noexcept + { + if constexpr (Scope == cuda::thread_scope_thread) { + this->registers_[i] = max(this->registers_[i], value); + } else if constexpr (Scope == cuda::thread_scope_block) { + atomicMax_block(&(this->registers_[i]), value); + } else if constexpr (Scope == cuda::thread_scope_device) { + atomicMax(&(this->registers_[i]), value); + } else if constexpr (Scope == cuda::thread_scope_system) { + atomicMax_system(&(this->registers_[i]), value); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } + + template + __device__ void constexpr merge(CG const& group, + hyperloglog_dense_registers const& other) noexcept + { + for (int i = group.thread_rank(); i < this->registers_.size(); i += group.size()) { + this->update_max(i, other.registers_[i]); + } + + // TODO remove test code + /* + auto vec4 = reinterpret_cast(other.storage_.data()); + // #pragma unroll 2 + for (int i = group.thread_rank(); i < (this->storage_.size() / 4); i += group.size()) { + auto const items = vec4[i]; + if constexpr (Scope == cuda::thread_scope_thread) { + auto max_vec4 = reinterpret_cast(this->storage_.data()); + auto max_items = max_vec4[i]; + max_items.x = max(max_items.x, items.x); + max_items.y = max(max_items.y, items.y); + max_items.z = max(max_items.z, items.z); + max_items.w = max(max_items.w, items.w); + max_vec4[i] = max_items; + } else if constexpr (Scope == cuda::thread_scope_block) { + atomicMax_block(this->storage_.data() + (i * 4 + 0), items.x); + atomicMax_block(this->storage_.data() + (i * 4 + 1), items.y); + atomicMax_block(this->storage_.data() + (i * 4 + 2), items.z); + atomicMax_block(this->storage_.data() + (i * 4 + 3), items.w); + } else if constexpr (Scope == cuda::thread_scope_device) { + atomicMax(this->storage_.data() + (i * 4 + 0), items.x); + atomicMax(this->storage_.data() + (i * 4 + 1), items.y); + 
atomicMax(this->storage_.data() + (i * 4 + 2), items.z); + atomicMax(this->storage_.data() + (i * 4 + 3), items.w); + } else if constexpr (Scope == cuda::thread_scope_system) { + atomicMax_system(this->storage_.data() + (i * 4 + 0), items.x); + atomicMax_system(this->storage_.data() + (i * 4 + 1), items.y); + atomicMax_system(this->storage_.data() + (i * 4 + 2), items.z); + atomicMax_system(this->storage_.data() + (i * 4 + 3), items.w); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } + */ + } + + private: + alignas(sizeof(int) * 4) cuda::std::array registers_; }; } // namespace cuco::detail From 0f0bd3fb6a8625a275268b2222ed20c65a6b264e Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:01:04 +0000 Subject: [PATCH 07/78] Add inline docs for public APIs --- include/cuco/detail/hyperloglog/finalizer.cuh | 19 +- .../cuco/detail/hyperloglog/hyperloglog.cuh | 179 +++++++++++++++--- .../detail/hyperloglog/hyperloglog_ref.cuh | 73 ++++++- include/cuco/detail/hyperloglog/kernels.cuh | 2 +- include/cuco/detail/hyperloglog/storage.cuh | 56 +++++- include/cuco/distinct_count_estimator.cuh | 143 +++++++++++++- include/cuco/distinct_count_estimator_ref.cuh | 60 +++++- 7 files changed, 484 insertions(+), 48 deletions(-) diff --git a/include/cuco/detail/hyperloglog/finalizer.cuh b/include/cuco/detail/hyperloglog/finalizer.cuh index 9f5c9a20d..3aca44fdf 100644 --- a/include/cuco/detail/hyperloglog/finalizer.cuh +++ b/include/cuco/detail/hyperloglog/finalizer.cuh @@ -20,12 +20,29 @@ #include namespace cuco::hyperloglog_ns::detail { + +/** + * @brief Estimate correction algorithm based on HyperLogLog++. 
+ * + * @note Variable names correspond to the definitions given in the HLL++ paper: + * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf + * + * @tparam Precision Tuning parameter to trade accuracy for runtime/memory footprint + */ template class finalizer { // this minimum number of registers is required by HLL++ static_assert(Precision >= 4, "Precision must be greater or equal to 4"); public: + /** + * @brief Compute the bias-corrected cardinality estimate. + * + * @param z Geometric mean of registers + * @param v Number of 0 registers + * + * @return Bias-corrected cardinality estimate + */ __host__ __device__ static double constexpr finalize(double z, int v) noexcept { auto e = alpha_mm() / z; @@ -50,7 +67,7 @@ class finalizer { } private: - static auto constexpr m = (1 << Precision); + static auto constexpr m = (1 << Precision); ///< Number of registers __host__ __device__ static double constexpr alpha_mm() noexcept { diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh index 1b9e0be15..3bb032105 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -29,20 +29,47 @@ #include namespace cuco::detail { +/** + * @brief A GPU-accelerated utility for approximating the number of distinct items in a multiset. + * + * @note This class implements the HyperLogLog/HyperLogLog++ algorithm: + * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf. + * @note The `Precision` parameter can be used to trade runtime/memory footprint for better + * accuracy. A higher value corresponds to a more accurate result, however, setting the precision + * too high will result in diminishing returns. 
+ * + * @tparam T Type of items to count + * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy + * @tparam Scope The scope in which operations will be performed by individual threads + * @tparam Hash Hash function used to hash items + * @tparam Allocator Type of allocator used for device storage + */ template class hyperloglog { public: - static constexpr auto thread_scope = Scope; ///< CUDA thread scope - static constexpr auto precision = Precision; + static constexpr auto thread_scope = Scope; ///< CUDA thread scope + static constexpr auto precision = Precision; ///< Precision template - using ref_type = hyperloglog_ref; - - using allocator_type = Allocator; ///< Allocator type - using storage_type = typename ref_type<>::storage_type; - using storage_allocator_type = - typename std::allocator_traits::template rebind_alloc; - + using ref_type = hyperloglog_ref; ///< Non-owning reference + ///< type + + using allocator_type = Allocator; ///< Allocator type + using storage_type = typename ref_type<>::storage_type; ///< Storage type + using storage_allocator_type = typename std::allocator_traits::template rebind_alloc< + storage_type>; ///< Storage allocator type + + /** + * @brief Constructs a `hyperloglog` host object. + * + * @note This function synchronizes the given stream. + * + * @param hash The hash function used to hash items + * @param alloc Allocator used for allocating device storage + * @param stream CUDA stream used to initialize the object + */ + // Doxygen cannot document unnamed parameter for scope, see + // https://github.com/doxygen/doxygen/issues/6926 constexpr hyperloglog(cuco::cuda_thread_scope, Hash const& hash, Allocator const& alloc, @@ -55,24 +82,56 @@ class hyperloglog { this->clear_async(stream); // TODO async or sync? 
} - hyperloglog(hyperloglog const&) = delete; - hyperloglog& operator=(hyperloglog const&) = delete; - hyperloglog(hyperloglog&&) = default; - hyperloglog& operator=(hyperloglog&&) = default; - ~hyperloglog() = default; + ~hyperloglog() = default; + hyperloglog(hyperloglog const&) = delete; + hyperloglog& operator=(hyperloglog const&) = delete; + hyperloglog(hyperloglog&&) = default; ///< Move constructor + + // TODO this is somehow required to pass the Doxygen check. + /** + * @brief Move-assignment operator. + * + * @return Reference to `*this` + */ + hyperloglog& operator=(hyperloglog&&) = default; + + /** + * @brief Asynchronously resets the estimator, i.e., clears the current count estimate. + * + * @param stream CUDA stream this operation is executed in + */ void clear_async(cuco::cuda_stream_ref stream) noexcept { auto constexpr block_size = 1024; cuco::hyperloglog_ns::detail::clear<<<1, block_size, 0, stream>>>(this->ref()); } + /** + * @brief Resets the estimator, i.e., clears the current count estimate. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `clear_async`. + * + * @param stream CUDA stream this operation is executed in + */ void clear(cuco::cuda_stream_ref stream) { this->clear_async(stream); stream.synchronize(); } + /** + * @brief Asynchronously adds to be counted items to the estimator. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * T> is `true` + * + * @param first Beginning of the sequence of items + * @param last End of the sequence of items + * @param stream CUDA stream this operation is executed in + */ template void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream) noexcept { @@ -83,7 +142,11 @@ class hyperloglog { int grid_size = 0; int block_size = 0; - // TODO check cuda error? + + // We make use of the occupancy calculator here to get the minimum number of blocks which still + // saturate the GPU.
This reduces the atomic contention on the final register array during the + // merge phase. + // TODO check cuda error or will it sync the stream?? cudaOccupancyMaxPotentialBlockSize( &grid_size, &block_size, &cuco::hyperloglog_ns::detail::add_shmem>); @@ -91,6 +154,20 @@ class hyperloglog { first, num_items, this->ref()); } + /** + * @brief Adds to be counted items to the estimator. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `add_async`. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * T> is `true` + * + * @param first Beginning of the sequence of items + * @param last End of the sequence of items + * @param stream CUDA stream this operation is executed in + */ template void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream) { @@ -98,35 +175,84 @@ class hyperloglog { stream.synchronize(); } + /** + * @brief Asynchronously merges the result of `other` estimator into `*this` estimator. + * + * @tparam OtherScope Thread scope of `other` estimator + * @tparam OtherAllocator Allocator type of `other` estimator + * + * @param other Other estimator to be merged into `*this` + * @param stream CUDA stream this operation is executed in + */ template void merge_async(hyperloglog const& other, - cuco::cuda_stream_ref stream = {}) noexcept + cuco::cuda_stream_ref stream) noexcept { this->merge_async(other.ref(), stream); } + /** + * @brief Merges the result of `other` estimator into `*this` estimator. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `merge_async`. 
+ * + * @tparam OtherScope Thread scope of `other` estimator + * @tparam OtherAllocator Allocator type of `other` estimator + * + * @param other Other estimator to be merged into `*this` + * @param stream CUDA stream this operation is executed in + */ template void merge(hyperloglog const& other, - cuco::cuda_stream_ref stream = {}) + cuco::cuda_stream_ref stream) { this->merge_async(other, stream); stream.synchronize(); } + /** + * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator. + * + * @tparam OtherScope Thread scope of `other` estimator + * + * @param other Other estimator reference to be merged into `*this` + * @param stream CUDA stream this operation is executed in + */ template - void merge_async(ref_type const& other, cuco::cuda_stream_ref stream = {}) noexcept + void merge_async(ref_type const& other, cuco::cuda_stream_ref stream) noexcept { auto constexpr block_size = 1024; cuco::hyperloglog_ns::detail::merge<<<1, block_size, 0, stream>>>(other, this->ref()); } + /** + * @brief Merges the result of `other` estimator reference into `*this` estimator. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `merge_async`. + * + * @tparam OtherScope Thread scope of `other` estimator + * + * @param other Other estimator reference to be merged into `*this` + * @param stream CUDA stream this operation is executed in + */ template - void merge(ref_type const& other, cuco::cuda_stream_ref stream = {}) + void merge(ref_type const& other, cuco::cuda_stream_ref stream) { this->merge_async(other, stream); stream.synchronize(); } + /** + * @brief Compute the estimated distinct items count. + * + * @note This function synchronizes the given stream. 
+ * + * @param stream CUDA stream this operation is executed in + * + * @return Approximate distinct items count + */ [[nodiscard]] std::size_t estimate(cuco::cuda_stream_ref stream) const { // TODO remove test code @@ -167,6 +293,11 @@ class hyperloglog { return cuco::hyperloglog_ns::detail::finalizer::finalize(sum, zeroes); } + /** + * @brief Get device ref. + * + * @return Device ref object of the current `hyperloglog` host object + */ [[nodiscard]] ref_type<> ref() const noexcept { return ref_type<>{*(this->storage_.get()), {}, this->hash_}; @@ -185,11 +316,13 @@ class hyperloglog { storage_allocator_type& allocator; }; - Hash hash_; - storage_allocator_type storage_allocator_; - storage_deleter storage_deleter_; - std::unique_ptr storage_; + Hash hash_; ///< Hash function used to hash items + storage_allocator_type storage_allocator_; ///< Storage allocator + storage_deleter storage_deleter_; ///< Storage deleter + std::unique_ptr storage_; ///< Storage + // Needs to be friends with other instantiations of this class template to have access to their + // storage template friend class hyperloglog; }; diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index 5994748c5..c6073a265 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -28,30 +28,66 @@ #include namespace cuco::detail { +/** + * @brief A GPU-accelerated utility for approximating the number of distinct items in a multiset. + * + * @note This class implements the HyperLogLog/HyperLogLog++ algorithm: + * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf. + * @note The `Precision` parameter can be used to trade runtime/memory footprint for better + * accuracy. A higher value corresponds to a more accurate result, however, setting the precision + * too high will result in diminishing returns.
+ * + * @tparam T Type of items to count + * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy + * @tparam Scope The scope in which operations will be performed by individual threads + * @tparam Hash Hash function used to hash items + */ template class hyperloglog_ref { public: - using fp_type = float; - static constexpr auto thread_scope = Scope; ///< CUDA thread scope - static constexpr auto precision = Precision; + using fp_type = float; ///< Floating point type used for reduction + static constexpr auto thread_scope = Scope; ///< CUDA thread scope + static constexpr auto precision = Precision; ///< Precision - using storage_type = hyperloglog_dense_registers; - template - using with_scope = hyperloglog_ref; + using storage_type = hyperloglog_dense_registers; ///< Storage type + template + using with_scope = hyperloglog_ref; ///< Ref type with different + ///< thread scope + + /** + * @brief Constructs a non-owning `hyperloglog_ref` object. + * + * @param storage Reference to storage object of type `storage_type` + * @param hash The hash function used to hash items + */ + // Doxygen cannot document unnamed parameter for scope, see + // https://github.com/doxygen/doxygen/issues/6926 __host__ __device__ constexpr hyperloglog_ref(storage_type& storage, - cuco::cuda_thread_scope = {}, - Hash const& hash = {}) noexcept + cuco::cuda_thread_scope, + Hash const& hash) noexcept : hash_{hash}, storage_{storage} { } + /** + * @brief Resets the estimator, i.e., clears the current count estimate. + * + * @tparam CG CUDA Cooperative Group type + * + * @param group CUDA Cooperative group this operation is executed in + */ template __device__ void clear(CG const& group) noexcept { this->storage_.clear(group); } + /** + * @brief Adds an item to the estimator. 
+ * + * @param item The item to be counted + */ __device__ void add(T const& item) noexcept { // static_assert NumBuckets is not too big @@ -63,6 +99,15 @@ class hyperloglog_ref { this->storage_.update_max(reg, zeroes); } + /** + * @brief Merges the result of `other` estimator reference into `*this` estimator reference. + * + * @tparam CG CUDA Cooperative Group type + * @tparam OtherScope Thread scope of `other` estimator + * + * @param group CUDA Cooperative group this operation is executed in + * @param other Other estimator reference to be merged into `*this` + */ template __device__ void merge(CG const& group, hyperloglog_ref const& other) noexcept @@ -70,6 +115,13 @@ class hyperloglog_ref { this->storage_.merge(group, other.storage_); } + /** + * @brief Compute the estimated distinct items count. + * + * @param group CUDA thread block group this operation is executed in + * + * @return Approximate distinct items count + */ [[nodiscard]] __device__ std::size_t estimate( cooperative_groups::thread_block const& group) const noexcept { @@ -110,8 +162,9 @@ class hyperloglog_ref { } private: - Hash hash_; - storage_type& storage_; // TODO is a reference the right choice here?? + Hash hash_; ///< Hash function used to hash items + // TODO is a reference the right choice here?? 
+ storage_type& storage_; ///< Reference to storage object template friend class hyperloglog_ref; diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh index e84f49e40..c7b9bc018 100644 --- a/include/cuco/detail/hyperloglog/kernels.cuh +++ b/include/cuco/detail/hyperloglog/kernels.cuh @@ -43,7 +43,7 @@ CUCO_KERNEL void add_shmem(InputIt first, cuco::detail::index_type n, RefType re auto idx = cuco::detail::global_thread_id(); auto const block = cooperative_groups::this_thread_block(); - local_ref_type local_ref(local_storage); + local_ref_type local_ref(local_storage, {}, {}); local_ref.clear(block); block.sync(); diff --git a/include/cuco/detail/hyperloglog/storage.cuh b/include/cuco/detail/hyperloglog/storage.cuh index a1117fdfd..fe0a4ff7a 100644 --- a/include/cuco/detail/hyperloglog/storage.cuh +++ b/include/cuco/detail/hyperloglog/storage.cuh @@ -23,9 +23,25 @@ namespace cuco::detail { +/** + * @brief Storage class for `hyperloglog` and `hyperloglog_ref`. + * + * @note This class implements the dense storage layout from the HyperLogLog++ paper, but uses + * 4bytes per register instead of only 6bits. This is required since we need to update registers + * atomically. + * + * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy + */ template class hyperloglog_dense_registers { public: + /** + * @brief Clears the storage. + * + * @tparam CG CUDA Cooperative Group type + * + * @param group CUDA Cooperative group this operation is executed in + */ template __device__ void constexpr clear(CG const& group) noexcept { @@ -42,21 +58,49 @@ class hyperloglog_dense_registers { // } } + /** + * @brief Returns a reference to the element at specified location `i`. No bounds checking is + * performed. 
+ * + * @param i Position of the element to return + * + * @return Reference to the requested element + */ __host__ __device__ constexpr int& operator[](std::size_t i) noexcept { return this->registers_[i]; } + /** + * @brief Returns the element at specified location `i`. No bounds checking is performed. + * + * @param i Position of the element to return + * + * @return Requested element + */ __host__ __device__ constexpr int operator[](std::size_t i) const noexcept { return this->registers_[i]; } + /** + * @brief Returns the number of elements in the container. + * + * @return The number of elements in the container + */ __host__ __device__ constexpr std::size_t size() const noexcept { return this->registers_.size(); } + /** + * @brief Atomically updates the register at position `i` with `max(reg[i], value)`. + * + * @tparam Scope CUDA thread scope + * + * @param i Register index + * @param value New value + */ template __device__ constexpr void update_max(std::size_t i, int value) noexcept { @@ -73,6 +117,15 @@ class hyperloglog_dense_registers { } } + /** + * @brief Combines the contents of `other` storage into `*this` storage. 
+ * + * @tparam Scope CUDA thread scope + * @tparam CG CUDA Cooperative Group type + * + * @param group CUDA Cooperative group this operation is executed in + * @param other Other storage + */ template __device__ void constexpr merge(CG const& group, hyperloglog_dense_registers const& other) noexcept @@ -118,6 +171,7 @@ class hyperloglog_dense_registers { } private: - alignas(sizeof(int) * 4) cuda::std::array registers_; + alignas(sizeof(int) * + 4) cuda::std::array registers_; ///< Register array storage }; } // namespace cuco::detail diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh index 1d5dde49d..5b18d7f40 100644 --- a/include/cuco/distinct_count_estimator.cuh +++ b/include/cuco/distinct_count_estimator.cuh @@ -27,6 +27,21 @@ #include namespace cuco { +/** + * @brief A GPU-accelerated utility for approximating the number of distinct items in a multiset. + * + * @note This implementation is based on the HyperLogLog++ algorithm: + * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf. + * @note The `Precision` parameter can be used to trade runtime/memory footprint for better + * accuracy. A higher value corresponds to a more accurate result, however, setting the precision + * too high will result in diminishing returns.
+ * + * @tparam T Type of items to count + * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy + * @tparam Scope The scope in which operations will be performed by individual threads + * @tparam Hash Hash function used to hash items + * @tparam Allocator Type of allocator used for device storage + */ template - using ref_type = cuco::distinct_count_estimator_ref; + using ref_type = + cuco::distinct_count_estimator_ref; ///< Non-owning reference + ///< type + + using allocator_type = typename impl_type::allocator_type; ///< Allocator type + using storage_type = typename impl_type::storage_type; ///< Storage type // TODO enable CTAD + /** + * @brief Constructs a `distinct_count_estimator` host object. + * + * @note This function synchronizes the given stream. + * + * @param scope The scope in which operations will be performed + * @param hash The hash function used to hash items + * @param alloc Allocator used for allocating device storage + * @param stream CUDA stream used to initialize the object + */ constexpr distinct_count_estimator(cuco::cuda_thread_scope scope = {}, Hash const& hash = {}, Allocator const& alloc = {}, cuco::cuda_stream_ref stream = {}); + ~distinct_count_estimator() = default; + distinct_count_estimator(distinct_count_estimator const&) = delete; distinct_count_estimator& operator=(distinct_count_estimator const&) = delete; - distinct_count_estimator(distinct_count_estimator&&) = default; + distinct_count_estimator(distinct_count_estimator&&) = default; ///< Move constructor + + // TODO this is somehow required to pass the Doxygen check. + /** + * @brief Move-assignment operator. + * + * @return Reference to `*this` + */ distinct_count_estimator& operator=(distinct_count_estimator&&) = default; - ~distinct_count_estimator() = default; + /** + * @brief Asynchronously resets the estimator, i.e., clears the current count estimate.
+ * + * @param stream CUDA stream this operation is executed in + */ void clear_async(cuco::cuda_stream_ref stream = {}) noexcept; + /** + * @brief Resets the estimator, i.e., clears the current count estimate. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `clear_async`. + * + * @param stream CUDA stream this operation is executed in + */ void clear(cuco::cuda_stream_ref stream = {}); + /** + * @brief Asynchronously adds to be counted items to the estimator. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * T> is `true` + * + * @param first Beginning of the sequence of items + * @param last End of the sequence of items + * @param stream CUDA stream this operation is executed in + */ template void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {}) noexcept; + /** + * @brief Adds to be counted items to the estimator. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `add_async`. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * T> is `true` + * + * @param first Beginning of the sequence of items + * @param last End of the sequence of items + * @param stream CUDA stream this operation is executed in + */ template void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {}); + /** + * @brief Asynchronously merges the result of `other` estimator into `*this` estimator. + * + * @tparam OtherScope Thread scope of `other` estimator + * @tparam OtherAllocator Allocator type of `other` estimator + * + * @param other Other estimator to be merged into `*this` + * @param stream CUDA stream this operation is executed in + */ template void merge_async( distinct_count_estimator const& other, cuco::cuda_stream_ref stream = {}) noexcept; + /** + * @brief Merges the result of `other` estimator into `*this` estimator. 
+ * + * @note This function synchronizes the given stream. For asynchronous execution use + * `merge_async`. + * + * @tparam OtherScope Thread scope of `other` estimator + * @tparam OtherAllocator Allocator type of `other` estimator + * + * @param other Other estimator to be merged into `*this` + * @param stream CUDA stream this operation is executed in + */ template void merge(distinct_count_estimator const& other, cuco::cuda_stream_ref stream = {}); + /** + * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator. + * + * @tparam OtherScope Thread scope of `other` estimator + * + * @param other Other estimator reference to be merged into `*this` + * @param stream CUDA stream this operation is executed in + */ template void merge_async(ref_type const& other, cuco::cuda_stream_ref stream = {}) noexcept; + /** + * @brief Merges the result of `other` estimator reference into `*this` estimator. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `merge_async`. + * + * @tparam OtherScope Thread scope of `other` estimator + * + * @param other Other estimator reference to be merged into `*this` + * @param stream CUDA stream this operation is executed in + */ template void merge(ref_type const& other, cuco::cuda_stream_ref stream = {}); + /** + * @brief Compute the estimated distinct items count. + * + * @note This function synchronizes the given stream. + * + * @param stream CUDA stream this operation is executed in + * + * @return Approximate distinct items count + */ [[nodiscard]] std::size_t estimate(cuco::cuda_stream_ref stream = {}) const; + /** + * @brief Get device ref. 
+ * + * @return Device ref object of the current `distinct_count_estimator` host object + */ [[nodiscard]] ref_type<> ref() const noexcept; private: - std::unique_ptr impl_; + std::unique_ptr impl_; ///< Implementation object }; } // namespace cuco diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh index 5787e3f47..a42671812 100644 --- a/include/cuco/distinct_count_estimator_ref.cuh +++ b/include/cuco/distinct_count_estimator_ref.cuh @@ -22,39 +22,91 @@ #include namespace cuco { +/** + * @brief A GPU-accelerated utility for approximating the number of distinct items in a multiset. + * + * @note This implementation is based on the HyperLogLog++ algorithm: + * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf. + * @note The `Precision` parameter can be used to trade runtime/memory footprint for better + * accuracy. A higher value corresponds to a more accurate result, however, setting the precision + * too high will result in diminishing returns. + * + * @tparam T Type of items to count + * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy + * @tparam Scope The scope in which operations will be performed by individual threads + * @tparam Hash Hash function used to hash items + */ template class distinct_count_estimator_ref { using impl_type = detail::hyperloglog_ref; public: static constexpr auto thread_scope = impl_type::thread_scope; ///< CUDA thread scope - static constexpr auto precision = impl_type::precision; + static constexpr auto precision = impl_type::precision; ///< Precision + + using storage_type = typename impl_type::storage_type; ///< Storage type - using storage_type = typename impl_type::storage_type; template - using with_scope = distinct_count_estimator_ref; + using with_scope = + distinct_count_estimator_ref; ///< Ref type with different thread + ///< scope // TODO let storage_type be inferred?
+ /** + * @brief Constructs a non-owning `distinct_count_estimator_ref` object. + * + * @param storage Reference to storage object of type `storage_type` + * @param scope The scope in which operations will be performed + * @param hash The hash function used to hash items + */ __host__ __device__ constexpr distinct_count_estimator_ref( storage_type& storage, cuco::cuda_thread_scope scope = {}, Hash const& hash = {}) noexcept; + /** + * @brief Resets the estimator, i.e., clears the current count estimate. + * + * @tparam CG CUDA Cooperative Group type + * + * @param group CUDA Cooperative group this operation is executed in + */ template __device__ void clear(CG const& group) noexcept; + /** + * @brief Adds an item to the estimator. + * + * @param item The item to be counted + */ __device__ void add(T const& item) noexcept; + /** + * @brief Merges the result of `other` estimator reference into `*this` estimator reference. + * + * @tparam CG CUDA Cooperative Group type + * @tparam OtherScope Thread scope of `other` estimator + * + * @param group CUDA Cooperative group this operation is executed in + * @param other Other estimator reference to be merged into `*this` + */ template __device__ void merge( CG const& group, distinct_count_estimator_ref const& other) noexcept; + /** + * @brief Compute the estimated distinct items count. 
+ * + * @param group CUDA thread block group this operation is executed in + * + * @return Approximate distinct items count + */ [[nodiscard]] __device__ std::size_t estimate( cooperative_groups::thread_block const& group) const noexcept; private: - impl_type impl_; + impl_type impl_; ///< Implementation object }; } // namespace cuco From 1c780c25b63f1fec2a789982c55dc00f8e45c95c Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 25 Jan 2024 22:06:38 +0000 Subject: [PATCH 08/78] Add benchmark --- benchmarks/CMakeLists.txt | 5 + benchmarks/distinct_count_estimator_bench.cu | 135 ++++++++++++++++++ benchmarks/utils.hpp | 2 + .../cuco/detail/hyperloglog/hyperloglog.cuh | 1 + .../detail/hyperloglog/hyperloglog_ref.cuh | 1 + include/cuco/distinct_count_estimator.cuh | 1 + include/cuco/distinct_count_estimator_ref.cuh | 1 + 7 files changed, 146 insertions(+) create mode 100644 benchmarks/distinct_count_estimator_bench.cu diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 6b03cb98c..da57a1055 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -84,3 +84,8 @@ ConfigureBench(DYNAMIC_MAP_BENCH # - hash function benchmarks ---------------------------------------------------------------------- ConfigureBench(HASH_BENCH hash_bench.cu) + +################################################################################################### +# - distinct_count_estimator benchmarks ----------------------------------------------------------- +ConfigureBench(DISTINCT_COUNT_ESTIMATOR_BENCH + distinct_count_estimator_bench.cu) diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu new file mode 100644 index 000000000..c52025c6c --- /dev/null +++ b/benchmarks/distinct_count_estimator_bench.cu @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +template +[[nodiscard]] std::size_t exact_distinct_count(InputIt first, InputIt last) +{ + // TODO don't use detail ns in user land + auto const num_items = cuco::detail::distance(first, last); + if (num_items == 0) { return 0; } + + auto set = cuco::static_set{num_items, cuco::empty_key{-1}}; + set.insert(first, last); + return set.size(); +} + +/** + * @brief A benchmark evaluating `cuco::distinct_count_estimator` end-to-end performance + */ +template +void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list) +{ + using T = typename Estimator::value_type; + + auto const num_items = state.get_int64_or_default("NumInputs", 1ull << 30); + + thrust::device_vector items(num_items); + + key_generator gen; + gen.generate(dist_from_state(state), items.begin(), items.end()); + + state.add_element_count(num_items); + state.add_global_memory_reads(num_items, "InputSize"); + + Estimator estimator; + estimator.add(items.begin(), items.end()); + + double estimated_cardinality = estimator.estimate(); + double const true_cardinality = exact_distinct_count(items.begin(), items.end()); + auto const relative_error = abs(true_cardinality - estimated_cardinality) / true_cardinality; + + auto& summ = state.add_summary("RelativeError"); + 
summ.set_string("hint", "RelErr"); + summ.set_string("short_name", "RelativeError"); + summ.set_string("description", "Relatve approximation error."); + summ.set_float64("value", relative_error); + + estimator.clear(); + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + estimator.clear_async({launch.get_stream()}); + + timer.start(); + estimator.add_async(items.begin(), items.end(), {launch.get_stream()}); + estimated_cardinality = estimator.estimate({launch.get_stream()}); + timer.stop(); + }); +} + +/** + * @brief A benchmark evaluating `cuco::distinct_count_estimator::add` performance + */ +template +void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list) +{ + using T = typename Estimator::value_type; + + auto const num_items = state.get_int64_or_default("NumInputs", 1ull << 30); + + thrust::device_vector items(num_items); + + key_generator gen; + gen.generate(dist_from_state(state), items.begin(), items.end()); + + state.add_element_count(num_items); + state.add_global_memory_reads(num_items, "InputSize"); + + Estimator estimator; + state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + estimator.clear_async({launch.get_stream()}); + + timer.start(); + estimator.add_async(items.begin(), items.end(), {launch.get_stream()}); + timer.stop(); + }); +} + +using ESTIMATOR_RANGE = nvbench::type_list, + cuco::distinct_count_estimator, + cuco::distinct_count_estimator, + cuco::distinct_count_estimator, + cuco::distinct_count_estimator, + cuco::distinct_count_estimator, + cuco::distinct_count_estimator, + cuco::distinct_count_estimator>; + +NVBENCH_BENCH_TYPES(distinct_count_estimator_e2e, + NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list)) + .set_name("distinct_count_estimator") + .set_type_axes_names({"Estimator", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE); + +NVBENCH_BENCH_TYPES(distinct_count_estimator_add, + 
NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list)) + .set_name("distinct_count_estimator::add") + .set_type_axes_names({"Estimator", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE); \ No newline at end of file diff --git a/benchmarks/utils.hpp b/benchmarks/utils.hpp index 392cafe06..97ca4988f 100644 --- a/benchmarks/utils.hpp +++ b/benchmarks/utils.hpp @@ -21,6 +21,8 @@ #include +#include // thread_scope + namespace cuco::benchmark { template diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh index 3bb032105..c969be259 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -55,6 +55,7 @@ class hyperloglog { ///< type using allocator_type = Allocator; ///< Allocator type + using value_type = typename ref_type<>::value_type; ///< Type of items to count using storage_type = typename ref_type<>::storage_type; ///< Storage type using storage_allocator_type = typename std::allocator_traits::template rebind_alloc< storage_type>; ///< Storage allocator type diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index c6073a265..2de123946 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -49,6 +49,7 @@ class hyperloglog_ref { static constexpr auto thread_scope = Scope; ///< CUDA thread scope static constexpr auto precision = Precision; ///< Precision + using value_type = T; ///< Type of items to count using storage_type = hyperloglog_dense_registers; ///< Storage type template diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh index 5b18d7f40..16b943ac9 100644 --- a/include/cuco/distinct_count_estimator.cuh +++ b/include/cuco/distinct_count_estimator.cuh @@ -59,6 +59,7 @@ class distinct_count_estimator { cuco::distinct_count_estimator_ref; ///< Non-owning 
reference ///< type + using value_type = typename impl_type::value_type; ///< Type of items to count using allocator_type = typename impl_type::allocator_type; ///< Allocator type using storage_type = typename impl_type::storage_type; ///< Storage type diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh index a42671812..d32b6c4e0 100644 --- a/include/cuco/distinct_count_estimator_ref.cuh +++ b/include/cuco/distinct_count_estimator_ref.cuh @@ -44,6 +44,7 @@ class distinct_count_estimator_ref { static constexpr auto thread_scope = impl_type::thread_scope; ///< CUDA thread scope static constexpr auto precision = impl_type::precision; ///< Precision + using value_type = typename impl_type::value_type; ///< Type of items to count using storage_type = typename impl_type::storage_type; ///< Storage type template From b478e010ebfc0d3a4875f003324b3a1d74036dc8 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 30 Jan 2024 12:08:52 +0000 Subject: [PATCH 09/78] Remove scope ctor parameter for now --- .../distinct_count_estimator.inl | 7 ++----- .../distinct_count_estimator_ref.inl | 6 ++---- include/cuco/detail/hyperloglog/hyperloglog.cuh | 11 +++-------- include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 6 +----- include/cuco/detail/hyperloglog/kernels.cuh | 2 +- include/cuco/distinct_count_estimator.cuh | 10 ++++------ include/cuco/distinct_count_estimator_ref.cuh | 7 ++----- 7 files changed, 15 insertions(+), 34 deletions(-) diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl index 7013bc956..413d7ee7b 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl @@ -18,11 +18,8 @@ namespace cuco { template constexpr 
distinct_count_estimator::distinct_count_estimator( - cuco::cuda_thread_scope scope, - Hash const& hash, - Allocator const& alloc, - cuco::cuda_stream_ref stream) - : impl_{std::make_unique(scope, hash, alloc, stream)} + Hash const& hash, Allocator const& alloc, cuco::cuda_stream_ref stream) + : impl_{std::make_unique(hash, alloc, stream)} { } diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl index 1359033d0..26fc9bd99 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl @@ -18,10 +18,8 @@ namespace cuco { template __host__ __device__ constexpr distinct_count_estimator_ref:: - distinct_count_estimator_ref(storage_type& storage, - cuco::cuda_thread_scope scope, - Hash const& hash) noexcept - : impl_{storage, scope, hash} + distinct_count_estimator_ref(storage_type& storage, Hash const& hash) noexcept + : impl_{storage, hash} { } diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh index c969be259..af303a921 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -69,12 +69,7 @@ class hyperloglog { * @param alloc Allocator used for allocating device storage * @param stream CUDA stream used to initialize the object */ - // Doxygen cannot document unnamed parameter for scope, see - // https://github.com/doxygen/doxygen/issues/6926 - constexpr hyperloglog(cuco::cuda_thread_scope, - Hash const& hash, - Allocator const& alloc, - cuco::cuda_stream_ref stream) + constexpr hyperloglog(Hash const& hash, Allocator const& alloc, cuco::cuda_stream_ref stream) : hash_{hash}, storage_allocator_{alloc}, storage_deleter_{storage_allocator_}, @@ -85,7 +80,7 @@ class hyperloglog { ~hyperloglog() = default; - 
hyperloglog(hyperloglog const&) = delete; + hyperloglog(hyperloglog const&) = delete; hyperloglog& operator=(hyperloglog const&) = delete; hyperloglog(hyperloglog&&) = default; ///< Move constructor @@ -301,7 +296,7 @@ class hyperloglog { */ [[nodiscard]] ref_type<> ref() const noexcept { - return ref_type<>{*(this->storage_.get()), {}, this->hash_}; + return ref_type<>{*(this->storage_.get()), this->hash_}; } private: diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index 2de123946..e41f47ef6 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -62,11 +62,7 @@ class hyperloglog_ref { * @param storage Reference to storage object of type `storage_type` * @param hash The hash function used to hash items */ - // Doxygen cannot document unnamed parameter for scope, see - // https://github.com/doxygen/doxygen/issues/6926 - __host__ __device__ constexpr hyperloglog_ref(storage_type& storage, - cuco::cuda_thread_scope, - Hash const& hash) noexcept + __host__ __device__ constexpr hyperloglog_ref(storage_type& storage, Hash const& hash) noexcept : hash_{hash}, storage_{storage} { } diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh index c7b9bc018..fd3a2a877 100644 --- a/include/cuco/detail/hyperloglog/kernels.cuh +++ b/include/cuco/detail/hyperloglog/kernels.cuh @@ -43,7 +43,7 @@ CUCO_KERNEL void add_shmem(InputIt first, cuco::detail::index_type n, RefType re auto idx = cuco::detail::global_thread_id(); auto const block = cooperative_groups::this_thread_block(); - local_ref_type local_ref(local_storage, {}, {}); + local_ref_type local_ref(local_storage, {}); local_ref.clear(block); block.sync(); diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh index 16b943ac9..5a9a16c85 100644 --- a/include/cuco/distinct_count_estimator.cuh +++ 
b/include/cuco/distinct_count_estimator.cuh @@ -69,19 +69,17 @@ class distinct_count_estimator { * * @note This function synchronizes the given stream. * - * @param scope The scope in which operations will be performed * @param hash The hash function used to hash items * @param alloc Allocator used for allocating device storage * @param stream CUDA stream used to initialize the object */ - constexpr distinct_count_estimator(cuco::cuda_thread_scope scope = {}, - Hash const& hash = {}, - Allocator const& alloc = {}, - cuco::cuda_stream_ref stream = {}); + constexpr distinct_count_estimator(Hash const& hash = {}, + Allocator const& alloc = {}, + cuco::cuda_stream_ref stream = {}); ~distinct_count_estimator() = default; - distinct_count_estimator(distinct_count_estimator const&) = delete; + distinct_count_estimator(distinct_count_estimator const&) = delete; distinct_count_estimator& operator=(distinct_count_estimator const&) = delete; distinct_count_estimator(distinct_count_estimator&&) = default; ///< Move constructor diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh index d32b6c4e0..256183082 100644 --- a/include/cuco/distinct_count_estimator_ref.cuh +++ b/include/cuco/distinct_count_estimator_ref.cuh @@ -57,13 +57,10 @@ class distinct_count_estimator_ref { * @brief Constructs a non-owning `distinct_count_estimator_ref` object. * * @param storage Reference to storage object of type `storage_type` - * @param scope The scope in which operations will be performed * @param hash The hash function used to hash items */ - __host__ __device__ constexpr distinct_count_estimator_ref( - storage_type& storage, - cuco::cuda_thread_scope scope = {}, - Hash const& hash = {}) noexcept; + __host__ __device__ constexpr distinct_count_estimator_ref(storage_type& storage, + Hash const& hash = {}) noexcept; /** * @brief Resets the estimator, i.e., clears the current count estimate. 
From e3d401a7970dc6851ac45ce9017eca64e3cc586f Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 30 Jan 2024 19:24:46 +0000 Subject: [PATCH 10/78] Update benchmark --- benchmarks/distinct_count_estimator_bench.cu | 42 ++++++++++++-------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu index c52025c6c..7ceb305b4 100644 --- a/benchmarks/distinct_count_estimator_bench.cu +++ b/benchmarks/distinct_count_estimator_bench.cu @@ -24,21 +24,26 @@ #include #include +#include #include +#include using namespace cuco::benchmark; using namespace cuco::utility; -template -[[nodiscard]] std::size_t exact_distinct_count(InputIt first, InputIt last) +template +[[nodiscard]] std::size_t exact_distinct_count(InputIt first, std::size_t n) { - // TODO don't use detail ns in user land - auto const num_items = cuco::detail::distance(first, last); - if (num_items == 0) { return 0; } + // TODO static_set currently only supports types up-to 8-bytes in size. + // Casting is valid since the keys generated are representable in int64_t. 
+ using T = std::int64_t; - auto set = cuco::static_set{num_items, cuco::empty_key{-1}}; - set.insert(first, last); + auto cast_iter = thrust::make_transform_iterator( + first, cuda::proclaim_return_type([] __device__(auto i) { return static_cast(i); })); + + auto set = cuco::static_set{n, 0.8, cuco::empty_key{-1}}; + set.insert(cast_iter, cast_iter + n); return set.size(); } @@ -50,7 +55,7 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list items(num_items); @@ -64,7 +69,7 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list(items.begin(), items.end()); + double const true_cardinality = exact_distinct_count(items.begin(), num_items); auto const relative_error = abs(true_cardinality - estimated_cardinality) / true_cardinality; auto& summ = state.add_summary("RelativeError"); @@ -93,7 +98,7 @@ void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list items(num_items); @@ -113,23 +118,26 @@ void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list, - cuco::distinct_count_estimator, - cuco::distinct_count_estimator, +using ESTIMATOR_RANGE = nvbench::type_list, cuco::distinct_count_estimator, cuco::distinct_count_estimator, - cuco::distinct_count_estimator, + cuco::distinct_count_estimator, cuco::distinct_count_estimator, - cuco::distinct_count_estimator>; + cuco::distinct_count_estimator, + cuco::distinct_count_estimator<__int128_t, 10>, + cuco::distinct_count_estimator<__int128_t, 11>, + cuco::distinct_count_estimator<__int128_t, 12>>; NVBENCH_BENCH_TYPES(distinct_count_estimator_e2e, NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list)) - .set_name("distinct_count_estimator") + .set_name("distinct_count_estimator_e2e") .set_type_axes_names({"Estimator", "Distribution"}) + .add_int64_power_of_two_axis("NumInputs", {28, 29, 30}) .set_max_noise(defaults::MAX_NOISE); NVBENCH_BENCH_TYPES(distinct_count_estimator_add, NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list)) - 
.set_name("distinct_count_estimator::add") + .set_name("distinct_count_estimator::add_async") .set_type_axes_names({"Estimator", "Distribution"}) + .add_int64_power_of_two_axis("NumInputs", {28, 29, 30}) .set_max_noise(defaults::MAX_NOISE); \ No newline at end of file From 56520a604eca7dda1e64a4d107a62b0310ffc676 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 31 Jan 2024 18:08:12 +0000 Subject: [PATCH 11/78] Select cg reduce impl based on nvcc version --- include/cuco/detail/__config | 9 +++++++++ .../detail/hyperloglog/hyperloglog_ref.cuh | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/include/cuco/detail/__config b/include/cuco/detail/__config index c083fec86..fd3b6fce4 100644 --- a/include/cuco/detail/__config +++ b/include/cuco/detail/__config @@ -49,4 +49,13 @@ #if defined(__SIZEOF_INT128__) #define CUCO_HAS_INT128 +#endif + +#if (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 8) +#define CUCO_HAS_CG_EXPERIMENTAL_REDUCE_UPDATE_ASYNC +#define _CG_ABI_EXPERIMENTAL +#endif + +#if (__CUDACC_VER_MAJOR__ >= 12) +#define CUCO_HAS_CG_REDUCE_UPDATE_ASYNC #endif \ No newline at end of file diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index e41f47ef6..d6f362c5f 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include #include @@ -142,10 +143,27 @@ class hyperloglog_ref { // warp reduce Z and V auto const warp = cooperative_groups::tiled_partition<32>(group); +#if defined(CUCO_HAS_CG_REDUCE_UPDATE_ASYNC) cooperative_groups::reduce_update_async( warp, block_sum, thread_sum, cooperative_groups::plus()); cooperative_groups::reduce_update_async( warp, block_zeroes, thread_zeroes, cooperative_groups::plus()); +#elif defined(CUCO_HAS_CG_EXPERIMENTAL_REDUCE_UPDATE_ASYNC) + 
cooperative_groups::experimental::reduce_update_async( + warp, block_sum, thread_sum, cooperative_groups::plus()); + cooperative_groups::experimental::reduce_update_async( + warp, block_zeroes, thread_zeroes, cooperative_groups::plus()); +#else + auto const warp_sum = + cooperative_groups::reduce(warp, thread_sum, cooperative_groups::plus()); + auto const warp_zeroes = + cooperative_groups::reduce(warp, thread_zeroes, cooperative_groups::plus()); + // TODO warp sync needed? + if (warp.thread_rank() == 0) { + block_sum.fetch_add(warp_sum, cuda::std::memory_order_relaxed); + block_zeroes.fetch_add(warp_zeroes, cuda::std::memory_order_relaxed); + } +#endif group.sync(); if (group.thread_rank() == 0) { From 367377228d5ce682e4c253ecbc6cd3bf44d1c9a4 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 31 Jan 2024 22:33:20 +0000 Subject: [PATCH 12/78] Re-format tuning header --- include/cuco/detail/hyperloglog/tuning.cuh | 2573 +------------------- 1 file changed, 35 insertions(+), 2538 deletions(-) diff --git a/include/cuco/detail/hyperloglog/tuning.cuh b/include/cuco/detail/hyperloglog/tuning.cuh index f49e43e24..4d4a69067 100644 --- a/include/cuco/detail/hyperloglog/tuning.cuh +++ b/include/cuco/detail/hyperloglog/tuning.cuh @@ -26,2552 +26,49 @@ namespace cuco::hyperloglog_ns::detail { #define CUCO_HLL_TUNING_ARR_DECL __device__ static cuda::std::array constexpr #endif -CUCO_HLL_TUNING_ARR_DECL thresholds{10.0, - 20.0, - 40.0, - 80.0, - 220.0, - 400.0, - 900.0, - 1800.0, - 3100.0, - 6500.0, - 15500.0, - 20000.0, - 50000.0, - 120000.0, - 350000.0}; +// clang-format off +CUCO_HLL_TUNING_ARR_DECL thresholds{10.0, 20.0, 40.0, 80.0, 220.0, 400.0, 900.0, 1800.0, 3100.0, 6500.0, 15500.0, 20000.0, 50000.0, 120000.0, 350000.0}; // HLL++ uses an interpolation method over the raw estimated cardinality to select the optimal bias. 
// Parameters/interpolation points taken from // https://docs.google.com/document/d/1gyjfMHy43U9OWBXxfaeG-3MjGzejW1dlpyMwEYAAWEI/mobilebasic -CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p4{ - 11.0, 11.717, 12.207, 12.7896, 13.2882, 13.8204, 14.3772, 14.9342, 15.5202, 16.161, - 16.7722, 17.4636, 18.0396, 18.6766, 19.3566, 20.0454, 20.7936, 21.4856, 22.2666, 22.9946, - 23.766, 24.4692, 25.3638, 26.0764, 26.7864, 27.7602, 28.4814, 29.433, 30.2926, 31.0664, - 31.9996, 32.7956, 33.5366, 34.5894, 35.5738, 36.2698, 37.3682, 38.0544, 39.2342, 40.0108, - 40.7966, 41.9298, 42.8704, 43.6358, 44.5194, 45.773, 46.6772, 47.6174, 48.4888, 49.3304, - 50.2506, 51.4996, 52.3824, 53.3078, 54.3984, 55.5838, 56.6618, 57.2174, 58.3514, 59.0802, - 60.1482, 61.0376, 62.3598, 62.8078, 63.9744, 64.914, 65.781, 67.1806, 68.0594, 68.8446, - 69.7928, 70.8248, 71.8324, 72.8598, 73.6246, 74.7014, 75.393, 76.6708, 77.2394}; -CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p5{ - 23.0, 23.1194, 23.8208, 24.2318, 24.77, 25.2436, 25.7774, 26.2848, 26.8224, - 27.3742, 27.9336, 28.503, 29.0494, 29.6292, 30.2124, 30.798, 31.367, 31.9728, - 32.5944, 33.217, 33.8438, 34.3696, 35.0956, 35.7044, 36.324, 37.0668, 37.6698, - 38.3644, 39.049, 39.6918, 40.4146, 41.082, 41.687, 42.5398, 43.2462, 43.857, - 44.6606, 45.4168, 46.1248, 46.9222, 47.6804, 48.447, 49.3454, 49.9594, 50.7636, - 51.5776, 52.331, 53.19, 53.9676, 54.7564, 55.5314, 56.4442, 57.3708, 57.9774, - 58.9624, 59.8796, 60.755, 61.472, 62.2076, 63.1024, 63.8908, 64.7338, 65.7728, - 66.629, 67.413, 68.3266, 69.1524, 70.2642, 71.1806, 72.0566, 72.9192, 73.7598, - 74.3516, 75.5802, 76.4386, 77.4916, 78.1524, 79.1892, 79.8414, 80.8798, 81.8376, - 82.4698, 83.7656, 84.331, 85.5914, 86.6012, 87.7016, 88.5582, 89.3394, 90.3544, - 91.4912, 92.308, 93.3552, 93.9746, 95.2052, 95.727, 97.1322, 98.3944, 98.7588, - 100.242, 101.1914, 102.2538, 102.8776, 103.6292, 105.1932, 105.9152, 107.0868, 107.6728, - 108.7144, 110.3114, 110.8716, 111.245, 112.7908, 113.7064, 
114.636, 115.7464, 116.1788, - 117.7464, 118.4896, 119.6166, 120.5082, 121.7798, 122.9028, 123.4426, 124.8854, 125.705, - 126.4652, 128.3464, 128.3462, 130.0398, 131.0342, 131.0042, 132.4766, 133.511, 134.7252, - 135.425, 136.5172, 138.0572, 138.6694, 139.3712, 140.8598, 141.4594, 142.554, 143.4006, - 144.7374, 146.1634, 146.8994, 147.605, 147.9304, 149.1636, 150.2468, 151.5876, 152.2096, - 153.7032, 154.7146, 155.807, 156.9228, 157.0372, 158.5852}; -CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p6{ - 46.0, 46.1902, 47.271, 47.8358, 48.8142, 49.2854, 50.317, 51.354, 51.8924, - 52.9436, 53.4596, 54.5262, 55.6248, 56.1574, 57.2822, 57.837, 58.9636, 60.074, - 60.7042, 61.7976, 62.4772, 63.6564, 64.7942, 65.5004, 66.686, 67.291, 68.5672, - 69.8556, 70.4982, 71.8204, 72.4252, 73.7744, 75.0786, 75.8344, 77.0294, 77.8098, - 79.0794, 80.5732, 81.1878, 82.5648, 83.2902, 84.6784, 85.3352, 86.8946, 88.3712, - 89.0852, 90.499, 91.2686, 92.6844, 94.2234, 94.9732, 96.3356, 97.2286, 98.7262, - 100.3284, 101.1048, 102.5962, 103.3562, 105.1272, 106.4184, 107.4974, 109.0822, 109.856, - 111.48, 113.2834, 114.0208, 115.637, 116.5174, 118.0576, 119.7476, 120.427, 122.1326, - 123.2372, 125.2788, 126.6776, 127.7926, 129.1952, 129.9564, 131.6454, 133.87, 134.5428, - 136.2, 137.0294, 138.6278, 139.6782, 141.792, 143.3516, 144.2832, 146.0394, 147.0748, - 148.4912, 150.849, 151.696, 153.5404, 154.073, 156.3714, 157.7216, 158.7328, 160.4208, - 161.4184, 163.9424, 165.2772, 166.411, 168.1308, 168.769, 170.9258, 172.6828, 173.7502, - 175.706, 176.3886, 179.0186, 180.4518, 181.927, 183.4172, 184.4114, 186.033, 188.5124, - 189.5564, 191.6008, 192.4172, 193.8044, 194.997, 197.4548, 198.8948, 200.2346, 202.3086, - 203.1548, 204.8842, 206.6508, 206.6772, 209.7254, 210.4752, 212.7228, 214.6614, 215.1676, - 217.793, 218.0006, 219.9052, 221.66, 223.5588, 225.1636, 225.6882, 227.7126, 229.4502, - 231.1978, 232.9756, 233.1654, 236.727, 238.1974, 237.7474, 241.1346, 242.3048, 244.1948, - 245.3134, 246.879, 
249.1204, 249.853, 252.6792, 253.857, 254.4486, 257.2362, 257.9534, - 260.0286, 260.5632, 262.663, 264.723, 265.7566, 267.2566, 267.1624, 270.62, 272.8216, - 273.2166, 275.2056, 276.2202, 278.3726, 280.3344, 281.9284, 283.9728, 284.1924, 286.4872, - 287.587, 289.807, 291.1206, 292.769, 294.8708, 296.665, 297.1182, 299.4012, 300.6352, - 302.1354, 304.1756, 306.1606, 307.3462, 308.5214, 309.4134, 310.8352, 313.9684, 315.837, - 316.7796, 318.9858}; -CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p7{ - 92.0, 93.4934, 94.9758, 96.4574, 97.9718, 99.4954, 101.5302, 103.0756, 104.6374, - 106.1782, 107.7888, 109.9522, 111.592, 113.2532, 114.9086, 116.5938, 118.9474, 120.6796, - 122.4394, 124.2176, 125.9768, 128.4214, 130.2528, 132.0102, 133.8658, 135.7278, 138.3044, - 140.1316, 142.093, 144.0032, 145.9092, 148.6306, 150.5294, 152.5756, 154.6508, 156.662, - 159.552, 161.3724, 163.617, 165.5754, 167.7872, 169.8444, 172.7988, 174.8606, 177.2118, - 179.3566, 181.4476, 184.5882, 186.6816, 189.0824, 191.0258, 193.6048, 196.4436, 198.7274, - 200.957, 203.147, 205.4364, 208.7592, 211.3386, 213.781, 215.8028, 218.656, 221.6544, - 223.996, 226.4718, 229.1544, 231.6098, 234.5956, 237.0616, 239.5758, 242.4878, 244.5244, - 248.2146, 250.724, 252.8722, 255.5198, 258.0414, 261.941, 264.9048, 266.87, 269.4304, - 272.028, 274.4708, 278.37, 281.0624, 283.4668, 286.5532, 289.4352, 293.2564, 295.2744, - 298.2118, 300.7472, 304.1456, 307.2928, 309.7504, 312.5528, 315.979, 318.2102, 322.1834, - 324.3494, 327.325, 330.6614, 332.903, 337.2544, 339.9042, 343.215, 345.2864, 348.0814, - 352.6764, 355.301, 357.139, 360.658, 363.1732, 366.5902, 369.9538, 373.0828, 375.922, - 378.9902, 382.7328, 386.4538, 388.1136, 391.2234, 394.0878, 396.708, 401.1556, 404.1852, - 406.6372, 409.6822, 412.7796, 416.6078, 418.4916, 422.131, 424.5376, 428.1988, 432.211, - 434.4502, 438.5282, 440.912, 444.0448, 447.7432, 450.8524, 453.7988, 456.7858, 458.8868, - 463.9886, 466.5064, 468.9124, 472.6616, 475.4682, 478.582, 
481.304, 485.2738, 488.6894, - 490.329, 496.106, 497.6908, 501.1374, 504.5322, 506.8848, 510.3324, 513.4512, 516.179, - 520.4412, 522.6066, 526.167, 528.7794, 533.379, 536.067, 538.46, 542.9116, 545.692, - 547.9546, 552.493, 555.2722, 557.335, 562.449, 564.2014, 569.0738, 571.0974, 574.8564, - 578.2996, 581.409, 583.9704, 585.8098, 589.6528, 594.5998, 595.958, 600.068, 603.3278, - 608.2016, 609.9632, 612.864, 615.43, 620.7794, 621.272, 625.8644, 629.206, 633.219, - 634.5154, 638.6102}; -CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p8{ - 184.2152, 187.2454, 190.2096, 193.6652, 196.6312, 199.6822, 203.249, 206.3296, 210.0038, - 213.2074, 216.4612, 220.27, 223.5178, 227.4412, 230.8032, 234.1634, 238.1688, 241.6074, - 245.6946, 249.2664, 252.8228, 257.0432, 260.6824, 264.9464, 268.6268, 272.2626, 276.8376, - 280.4034, 284.8956, 288.8522, 292.7638, 297.3552, 301.3556, 305.7526, 309.9292, 313.8954, - 318.8198, 322.7668, 327.298, 331.6688, 335.9466, 340.9746, 345.1672, 349.3474, 354.3028, - 358.8912, 364.114, 368.4646, 372.9744, 378.4092, 382.6022, 387.843, 392.5684, 397.1652, - 402.5426, 407.4152, 412.5388, 417.3592, 422.1366, 427.486, 432.3918, 437.5076, 442.509, - 447.3834, 453.3498, 458.0668, 463.7346, 469.1228, 473.4528, 479.7, 484.644, 491.0518, - 495.5774, 500.9068, 506.432, 512.1666, 517.434, 522.6644, 527.4894, 533.6312, 538.3804, - 544.292, 550.5496, 556.0234, 562.8206, 566.6146, 572.4188, 579.117, 583.6762, 590.6576, - 595.7864, 601.509, 607.5334, 612.9204, 619.772, 624.2924, 630.8654, 636.1836, 642.745, - 649.1316, 655.0386, 660.0136, 666.6342, 671.6196, 678.1866, 684.4282, 689.3324, 695.4794, - 702.5038, 708.129, 713.528, 720.3204, 726.463, 732.7928, 739.123, 744.7418, 751.2192, - 756.5102, 762.6066, 769.0184, 775.2224, 781.4014, 787.7618, 794.1436, 798.6506, 805.6378, - 811.766, 819.7514, 824.5776, 828.7322, 837.8048, 843.6302, 849.9336, 854.4798, 861.3388, - 867.9894, 873.8196, 880.3136, 886.2308, 892.4588, 899.0816, 905.4076, 912.0064, 917.3878, - 923.619, 
929.998, 937.3482, 943.9506, 947.991, 955.1144, 962.203, 968.8222, 975.7324, - 981.7826, 988.7666, 994.2648, 1000.3128, 1007.4082, 1013.7536, 1020.3376, 1026.7156, 1031.7478, - 1037.4292, 1045.393, 1051.2278, 1058.3434, 1062.8726, 1071.884, 1076.806, 1082.9176, 1089.1678, - 1095.5032, 1102.525, 1107.2264, 1115.315, 1120.93, 1127.252, 1134.1496, 1139.0408, 1147.5448, - 1153.3296, 1158.1974, 1166.5262, 1174.3328, 1175.657, 1184.4222, 1190.9172, 1197.1292, 1204.4606, - 1210.4578, 1218.8728, 1225.3336, 1226.6592, 1236.5768, 1241.363, 1249.4074, 1254.6566, 1260.8014, - 1266.5454, 1274.5192}; -CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p9{ - 369.0, 374.8294, 381.2452, 387.6698, 394.1464, 400.2024, 406.8782, 413.6598, 420.462, - 427.2826, 433.7102, 440.7416, 447.9366, 455.1046, 462.285, 469.0668, 476.306, 483.8448, - 491.301, 498.9886, 506.2422, 513.8138, 521.7074, 529.7428, 537.8402, 545.1664, 553.3534, - 561.594, 569.6886, 577.7876, 585.65, 594.228, 602.8036, 611.1666, 620.0818, 628.0824, - 637.2574, 646.302, 655.1644, 664.0056, 672.3802, 681.7192, 690.5234, 700.2084, 708.831, - 718.485, 728.1112, 737.4764, 746.76, 756.3368, 766.5538, 775.5058, 785.2646, 795.5902, - 804.3818, 814.8998, 824.9532, 835.2062, 845.2798, 854.4728, 864.9582, 875.3292, 886.171, - 896.781, 906.5716, 916.7048, 927.5322, 937.875, 949.3972, 958.3464, 969.7274, 980.2834, - 992.1444, 1003.4264, 1013.0166, 1024.018, 1035.0438, 1046.34, 1057.6856, 1068.9836, 1079.0312, - 1091.677, 1102.3188, 1113.4846, 1124.4424, 1135.739, 1147.1488, 1158.9202, 1169.406, 1181.5342, - 1193.2834, 1203.8954, 1216.3286, 1226.2146, 1239.6684, 1251.9946, 1262.123, 1275.4338, 1285.7378, - 1296.076, 1308.9692, 1320.4964, 1333.0998, 1343.9864, 1357.7754, 1368.3208, 1380.4838, 1392.7388, - 1406.0758, 1416.9098, 1428.9728, 1440.9228, 1453.9292, 1462.617, 1476.05, 1490.2996, 1500.6128, - 1513.7392, 1524.5174, 1536.6322, 1548.2584, 1562.3766, 1572.423, 1587.1232, 1596.5164, 1610.5938, - 1622.5972, 1633.1222, 1647.7674, 1658.5044, 
1671.57, 1683.7044, 1695.4142, 1708.7102, 1720.6094, - 1732.6522, 1747.841, 1756.4072, 1769.9786, 1782.3276, 1797.5216, 1808.3186, 1819.0694, 1834.354, - 1844.575, 1856.2808, 1871.1288, 1880.7852, 1893.9622, 1906.3418, 1920.6548, 1932.9302, 1945.8584, - 1955.473, 1968.8248, 1980.6446, 1995.9598, 2008.349, 2019.8556, 2033.0334, 2044.0206, 2059.3956, - 2069.9174, 2082.6084, 2093.7036, 2106.6108, 2118.9124, 2132.301, 2144.7628, 2159.8422, 2171.0212, - 2183.101, 2193.5112, 2208.052, 2221.3194, 2233.3282, 2247.295, 2257.7222, 2273.342, 2286.5638, - 2299.6786, 2310.8114, 2322.3312, 2335.516, 2349.874, 2363.5968, 2373.865, 2387.1918, 2401.8328, - 2414.8496, 2424.544, 2436.7592, 2447.1682, 2464.1958, 2474.3438, 2489.0006, 2497.4526, 2513.6586, - 2527.19, 2540.7028, 2553.768}; -CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p10{ - 738.1256, 750.4234, 763.1064, 775.4732, 788.4636, 801.0644, 814.488, 827.9654, 841.0832, - 854.7864, 868.1992, 882.2176, 896.5228, 910.1716, 924.7752, 938.899, 953.6126, 968.6492, - 982.9474, 998.5214, 1013.1064, 1028.6364, 1044.2468, 1059.4588, 1075.3832, 1091.0584, 1106.8606, - 1123.3868, 1139.5062, 1156.1862, 1172.463, 1189.339, 1206.1936, 1223.1292, 1240.1854, 1257.2908, - 1275.3324, 1292.8518, 1310.5204, 1328.4854, 1345.9318, 1364.552, 1381.4658, 1400.4256, 1419.849, - 1438.152, 1456.8956, 1474.8792, 1494.118, 1513.62, 1532.5132, 1551.9322, 1570.7726, 1590.6086, - 1610.5332, 1630.5918, 1650.4294, 1669.7662, 1690.4106, 1710.7338, 1730.9012, 1750.4486, 1770.1556, - 1791.6338, 1812.7312, 1833.6264, 1853.9526, 1874.8742, 1896.8326, 1918.1966, 1939.5594, 1961.07, - 1983.037, 2003.1804, 2026.071, 2047.4884, 2070.0848, 2091.2944, 2114.333, 2135.9626, 2158.2902, - 2181.0814, 2202.0334, 2224.4832, 2246.39, 2269.7202, 2292.1714, 2314.2358, 2338.9346, 2360.891, - 2384.0264, 2408.3834, 2430.1544, 2454.8684, 2476.9896, 2501.4368, 2522.8702, 2548.0408, 2570.6738, - 2593.5208, 2617.0158, 2640.2302, 2664.0962, 2687.4986, 2714.2588, 2735.3914, 2759.6244, 
2781.8378, - 2808.0072, 2830.6516, 2856.2454, 2877.2136, 2903.4546, 2926.785, 2951.2294, 2976.468, 3000.867, - 3023.6508, 3049.91, 3073.5984, 3098.162, 3121.5564, 3146.2328, 3170.9484, 3195.5902, 3221.3346, - 3242.7032, 3271.6112, 3296.5546, 3317.7376, 3345.072, 3369.9518, 3394.326, 3418.1818, 3444.6926, - 3469.086, 3494.2754, 3517.8698, 3544.248, 3565.3768, 3588.7234, 3616.979, 3643.7504, 3668.6812, - 3695.72, 3719.7392, 3742.6224, 3770.4456, 3795.6602, 3819.9058, 3844.002, 3869.517, 3895.6824, - 3920.8622, 3947.1364, 3973.985, 3995.4772, 4021.62, 4046.628, 4074.65, 4096.2256, 4121.831, - 4146.6406, 4173.276, 4195.0744, 4223.9696, 4251.3708, 4272.9966, 4300.8046, 4326.302, 4353.1248, - 4374.312, 4403.0322, 4426.819, 4450.0598, 4478.5206, 4504.8116, 4528.8928, 4553.9584, 4578.8712, - 4603.8384, 4632.3872, 4655.5128, 4675.821, 4704.6222, 4731.9862, 4755.4174, 4781.2628, 4804.332, - 4832.3048, 4862.8752, 4883.4148, 4906.9544, 4935.3516, 4954.3532, 4984.0248, 5011.217, 5035.3258, - 5057.3672, 5084.1828}; -CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p11{ - 1477.0, 1501.6014, 1526.5802, 1551.7942, 1577.3042, 1603.2062, 1629.8402, 1656.2292, - 1682.9462, 1709.9926, 1737.3026, 1765.4252, 1793.0578, 1821.6092, 1849.626, 1878.5568, - 1908.527, 1937.5154, 1967.1874, 1997.3878, 2027.37, 2058.1972, 2089.5728, 2120.1012, - 2151.9668, 2183.292, 2216.0772, 2247.8578, 2280.6562, 2313.041, 2345.714, 2380.3112, - 2414.1806, 2447.9854, 2481.656, 2516.346, 2551.5154, 2586.8378, 2621.7448, 2656.6722, - 2693.5722, 2729.1462, 2765.4124, 2802.8728, 2838.898, 2876.408, 2913.4926, 2951.4938, - 2989.6776, 3026.282, 3065.7704, 3104.1012, 3143.7388, 3181.6876, 3221.1872, 3261.5048, - 3300.0214, 3339.806, 3381.409, 3421.4144, 3461.4294, 3502.2286, 3544.651, 3586.6156, - 3627.337, 3670.083, 3711.1538, 3753.5094, 3797.01, 3838.6686, 3882.1678, 3922.8116, - 3967.9978, 4009.9204, 4054.3286, 4097.5706, 4140.6014, 4185.544, 4229.5976, 4274.583, - 4316.9438, 4361.672, 4406.2786, 4451.8628, 4496.1834, 
4543.505, 4589.1816, 4632.5188, - 4678.2294, 4724.8908, 4769.0194, 4817.052, 4861.4588, 4910.1596, 4956.4344, 5002.5238, - 5048.13, 5093.6374, 5142.8162, 5187.7894, 5237.3984, 5285.6078, 5331.0858, 5379.1036, - 5428.6258, 5474.6018, 5522.7618, 5571.5822, 5618.59, 5667.9992, 5714.88, 5763.454, - 5808.6982, 5860.3644, 5910.2914, 5953.571, 6005.9232, 6055.1914, 6104.5882, 6154.5702, - 6199.7036, 6251.1764, 6298.7596, 6350.0302, 6398.061, 6448.4694, 6495.933, 6548.0474, - 6597.7166, 6646.9416, 6695.9208, 6742.6328, 6793.5276, 6842.1934, 6894.2372, 6945.3864, - 6996.9228, 7044.2372, 7094.1374, 7142.2272, 7192.2942, 7238.8338, 7288.9006, 7344.0908, - 7394.8544, 7443.5176, 7490.4148, 7542.9314, 7595.6738, 7641.9878, 7694.3688, 7743.0448, - 7797.522, 7845.53, 7899.594, 7950.3132, 7996.455, 8050.9442, 8092.9114, 8153.1374, - 8197.4472, 8252.8278, 8301.8728, 8348.6776, 8401.4698, 8453.551, 8504.6598, 8553.8944, - 8604.1276, 8657.6514, 8710.3062, 8758.908, 8807.8706, 8862.1702, 8910.4668, 8960.77, - 9007.2766, 9063.164, 9121.0534, 9164.1354, 9218.1594, 9267.767, 9319.0594, 9372.155, - 9419.7126, 9474.3722, 9520.1338, 9572.368, 9622.7702, 9675.8448, 9726.5396, 9778.7378, - 9827.6554, 9878.1922, 9928.7782, 9978.3984, 10026.578, 10076.5626, 10137.1618, 10177.5244, - 10229.9176}; -CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p12{ - 2954.0, 3003.4782, 3053.3568, 3104.3666, 3155.324, 3206.9598, 3259.648, 3312.539, - 3366.1474, 3420.2576, 3474.8376, 3530.6076, 3586.451, 3643.38, 3700.4104, 3757.5638, - 3815.9676, 3875.193, 3934.838, 3994.8548, 4055.018, 4117.1742, 4178.4482, 4241.1294, - 4304.4776, 4367.4044, 4431.8724, 4496.3732, 4561.4304, 4627.5326, 4693.949, 4761.5532, - 4828.7256, 4897.6182, 4965.5186, 5034.4528, 5104.865, 5174.7164, 5244.6828, 5316.6708, - 5387.8312, 5459.9036, 5532.476, 5604.8652, 5679.6718, 5753.757, 5830.2072, 5905.2828, - 5980.0434, 6056.6264, 6134.3192, 6211.5746, 6290.0816, 6367.1176, 6447.9796, 6526.5576, - 6606.1858, 6686.9144, 6766.1142, 6847.0818, 
6927.9664, 7010.9096, 7091.0816, 7175.3962, - 7260.3454, 7344.018, 7426.4214, 7511.3106, 7596.0686, 7679.8094, 7765.818, 7852.4248, - 7936.834, 8022.363, 8109.5066, 8200.4554, 8288.5832, 8373.366, 8463.4808, 8549.7682, - 8642.0522, 8728.3288, 8820.9528, 8907.727, 9001.0794, 9091.2522, 9179.988, 9269.852, - 9362.6394, 9453.642, 9546.9024, 9640.6616, 9732.6622, 9824.3254, 9917.7484, 10007.9392, - 10106.7508, 10196.2152, 10289.8114, 10383.5494, 10482.3064, 10576.8734, 10668.7872, 10764.7156, - 10862.0196, 10952.793, 11049.9748, 11146.0702, 11241.4492, 11339.2772, 11434.2336, 11530.741, - 11627.6136, 11726.311, 11821.5964, 11918.837, 12015.3724, 12113.0162, 12213.0424, 12306.9804, - 12408.4518, 12504.8968, 12604.586, 12700.9332, 12798.705, 12898.5142, 12997.0488, 13094.788, - 13198.475, 13292.7764, 13392.9698, 13486.8574, 13590.1616, 13686.5838, 13783.6264, 13887.2638, - 13992.0978, 14081.0844, 14189.9956, 14280.0912, 14382.4956, 14486.4384, 14588.1082, 14686.2392, - 14782.276, 14888.0284, 14985.1864, 15088.8596, 15187.0998, 15285.027, 15383.6694, 15495.8266, - 15591.3736, 15694.2008, 15790.3246, 15898.4116, 15997.4522, 16095.5014, 16198.8514, 16291.7492, - 16402.6424, 16499.1266, 16606.2436, 16697.7186, 16796.3946, 16902.3376, 17005.7672, 17100.814, - 17206.8282, 17305.8262, 17416.0744, 17508.4092, 17617.0178, 17715.4554, 17816.758, 17920.1748, - 18012.9236, 18119.7984, 18223.2248, 18324.2482, 18426.6276, 18525.0932, 18629.8976, 18733.2588, - 18831.0466, 18940.1366, 19032.2696, 19131.729, 19243.4864, 19349.6932, 19442.866, 19547.9448, - 19653.2798, 19754.4034, 19854.0692, 19965.1224, 20065.1774, 20158.2212, 20253.353, 20366.3264, - 20463.22}; -CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p13{ - 5908.5052, 6007.2672, 6107.347, 6208.5794, 6311.2622, 6414.5514, 6519.3376, 6625.6952, - 6732.5988, 6841.3552, 6950.5972, 7061.3082, 7173.5646, 7287.109, 7401.8216, 7516.4344, - 7633.3802, 7751.2962, 7870.3784, 7990.292, 8110.79, 8233.4574, 8356.6036, 8482.2712, - 8607.7708, 
8735.099, 8863.1858, 8993.4746, 9123.8496, 9255.6794, 9388.5448, 9522.7516, - 9657.3106, 9792.6094, 9930.5642, 10068.794, 10206.7256, 10347.81, 10490.3196, 10632.0778, - 10775.9916, 10920.4662, 11066.124, 11213.073, 11358.0362, 11508.1006, 11659.1716, 11808.7514, - 11959.4884, 12112.1314, 12265.037, 12420.3756, 12578.933, 12734.311, 12890.0006, 13047.2144, - 13207.3096, 13368.5144, 13528.024, 13689.847, 13852.7528, 14018.3168, 14180.5372, 14346.9668, - 14513.5074, 14677.867, 14846.2186, 15017.4186, 15184.9716, 15356.339, 15529.2972, 15697.3578, - 15871.8686, 16042.187, 16216.4094, 16389.4188, 16565.9126, 16742.3272, 16919.0042, 17094.7592, - 17273.965, 17451.8342, 17634.4254, 17810.5984, 17988.9242, 18171.051, 18354.7938, 18539.466, - 18721.0408, 18904.9972, 19081.867, 19271.9118, 19451.8694, 19637.9816, 19821.2922, 20013.1292, - 20199.3858, 20387.8726, 20572.9514, 20770.7764, 20955.1714, 21144.751, 21329.9952, 21520.709, - 21712.7016, 21906.3868, 22096.2626, 22286.0524, 22475.051, 22665.5098, 22862.8492, 23055.5294, - 23249.6138, 23437.848, 23636.273, 23826.093, 24020.3296, 24213.3896, 24411.7392, 24602.9614, - 24805.7952, 24998.1552, 25193.9588, 25389.0166, 25585.8392, 25780.6976, 25981.2728, 26175.977, - 26376.5252, 26570.1964, 26773.387, 26962.9812, 27163.0586, 27368.164, 27565.0534, 27758.7428, - 27961.1276, 28163.2324, 28362.3816, 28565.7668, 28758.644, 28956.9768, 29163.4722, 29354.7026, - 29561.1186, 29767.9948, 29959.9986, 30164.0492, 30366.9818, 30562.5338, 30762.9928, 30976.1592, - 31166.274, 31376.722, 31570.3734, 31770.809, 31974.8934, 32179.5286, 32387.5442, 32582.3504, - 32794.076, 32989.9528, 33191.842, 33392.4684, 33595.659, 33801.8672, 34000.3414, 34200.0922, - 34402.6792, 34610.0638, 34804.0084, 35011.13, 35218.669, 35418.6634, 35619.0792, 35830.6534, - 36028.4966, 36229.7902, 36438.6422, 36630.7764, 36833.3102, 37048.6728, 37247.3916, 37453.5904, - 37669.3614, 37854.5526, 38059.305, 38268.0936, 38470.2516, 38674.7064, 38876.167, 39068.3794, - 
39281.9144, 39492.8566, 39684.8628, 39898.4108, 40093.1836, 40297.6858, 40489.7086, 40717.2424}; -CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p14{ - 11817.475, 12015.0046, 12215.3792, 12417.7504, 12623.1814, 12830.0086, 13040.0072, 13252.503, - 13466.178, 13683.2738, 13902.0344, 14123.9798, 14347.394, 14573.7784, 14802.6894, 15033.6824, - 15266.9134, 15502.8624, 15741.4944, 15980.7956, 16223.8916, 16468.6316, 16715.733, 16965.5726, - 17217.204, 17470.666, 17727.8516, 17986.7886, 18247.6902, 18510.9632, 18775.304, 19044.7486, - 19314.4408, 19587.202, 19862.2576, 20135.924, 20417.0324, 20697.9788, 20979.6112, 21265.0274, - 21550.723, 21841.6906, 22132.162, 22428.1406, 22722.127, 23020.5606, 23319.7394, 23620.4014, - 23925.2728, 24226.9224, 24535.581, 24845.505, 25155.9618, 25470.3828, 25785.9702, 26103.7764, - 26420.4132, 26742.0186, 27062.8852, 27388.415, 27714.6024, 28042.296, 28365.4494, 28701.1526, - 29031.8008, 29364.2156, 29704.497, 30037.1458, 30380.111, 30723.8168, 31059.5114, 31404.9498, - 31751.6752, 32095.2686, 32444.7792, 32794.767, 33145.204, 33498.4226, 33847.6502, 34209.006, - 34560.849, 34919.4838, 35274.9778, 35635.1322, 35996.3266, 36359.1394, 36722.8266, 37082.8516, - 37447.7354, 37815.9606, 38191.0692, 38559.4106, 38924.8112, 39294.6726, 39663.973, 40042.261, - 40416.2036, 40779.2036, 41161.6436, 41540.9014, 41921.1998, 42294.7698, 42678.5264, 43061.3464, - 43432.375, 43818.432, 44198.6598, 44583.0138, 44970.4794, 45353.924, 45729.858, 46118.2224, - 46511.5724, 46900.7386, 47280.6964, 47668.1472, 48055.6796, 48446.9436, 48838.7146, 49217.7296, - 49613.7796, 50010.7508, 50410.0208, 50793.7886, 51190.2456, 51583.1882, 51971.0796, 52376.5338, - 52763.319, 53165.5534, 53556.5594, 53948.2702, 54346.352, 54748.7914, 55138.577, 55543.4824, - 55941.1748, 56333.7746, 56745.1552, 57142.7944, 57545.2236, 57935.9956, 58348.5268, 58737.5474, - 59158.5962, 59542.6896, 59958.8004, 60349.3788, 60755.0212, 61147.6144, 61548.194, 61946.0696, - 62348.6042, 62763.603, 
63162.781, 63560.635, 63974.3482, 64366.4908, 64771.5876, 65176.7346, - 65597.3916, 65995.915, 66394.0384, 66822.9396, 67203.6336, 67612.2032, 68019.0078, 68420.0388, - 68821.22, 69235.8388, 69640.0724, 70055.155, 70466.357, 70863.4266, 71276.2482, 71677.0306, - 72080.2006, 72493.0214, 72893.5952, 73314.5856, 73714.9852, 74125.3022, 74521.2122, 74933.6814, - 75341.5904, 75743.0244, 76166.0278, 76572.1322, 76973.1028, 77381.6284, 77800.6092, 78189.328, - 78607.0962, 79012.2508, 79407.8358, 79825.725, 80238.701, 80646.891, 81035.6436, 81460.0448, - 81876.3884}; -CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p15{ - 23635.0036, 24030.8034, 24431.4744, 24837.1524, 25246.7928, 25661.326, 26081.3532, - 26505.2806, 26933.9892, 27367.7098, 27805.318, 28248.799, 28696.4382, 29148.8244, - 29605.5138, 30066.8668, 30534.2344, 31006.32, 31480.778, 31962.2418, 32447.3324, - 32938.0232, 33432.731, 33930.728, 34433.9896, 34944.1402, 35457.5588, 35974.5958, - 36497.3296, 37021.9096, 37554.326, 38088.0826, 38628.8816, 39171.3192, 39723.2326, - 40274.5554, 40832.3142, 41390.613, 41959.5908, 42532.5466, 43102.0344, 43683.5072, - 44266.694, 44851.2822, 45440.7862, 46038.0586, 46640.3164, 47241.064, 47846.155, - 48454.7396, 49076.9168, 49692.542, 50317.4778, 50939.65, 51572.5596, 52210.2906, - 52843.7396, 53481.3996, 54127.236, 54770.406, 55422.6598, 56078.7958, 56736.7174, - 57397.6784, 58064.5784, 58730.308, 59404.9784, 60077.0864, 60751.9158, 61444.1386, - 62115.817, 62808.7742, 63501.4774, 64187.5454, 64883.6622, 65582.7468, 66274.5318, - 66976.9276, 67688.7764, 68402.138, 69109.6274, 69822.9706, 70543.6108, 71265.5202, - 71983.3848, 72708.4656, 73433.384, 74158.4664, 74896.4868, 75620.9564, 76362.1434, - 77098.3204, 77835.7662, 78582.6114, 79323.9902, 80067.8658, 80814.9246, 81567.0136, - 82310.8536, 83061.9952, 83821.4096, 84580.8608, 85335.547, 86092.5802, 86851.6506, - 87612.311, 88381.2016, 89146.3296, 89907.8974, 90676.846, 91451.4152, 92224.5518, - 92995.8686, 93763.5066, 
94551.2796, 95315.1944, 96096.1806, 96881.0918, 97665.679, - 98442.68, 99229.3002, 100011.0994, 100790.6386, 101580.1564, 102377.7484, 103152.1392, - 103944.2712, 104730.216, 105528.6336, 106324.9398, 107117.6706, 107890.3988, 108695.2266, - 109485.238, 110294.7876, 111075.0958, 111878.0496, 112695.2864, 113464.5486, 114270.0474, - 115068.608, 115884.3626, 116673.2588, 117483.3716, 118275.097, 119085.4092, 119879.2808, - 120687.5868, 121499.9944, 122284.916, 123095.9254, 123912.5038, 124709.0454, 125503.7182, - 126323.259, 127138.9412, 127943.8294, 128755.646, 129556.5354, 130375.3298, 131161.4734, - 131971.1962, 132787.5458, 133588.1056, 134431.351, 135220.2906, 136023.398, 136846.6558, - 137667.0004, 138463.663, 139283.7154, 140074.6146, 140901.3072, 141721.8548, 142543.2322, - 143356.1096, 144173.7412, 144973.0948, 145794.3162, 146609.5714, 147420.003, 148237.9784, - 149050.5696, 149854.761, 150663.1966, 151494.0754, 152313.1416, 153112.6902, 153935.7206, - 154746.9262, 155559.547, 156401.9746, 157228.7036, 158008.7254, 158820.75, 159646.9184, - 160470.4458, 161279.5348, 162093.3114, 162918.542, 163729.2842}; -CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p16{ - 47271.0, 48062.3584, 48862.7074, 49673.152, 50492.8416, 51322.9514, 52161.03, - 53009.407, 53867.6348, 54734.206, 55610.5144, 56496.2096, 57390.795, 58297.268, - 59210.6448, 60134.665, 61068.0248, 62010.4472, 62962.5204, 63923.5742, 64895.0194, - 65876.4182, 66862.6136, 67862.6968, 68868.8908, 69882.8544, 70911.271, 71944.0924, - 72990.0326, 74040.692, 75100.6336, 76174.7826, 77252.5998, 78340.2974, 79438.2572, - 80545.4976, 81657.2796, 82784.6336, 83915.515, 85059.7362, 86205.9368, 87364.4424, - 88530.3358, 89707.3744, 90885.9638, 92080.197, 93275.5738, 94479.391, 95695.918, - 96919.2236, 98148.4602, 99382.3474, 100625.6974, 101878.0284, 103141.6278, 104409.4588, - 105686.2882, 106967.5402, 108261.6032, 109548.1578, 110852.0728, 112162.231, 113479.0072, - 114806.2626, 116137.9072, 117469.5048, 118813.5186, 
120165.4876, 121516.2556, 122875.766, - 124250.5444, 125621.2222, 127003.2352, 128387.848, 129775.2644, 131181.7776, 132577.3086, - 133979.9458, 135394.1132, 136800.9078, 138233.217, 139668.5308, 141085.212, 142535.2122, - 143969.0684, 145420.2872, 146878.1542, 148332.7572, 149800.3202, 151269.66, 152743.6104, - 154213.0948, 155690.288, 157169.4246, 158672.1756, 160160.059, 161650.6854, 163145.7772, - 164645.6726, 166159.1952, 167682.1578, 169177.3328, 170700.0118, 172228.8964, 173732.6664, - 175265.5556, 176787.799, 178317.111, 179856.6914, 181400.865, 182943.4612, 184486.742, - 186033.4698, 187583.7886, 189148.1868, 190688.4526, 192250.1926, 193810.9042, 195354.2972, - 196938.7682, 198493.5898, 200079.2824, 201618.912, 203205.5492, 204765.5798, 206356.1124, - 207929.3064, 209498.7196, 211086.229, 212675.1324, 214256.7892, 215826.2392, 217412.8474, - 218995.6724, 220618.6038, 222207.1166, 223781.0364, 225387.4332, 227005.7928, 228590.4336, - 230217.8738, 231805.1054, 233408.9, 234995.3432, 236601.4956, 238190.7904, 239817.2548, - 241411.2832, 243002.4066, 244640.1884, 246255.3128, 247849.3508, 249479.9734, 251106.8822, - 252705.027, 254332.9242, 255935.129, 257526.9014, 259154.772, 260777.625, 262390.253, - 264004.4906, 265643.59, 267255.4076, 268873.426, 270470.7252, 272106.4804, 273722.4456, - 275337.794, 276945.7038, 278592.9154, 280204.3726, 281841.1606, 283489.171, 285130.1716, - 286735.3362, 288364.7164, 289961.1814, 291595.5524, 293285.683, 294899.6668, 296499.3434, - 298128.0462, 299761.8946, 301394.2424, 302997.6748, 304615.1478, 306269.7724, 307886.114, - 309543.1028, 311153.2862, 312782.8546, 314421.2008, 316033.2438, 317692.9636, 319305.2648, - 320948.7406, 322566.3364, 324228.4224, 325847.1542}; -CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p17{ - 94542.0, 96125.811, 97728.019, 99348.558, 100987.9705, 102646.7565, 104324.5125, - 106021.7435, 107736.7865, 109469.272, 111223.9465, 112995.219, 114787.432, 116593.152, - 118422.71, 120267.2345, 122134.6765, 
124020.937, 125927.2705, 127851.255, 129788.9485, - 131751.016, 133726.8225, 135722.592, 137736.789, 139770.568, 141821.518, 143891.343, - 145982.1415, 148095.387, 150207.526, 152355.649, 154515.6415, 156696.05, 158887.7575, - 161098.159, 163329.852, 165569.053, 167837.4005, 170121.6165, 172420.4595, 174732.6265, - 177062.77, 179412.502, 181774.035, 184151.939, 186551.6895, 188965.691, 191402.8095, - 193857.949, 196305.0775, 198774.6715, 201271.2585, 203764.78, 206299.3695, 208818.1365, - 211373.115, 213946.7465, 216532.076, 219105.541, 221714.5375, 224337.5135, 226977.5125, - 229613.0655, 232270.2685, 234952.2065, 237645.3555, 240331.1925, 243034.517, 245756.0725, - 248517.6865, 251232.737, 254011.3955, 256785.995, 259556.44, 262368.335, 265156.911, - 267965.266, 270785.583, 273616.0495, 276487.4835, 279346.639, 282202.509, 285074.3885, - 287942.2855, 290856.018, 293774.0345, 296678.5145, 299603.6355, 302552.6575, 305492.9785, - 308466.8605, 311392.581, 314347.538, 317319.4295, 320285.9785, 323301.7325, 326298.3235, - 329301.3105, 332301.987, 335309.791, 338370.762, 341382.923, 344431.1265, 347464.1545, - 350507.28, 353619.2345, 356631.2005, 359685.203, 362776.7845, 365886.488, 368958.2255, - 372060.6825, 375165.4335, 378237.935, 381328.311, 384430.5225, 387576.425, 390683.242, - 393839.648, 396977.8425, 400101.9805, 403271.296, 406409.8425, 409529.5485, 412678.7, - 415847.423, 419020.8035, 422157.081, 425337.749, 428479.6165, 431700.902, 434893.1915, - 438049.582, 441210.5415, 444379.2545, 447577.356, 450741.931, 453959.548, 457137.0935, - 460329.846, 463537.4815, 466732.3345, 469960.5615, 473164.681, 476347.6345, 479496.173, - 482813.1645, 486025.6995, 489249.4885, 492460.1945, 495675.8805, 498908.0075, 502131.802, - 505374.3855, 508550.9915, 511806.7305, 515026.776, 518217.0005, 521523.9855, 524705.9855, - 527950.997, 531210.0265, 534472.497, 537750.7315, 540926.922, 544207.094, 547429.4345, - 550666.3745, 553975.3475, 557150.7185, 560399.6165, 563662.697, 
566916.7395, 570146.1215, - 573447.425, 576689.6245, 579874.5745, 583202.337, 586503.0255, 589715.635, 592910.161, - 596214.3885, 599488.035, 602740.92, 605983.0685, 609248.67, 612491.3605, 615787.912, - 619107.5245, 622307.9555, 625577.333, 628840.4385, 632085.2155, 635317.6135, 638691.7195, - 641887.467, 645139.9405, 648441.546, 651666.252, 654941.845}; -CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p18{ - 189084.0, 192250.913, 195456.774, 198696.946, 201977.762, 205294.444, 208651.754, - 212042.099, 215472.269, 218941.91, 222443.912, 225996.845, 229568.199, 233193.568, - 236844.457, 240543.233, 244279.475, 248044.27, 251854.588, 255693.2, 259583.619, - 263494.621, 267445.385, 271454.061, 275468.769, 279549.456, 283646.446, 287788.198, - 291966.099, 296181.164, 300431.469, 304718.618, 309024.004, 313393.508, 317760.803, - 322209.731, 326675.061, 331160.627, 335654.47, 340241.442, 344841.833, 349467.132, - 354130.629, 358819.432, 363574.626, 368296.587, 373118.482, 377914.93, 382782.301, - 387680.669, 392601.981, 397544.323, 402529.115, 407546.018, 412593.658, 417638.657, - 422762.865, 427886.169, 433017.167, 438213.273, 443441.254, 448692.421, 453937.533, - 459239.049, 464529.569, 469910.083, 475274.03, 480684.473, 486070.26, 491515.237, - 496995.651, 502476.617, 507973.609, 513497.19, 519083.233, 524726.509, 530305.505, - 535945.728, 541584.404, 547274.055, 552967.236, 558667.862, 564360.216, 570128.148, - 575965.08, 581701.952, 587532.523, 593361.144, 599246.128, 605033.418, 610958.779, - 616837.117, 622772.818, 628672.04, 634675.369, 640574.831, 646585.739, 652574.547, - 658611.217, 664642.684, 670713.914, 676737.681, 682797.313, 688837.897, 694917.874, - 701009.882, 707173.648, 713257.254, 719415.392, 725636.761, 731710.697, 737906.209, - 744103.074, 750313.39, 756504.185, 762712.579, 768876.985, 775167.859, 781359.0, - 787615.959, 793863.597, 800245.477, 806464.582, 812785.294, 819005.925, 825403.057, - 831676.197, 837936.284, 844266.968, 850642.711, 
856959.756, 863322.774, 869699.931, - 876102.478, 882355.787, 888694.463, 895159.952, 901536.143, 907872.631, 914293.672, - 920615.14, 927130.974, 933409.404, 939922.178, 946331.47, 952745.93, 959209.264, - 965590.224, 972077.284, 978501.961, 984953.19, 991413.271, 997817.479, 1004222.658, - 1010725.676, 1017177.138, 1023612.529, 1030098.236, 1036493.719, 1043112.207, 1049537.036, - 1056008.096, 1062476.184, 1068942.337, 1075524.95, 1081932.864, 1088426.025, 1094776.005, - 1101327.448, 1107901.673, 1114423.639, 1120884.602, 1127324.923, 1133794.24, 1140328.886, - 1146849.376, 1153346.682, 1159836.502, 1166478.703, 1172953.304, 1179391.502, 1185950.982, - 1192544.052, 1198913.41, 1205430.994, 1212015.525, 1218674.042, 1225121.683, 1231551.101, - 1238126.379, 1244673.795, 1251260.649, 1257697.86, 1264320.983, 1270736.319, 1277274.694, - 1283804.95, 1290211.514, 1296858.568, 1303455.691}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p4{11.0, 11.717, 12.207, 12.7896, 13.2882, 13.8204, 14.3772, 14.9342, 15.5202, 16.161, 16.7722, 17.4636, 18.0396, 18.6766, 19.3566, 20.0454, 20.7936, 21.4856, 22.2666, 22.9946, 23.766, 24.4692, 25.3638, 26.0764, 26.7864, 27.7602, 28.4814, 29.433, 30.2926, 31.0664, 31.9996, 32.7956, 33.5366, 34.5894, 35.5738, 36.2698, 37.3682, 38.0544, 39.2342, 40.0108, 40.7966, 41.9298, 42.8704, 43.6358, 44.5194, 45.773, 46.6772, 47.6174, 48.4888, 49.3304, 50.2506, 51.4996, 52.3824, 53.3078, 54.3984, 55.5838, 56.6618, 57.2174, 58.3514, 59.0802, 60.1482, 61.0376, 62.3598, 62.8078, 63.9744, 64.914, 65.781, 67.1806, 68.0594, 68.8446, 69.7928, 70.8248, 71.8324, 72.8598, 73.6246, 74.7014, 75.393, 76.6708, 77.2394}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p5{23.0, 23.1194, 23.8208, 24.2318, 24.77, 25.2436, 25.7774, 26.2848, 26.8224, 27.3742, 27.9336, 28.503, 29.0494, 29.6292, 30.2124, 30.798, 31.367, 31.9728, 32.5944, 33.217, 33.8438, 34.3696, 35.0956, 35.7044, 36.324, 37.0668, 37.6698, 38.3644, 39.049, 39.6918, 40.4146, 41.082, 41.687, 42.5398, 43.2462, 
43.857, 44.6606, 45.4168, 46.1248, 46.9222, 47.6804, 48.447, 49.3454, 49.9594, 50.7636, 51.5776, 52.331, 53.19, 53.9676, 54.7564, 55.5314, 56.4442, 57.3708, 57.9774, 58.9624, 59.8796, 60.755, 61.472, 62.2076, 63.1024, 63.8908, 64.7338, 65.7728, 66.629, 67.413, 68.3266, 69.1524, 70.2642, 71.1806, 72.0566, 72.9192, 73.7598, 74.3516, 75.5802, 76.4386, 77.4916, 78.1524, 79.1892, 79.8414, 80.8798, 81.8376, 82.4698, 83.7656, 84.331, 85.5914, 86.6012, 87.7016, 88.5582, 89.3394, 90.3544, 91.4912, 92.308, 93.3552, 93.9746, 95.2052, 95.727, 97.1322, 98.3944, 98.7588, 100.242, 101.1914, 102.2538, 102.8776, 103.6292, 105.1932, 105.9152, 107.0868, 107.6728, 108.7144, 110.3114, 110.8716, 111.245, 112.7908, 113.7064, 114.636, 115.7464, 116.1788, 117.7464, 118.4896, 119.6166, 120.5082, 121.7798, 122.9028, 123.4426, 124.8854, 125.705, 126.4652, 128.3464, 128.3462, 130.0398, 131.0342, 131.0042, 132.4766, 133.511, 134.7252, 135.425, 136.5172, 138.0572, 138.6694, 139.3712, 140.8598, 141.4594, 142.554, 143.4006, 144.7374, 146.1634, 146.8994, 147.605, 147.9304, 149.1636, 150.2468, 151.5876, 152.2096, 153.7032, 154.7146, 155.807, 156.9228, 157.0372, 158.5852}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p6{46.0, 46.1902, 47.271, 47.8358, 48.8142, 49.2854, 50.317, 51.354, 51.8924, 52.9436, 53.4596, 54.5262, 55.6248, 56.1574, 57.2822, 57.837, 58.9636, 60.074, 60.7042, 61.7976, 62.4772, 63.6564, 64.7942, 65.5004, 66.686, 67.291, 68.5672, 69.8556, 70.4982, 71.8204, 72.4252, 73.7744, 75.0786, 75.8344, 77.0294, 77.8098, 79.0794, 80.5732, 81.1878, 82.5648, 83.2902, 84.6784, 85.3352, 86.8946, 88.3712, 89.0852, 90.499, 91.2686, 92.6844, 94.2234, 94.9732, 96.3356, 97.2286, 98.7262, 100.3284, 101.1048, 102.5962, 103.3562, 105.1272, 106.4184, 107.4974, 109.0822, 109.856, 111.48, 113.2834, 114.0208, 115.637, 116.5174, 118.0576, 119.7476, 120.427, 122.1326, 123.2372, 125.2788, 126.6776, 127.7926, 129.1952, 129.9564, 131.6454, 133.87, 134.5428, 136.2, 137.0294, 138.6278, 139.6782, 141.792, 143.3516, 
144.2832, 146.0394, 147.0748, 148.4912, 150.849, 151.696, 153.5404, 154.073, 156.3714, 157.7216, 158.7328, 160.4208, 161.4184, 163.9424, 165.2772, 166.411, 168.1308, 168.769, 170.9258, 172.6828, 173.7502, 175.706, 176.3886, 179.0186, 180.4518, 181.927, 183.4172, 184.4114, 186.033, 188.5124, 189.5564, 191.6008, 192.4172, 193.8044, 194.997, 197.4548, 198.8948, 200.2346, 202.3086, 203.1548, 204.8842, 206.6508, 206.6772, 209.7254, 210.4752, 212.7228, 214.6614, 215.1676, 217.793, 218.0006, 219.9052, 221.66, 223.5588, 225.1636, 225.6882, 227.7126, 229.4502, 231.1978, 232.9756, 233.1654, 236.727, 238.1974, 237.7474, 241.1346, 242.3048, 244.1948, 245.3134, 246.879, 249.1204, 249.853, 252.6792, 253.857, 254.4486, 257.2362, 257.9534, 260.0286, 260.5632, 262.663, 264.723, 265.7566, 267.2566, 267.1624, 270.62, 272.8216, 273.2166, 275.2056, 276.2202, 278.3726, 280.3344, 281.9284, 283.9728, 284.1924, 286.4872, 287.587, 289.807, 291.1206, 292.769, 294.8708, 296.665, 297.1182, 299.4012, 300.6352, 302.1354, 304.1756, 306.1606, 307.3462, 308.5214, 309.4134, 310.8352, 313.9684, 315.837, 316.7796, 318.9858}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p7{92.0, 93.4934, 94.9758, 96.4574, 97.9718, 99.4954, 101.5302, 103.0756, 104.6374, 106.1782, 107.7888, 109.9522, 111.592, 113.2532, 114.9086, 116.5938, 118.9474, 120.6796, 122.4394, 124.2176, 125.9768, 128.4214, 130.2528, 132.0102, 133.8658, 135.7278, 138.3044, 140.1316, 142.093, 144.0032, 145.9092, 148.6306, 150.5294, 152.5756, 154.6508, 156.662, 159.552, 161.3724, 163.617, 165.5754, 167.7872, 169.8444, 172.7988, 174.8606, 177.2118, 179.3566, 181.4476, 184.5882, 186.6816, 189.0824, 191.0258, 193.6048, 196.4436, 198.7274, 200.957, 203.147, 205.4364, 208.7592, 211.3386, 213.781, 215.8028, 218.656, 221.6544, 223.996, 226.4718, 229.1544, 231.6098, 234.5956, 237.0616, 239.5758, 242.4878, 244.5244, 248.2146, 250.724, 252.8722, 255.5198, 258.0414, 261.941, 264.9048, 266.87, 269.4304, 272.028, 274.4708, 278.37, 281.0624, 283.4668, 286.5532, 
289.4352, 293.2564, 295.2744, 298.2118, 300.7472, 304.1456, 307.2928, 309.7504, 312.5528, 315.979, 318.2102, 322.1834, 324.3494, 327.325, 330.6614, 332.903, 337.2544, 339.9042, 343.215, 345.2864, 348.0814, 352.6764, 355.301, 357.139, 360.658, 363.1732, 366.5902, 369.9538, 373.0828, 375.922, 378.9902, 382.7328, 386.4538, 388.1136, 391.2234, 394.0878, 396.708, 401.1556, 404.1852, 406.6372, 409.6822, 412.7796, 416.6078, 418.4916, 422.131, 424.5376, 428.1988, 432.211, 434.4502, 438.5282, 440.912, 444.0448, 447.7432, 450.8524, 453.7988, 456.7858, 458.8868, 463.9886, 466.5064, 468.9124, 472.6616, 475.4682, 478.582, 481.304, 485.2738, 488.6894, 490.329, 496.106, 497.6908, 501.1374, 504.5322, 506.8848, 510.3324, 513.4512, 516.179, 520.4412, 522.6066, 526.167, 528.7794, 533.379, 536.067, 538.46, 542.9116, 545.692, 547.9546, 552.493, 555.2722, 557.335, 562.449, 564.2014, 569.0738, 571.0974, 574.8564, 578.2996, 581.409, 583.9704, 585.8098, 589.6528, 594.5998, 595.958, 600.068, 603.3278, 608.2016, 609.9632, 612.864, 615.43, 620.7794, 621.272, 625.8644, 629.206, 633.219, 634.5154, 638.6102}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p8{184.2152, 187.2454, 190.2096, 193.6652, 196.6312, 199.6822, 203.249, 206.3296, 210.0038, 213.2074, 216.4612, 220.27, 223.5178, 227.4412, 230.8032, 234.1634, 238.1688, 241.6074, 245.6946, 249.2664, 252.8228, 257.0432, 260.6824, 264.9464, 268.6268, 272.2626, 276.8376, 280.4034, 284.8956, 288.8522, 292.7638, 297.3552, 301.3556, 305.7526, 309.9292, 313.8954, 318.8198, 322.7668, 327.298, 331.6688, 335.9466, 340.9746, 345.1672, 349.3474, 354.3028, 358.8912, 364.114, 368.4646, 372.9744, 378.4092, 382.6022, 387.843, 392.5684, 397.1652, 402.5426, 407.4152, 412.5388, 417.3592, 422.1366, 427.486, 432.3918, 437.5076, 442.509, 447.3834, 453.3498, 458.0668, 463.7346, 469.1228, 473.4528, 479.7, 484.644, 491.0518, 495.5774, 500.9068, 506.432, 512.1666, 517.434, 522.6644, 527.4894, 533.6312, 538.3804, 544.292, 550.5496, 556.0234, 562.8206, 566.6146, 572.4188, 
579.117, 583.6762, 590.6576, 595.7864, 601.509, 607.5334, 612.9204, 619.772, 624.2924, 630.8654, 636.1836, 642.745, 649.1316, 655.0386, 660.0136, 666.6342, 671.6196, 678.1866, 684.4282, 689.3324, 695.4794, 702.5038, 708.129, 713.528, 720.3204, 726.463, 732.7928, 739.123, 744.7418, 751.2192, 756.5102, 762.6066, 769.0184, 775.2224, 781.4014, 787.7618, 794.1436, 798.6506, 805.6378, 811.766, 819.7514, 824.5776, 828.7322, 837.8048, 843.6302, 849.9336, 854.4798, 861.3388, 867.9894, 873.8196, 880.3136, 886.2308, 892.4588, 899.0816, 905.4076, 912.0064, 917.3878, 923.619, 929.998, 937.3482, 943.9506, 947.991, 955.1144, 962.203, 968.8222, 975.7324, 981.7826, 988.7666, 994.2648, 1000.3128, 1007.4082, 1013.7536, 1020.3376, 1026.7156, 1031.7478, 1037.4292, 1045.393, 1051.2278, 1058.3434, 1062.8726, 1071.884, 1076.806, 1082.9176, 1089.1678, 1095.5032, 1102.525, 1107.2264, 1115.315, 1120.93, 1127.252, 1134.1496, 1139.0408, 1147.5448, 1153.3296, 1158.1974, 1166.5262, 1174.3328, 1175.657, 1184.4222, 1190.9172, 1197.1292, 1204.4606, 1210.4578, 1218.8728, 1225.3336, 1226.6592, 1236.5768, 1241.363, 1249.4074, 1254.6566, 1260.8014, 1266.5454, 1274.5192}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p9{369.0, 374.8294, 381.2452, 387.6698, 394.1464, 400.2024, 406.8782, 413.6598, 420.462, 427.2826, 433.7102, 440.7416, 447.9366, 455.1046, 462.285, 469.0668, 476.306, 483.8448, 491.301, 498.9886, 506.2422, 513.8138, 521.7074, 529.7428, 537.8402, 545.1664, 553.3534, 561.594, 569.6886, 577.7876, 585.65, 594.228, 602.8036, 611.1666, 620.0818, 628.0824, 637.2574, 646.302, 655.1644, 664.0056, 672.3802, 681.7192, 690.5234, 700.2084, 708.831, 718.485, 728.1112, 737.4764, 746.76, 756.3368, 766.5538, 775.5058, 785.2646, 795.5902, 804.3818, 814.8998, 824.9532, 835.2062, 845.2798, 854.4728, 864.9582, 875.3292, 886.171, 896.781, 906.5716, 916.7048, 927.5322, 937.875, 949.3972, 958.3464, 969.7274, 980.2834, 992.1444, 1003.4264, 1013.0166, 1024.018, 1035.0438, 1046.34, 1057.6856, 1068.9836, 1079.0312, 
1091.677, 1102.3188, 1113.4846, 1124.4424, 1135.739, 1147.1488, 1158.9202, 1169.406, 1181.5342, 1193.2834, 1203.8954, 1216.3286, 1226.2146, 1239.6684, 1251.9946, 1262.123, 1275.4338, 1285.7378, 1296.076, 1308.9692, 1320.4964, 1333.0998, 1343.9864, 1357.7754, 1368.3208, 1380.4838, 1392.7388, 1406.0758, 1416.9098, 1428.9728, 1440.9228, 1453.9292, 1462.617, 1476.05, 1490.2996, 1500.6128, 1513.7392, 1524.5174, 1536.6322, 1548.2584, 1562.3766, 1572.423, 1587.1232, 1596.5164, 1610.5938, 1622.5972, 1633.1222, 1647.7674, 1658.5044, 1671.57, 1683.7044, 1695.4142, 1708.7102, 1720.6094, 1732.6522, 1747.841, 1756.4072, 1769.9786, 1782.3276, 1797.5216, 1808.3186, 1819.0694, 1834.354, 1844.575, 1856.2808, 1871.1288, 1880.7852, 1893.9622, 1906.3418, 1920.6548, 1932.9302, 1945.8584, 1955.473, 1968.8248, 1980.6446, 1995.9598, 2008.349, 2019.8556, 2033.0334, 2044.0206, 2059.3956, 2069.9174, 2082.6084, 2093.7036, 2106.6108, 2118.9124, 2132.301, 2144.7628, 2159.8422, 2171.0212, 2183.101, 2193.5112, 2208.052, 2221.3194, 2233.3282, 2247.295, 2257.7222, 2273.342, 2286.5638, 2299.6786, 2310.8114, 2322.3312, 2335.516, 2349.874, 2363.5968, 2373.865, 2387.1918, 2401.8328, 2414.8496, 2424.544, 2436.7592, 2447.1682, 2464.1958, 2474.3438, 2489.0006, 2497.4526, 2513.6586, 2527.19, 2540.7028, 2553.768}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p10{738.1256, 750.4234, 763.1064, 775.4732, 788.4636, 801.0644, 814.488, 827.9654, 841.0832, 854.7864, 868.1992, 882.2176, 896.5228, 910.1716, 924.7752, 938.899, 953.6126, 968.6492, 982.9474, 998.5214, 1013.1064, 1028.6364, 1044.2468, 1059.4588, 1075.3832, 1091.0584, 1106.8606, 1123.3868, 1139.5062, 1156.1862, 1172.463, 1189.339, 1206.1936, 1223.1292, 1240.1854, 1257.2908, 1275.3324, 1292.8518, 1310.5204, 1328.4854, 1345.9318, 1364.552, 1381.4658, 1400.4256, 1419.849, 1438.152, 1456.8956, 1474.8792, 1494.118, 1513.62, 1532.5132, 1551.9322, 1570.7726, 1590.6086, 1610.5332, 1630.5918, 1650.4294, 1669.7662, 1690.4106, 1710.7338, 1730.9012, 1750.4486, 
1770.1556, 1791.6338, 1812.7312, 1833.6264, 1853.9526, 1874.8742, 1896.8326, 1918.1966, 1939.5594, 1961.07, 1983.037, 2003.1804, 2026.071, 2047.4884, 2070.0848, 2091.2944, 2114.333, 2135.9626, 2158.2902, 2181.0814, 2202.0334, 2224.4832, 2246.39, 2269.7202, 2292.1714, 2314.2358, 2338.9346, 2360.891, 2384.0264, 2408.3834, 2430.1544, 2454.8684, 2476.9896, 2501.4368, 2522.8702, 2548.0408, 2570.6738, 2593.5208, 2617.0158, 2640.2302, 2664.0962, 2687.4986, 2714.2588, 2735.3914, 2759.6244, 2781.8378, 2808.0072, 2830.6516, 2856.2454, 2877.2136, 2903.4546, 2926.785, 2951.2294, 2976.468, 3000.867, 3023.6508, 3049.91, 3073.5984, 3098.162, 3121.5564, 3146.2328, 3170.9484, 3195.5902, 3221.3346, 3242.7032, 3271.6112, 3296.5546, 3317.7376, 3345.072, 3369.9518, 3394.326, 3418.1818, 3444.6926, 3469.086, 3494.2754, 3517.8698, 3544.248, 3565.3768, 3588.7234, 3616.979, 3643.7504, 3668.6812, 3695.72, 3719.7392, 3742.6224, 3770.4456, 3795.6602, 3819.9058, 3844.002, 3869.517, 3895.6824, 3920.8622, 3947.1364, 3973.985, 3995.4772, 4021.62, 4046.628, 4074.65, 4096.2256, 4121.831, 4146.6406, 4173.276, 4195.0744, 4223.9696, 4251.3708, 4272.9966, 4300.8046, 4326.302, 4353.1248, 4374.312, 4403.0322, 4426.819, 4450.0598, 4478.5206, 4504.8116, 4528.8928, 4553.9584, 4578.8712, 4603.8384, 4632.3872, 4655.5128, 4675.821, 4704.6222, 4731.9862, 4755.4174, 4781.2628, 4804.332, 4832.3048, 4862.8752, 4883.4148, 4906.9544, 4935.3516, 4954.3532, 4984.0248, 5011.217, 5035.3258, 5057.3672, 5084.1828}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p11{1477.0, 1501.6014, 1526.5802, 1551.7942, 1577.3042, 1603.2062, 1629.8402, 1656.2292, 1682.9462, 1709.9926, 1737.3026, 1765.4252, 1793.0578, 1821.6092, 1849.626, 1878.5568, 1908.527, 1937.5154, 1967.1874, 1997.3878, 2027.37, 2058.1972, 2089.5728, 2120.1012, 2151.9668, 2183.292, 2216.0772, 2247.8578, 2280.6562, 2313.041, 2345.714, 2380.3112, 2414.1806, 2447.9854, 2481.656, 2516.346, 2551.5154, 2586.8378, 2621.7448, 2656.6722, 2693.5722, 2729.1462, 2765.4124, 
2802.8728, 2838.898, 2876.408, 2913.4926, 2951.4938, 2989.6776, 3026.282, 3065.7704, 3104.1012, 3143.7388, 3181.6876, 3221.1872, 3261.5048, 3300.0214, 3339.806, 3381.409, 3421.4144, 3461.4294, 3502.2286, 3544.651, 3586.6156, 3627.337, 3670.083, 3711.1538, 3753.5094, 3797.01, 3838.6686, 3882.1678, 3922.8116, 3967.9978, 4009.9204, 4054.3286, 4097.5706, 4140.6014, 4185.544, 4229.5976, 4274.583, 4316.9438, 4361.672, 4406.2786, 4451.8628, 4496.1834, 4543.505, 4589.1816, 4632.5188, 4678.2294, 4724.8908, 4769.0194, 4817.052, 4861.4588, 4910.1596, 4956.4344, 5002.5238, 5048.13, 5093.6374, 5142.8162, 5187.7894, 5237.3984, 5285.6078, 5331.0858, 5379.1036, 5428.6258, 5474.6018, 5522.7618, 5571.5822, 5618.59, 5667.9992, 5714.88, 5763.454, 5808.6982, 5860.3644, 5910.2914, 5953.571, 6005.9232, 6055.1914, 6104.5882, 6154.5702, 6199.7036, 6251.1764, 6298.7596, 6350.0302, 6398.061, 6448.4694, 6495.933, 6548.0474, 6597.7166, 6646.9416, 6695.9208, 6742.6328, 6793.5276, 6842.1934, 6894.2372, 6945.3864, 6996.9228, 7044.2372, 7094.1374, 7142.2272, 7192.2942, 7238.8338, 7288.9006, 7344.0908, 7394.8544, 7443.5176, 7490.4148, 7542.9314, 7595.6738, 7641.9878, 7694.3688, 7743.0448, 7797.522, 7845.53, 7899.594, 7950.3132, 7996.455, 8050.9442, 8092.9114, 8153.1374, 8197.4472, 8252.8278, 8301.8728, 8348.6776, 8401.4698, 8453.551, 8504.6598, 8553.8944, 8604.1276, 8657.6514, 8710.3062, 8758.908, 8807.8706, 8862.1702, 8910.4668, 8960.77, 9007.2766, 9063.164, 9121.0534, 9164.1354, 9218.1594, 9267.767, 9319.0594, 9372.155, 9419.7126, 9474.3722, 9520.1338, 9572.368, 9622.7702, 9675.8448, 9726.5396, 9778.7378, 9827.6554, 9878.1922, 9928.7782, 9978.3984, 10026.578, 10076.5626, 10137.1618, 10177.5244, 10229.9176}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p12{2954.0, 3003.4782, 3053.3568, 3104.3666, 3155.324, 3206.9598, 3259.648, 3312.539, 3366.1474, 3420.2576, 3474.8376, 3530.6076, 3586.451, 3643.38, 3700.4104, 3757.5638, 3815.9676, 3875.193, 3934.838, 3994.8548, 4055.018, 4117.1742, 4178.4482, 
4241.1294, 4304.4776, 4367.4044, 4431.8724, 4496.3732, 4561.4304, 4627.5326, 4693.949, 4761.5532, 4828.7256, 4897.6182, 4965.5186, 5034.4528, 5104.865, 5174.7164, 5244.6828, 5316.6708, 5387.8312, 5459.9036, 5532.476, 5604.8652, 5679.6718, 5753.757, 5830.2072, 5905.2828, 5980.0434, 6056.6264, 6134.3192, 6211.5746, 6290.0816, 6367.1176, 6447.9796, 6526.5576, 6606.1858, 6686.9144, 6766.1142, 6847.0818, 6927.9664, 7010.9096, 7091.0816, 7175.3962, 7260.3454, 7344.018, 7426.4214, 7511.3106, 7596.0686, 7679.8094, 7765.818, 7852.4248, 7936.834, 8022.363, 8109.5066, 8200.4554, 8288.5832, 8373.366, 8463.4808, 8549.7682, 8642.0522, 8728.3288, 8820.9528, 8907.727, 9001.0794, 9091.2522, 9179.988, 9269.852, 9362.6394, 9453.642, 9546.9024, 9640.6616, 9732.6622, 9824.3254, 9917.7484, 10007.9392, 10106.7508, 10196.2152, 10289.8114, 10383.5494, 10482.3064, 10576.8734, 10668.7872, 10764.7156, 10862.0196, 10952.793, 11049.9748, 11146.0702, 11241.4492, 11339.2772, 11434.2336, 11530.741, 11627.6136, 11726.311, 11821.5964, 11918.837, 12015.3724, 12113.0162, 12213.0424, 12306.9804, 12408.4518, 12504.8968, 12604.586, 12700.9332, 12798.705, 12898.5142, 12997.0488, 13094.788, 13198.475, 13292.7764, 13392.9698, 13486.8574, 13590.1616, 13686.5838, 13783.6264, 13887.2638, 13992.0978, 14081.0844, 14189.9956, 14280.0912, 14382.4956, 14486.4384, 14588.1082, 14686.2392, 14782.276, 14888.0284, 14985.1864, 15088.8596, 15187.0998, 15285.027, 15383.6694, 15495.8266, 15591.3736, 15694.2008, 15790.3246, 15898.4116, 15997.4522, 16095.5014, 16198.8514, 16291.7492, 16402.6424, 16499.1266, 16606.2436, 16697.7186, 16796.3946, 16902.3376, 17005.7672, 17100.814, 17206.8282, 17305.8262, 17416.0744, 17508.4092, 17617.0178, 17715.4554, 17816.758, 17920.1748, 18012.9236, 18119.7984, 18223.2248, 18324.2482, 18426.6276, 18525.0932, 18629.8976, 18733.2588, 18831.0466, 18940.1366, 19032.2696, 19131.729, 19243.4864, 19349.6932, 19442.866, 19547.9448, 19653.2798, 19754.4034, 19854.0692, 19965.1224, 20065.1774, 
20158.2212, 20253.353, 20366.3264, 20463.22}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p13{5908.5052, 6007.2672, 6107.347, 6208.5794, 6311.2622, 6414.5514, 6519.3376, 6625.6952, 6732.5988, 6841.3552, 6950.5972, 7061.3082, 7173.5646, 7287.109, 7401.8216, 7516.4344, 7633.3802, 7751.2962, 7870.3784, 7990.292, 8110.79, 8233.4574, 8356.6036, 8482.2712, 8607.7708, 8735.099, 8863.1858, 8993.4746, 9123.8496, 9255.6794, 9388.5448, 9522.7516, 9657.3106, 9792.6094, 9930.5642, 10068.794, 10206.7256, 10347.81, 10490.3196, 10632.0778, 10775.9916, 10920.4662, 11066.124, 11213.073, 11358.0362, 11508.1006, 11659.1716, 11808.7514, 11959.4884, 12112.1314, 12265.037, 12420.3756, 12578.933, 12734.311, 12890.0006, 13047.2144, 13207.3096, 13368.5144, 13528.024, 13689.847, 13852.7528, 14018.3168, 14180.5372, 14346.9668, 14513.5074, 14677.867, 14846.2186, 15017.4186, 15184.9716, 15356.339, 15529.2972, 15697.3578, 15871.8686, 16042.187, 16216.4094, 16389.4188, 16565.9126, 16742.3272, 16919.0042, 17094.7592, 17273.965, 17451.8342, 17634.4254, 17810.5984, 17988.9242, 18171.051, 18354.7938, 18539.466, 18721.0408, 18904.9972, 19081.867, 19271.9118, 19451.8694, 19637.9816, 19821.2922, 20013.1292, 20199.3858, 20387.8726, 20572.9514, 20770.7764, 20955.1714, 21144.751, 21329.9952, 21520.709, 21712.7016, 21906.3868, 22096.2626, 22286.0524, 22475.051, 22665.5098, 22862.8492, 23055.5294, 23249.6138, 23437.848, 23636.273, 23826.093, 24020.3296, 24213.3896, 24411.7392, 24602.9614, 24805.7952, 24998.1552, 25193.9588, 25389.0166, 25585.8392, 25780.6976, 25981.2728, 26175.977, 26376.5252, 26570.1964, 26773.387, 26962.9812, 27163.0586, 27368.164, 27565.0534, 27758.7428, 27961.1276, 28163.2324, 28362.3816, 28565.7668, 28758.644, 28956.9768, 29163.4722, 29354.7026, 29561.1186, 29767.9948, 29959.9986, 30164.0492, 30366.9818, 30562.5338, 30762.9928, 30976.1592, 31166.274, 31376.722, 31570.3734, 31770.809, 31974.8934, 32179.5286, 32387.5442, 32582.3504, 32794.076, 32989.9528, 33191.842, 33392.4684, 33595.659, 
33801.8672, 34000.3414, 34200.0922, 34402.6792, 34610.0638, 34804.0084, 35011.13, 35218.669, 35418.6634, 35619.0792, 35830.6534, 36028.4966, 36229.7902, 36438.6422, 36630.7764, 36833.3102, 37048.6728, 37247.3916, 37453.5904, 37669.3614, 37854.5526, 38059.305, 38268.0936, 38470.2516, 38674.7064, 38876.167, 39068.3794, 39281.9144, 39492.8566, 39684.8628, 39898.4108, 40093.1836, 40297.6858, 40489.7086, 40717.2424}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p14{11817.475, 12015.0046, 12215.3792, 12417.7504, 12623.1814, 12830.0086, 13040.0072, 13252.503, 13466.178, 13683.2738, 13902.0344, 14123.9798, 14347.394, 14573.7784, 14802.6894, 15033.6824, 15266.9134, 15502.8624, 15741.4944, 15980.7956, 16223.8916, 16468.6316, 16715.733, 16965.5726, 17217.204, 17470.666, 17727.8516, 17986.7886, 18247.6902, 18510.9632, 18775.304, 19044.7486, 19314.4408, 19587.202, 19862.2576, 20135.924, 20417.0324, 20697.9788, 20979.6112, 21265.0274, 21550.723, 21841.6906, 22132.162, 22428.1406, 22722.127, 23020.5606, 23319.7394, 23620.4014, 23925.2728, 24226.9224, 24535.581, 24845.505, 25155.9618, 25470.3828, 25785.9702, 26103.7764, 26420.4132, 26742.0186, 27062.8852, 27388.415, 27714.6024, 28042.296, 28365.4494, 28701.1526, 29031.8008, 29364.2156, 29704.497, 30037.1458, 30380.111, 30723.8168, 31059.5114, 31404.9498, 31751.6752, 32095.2686, 32444.7792, 32794.767, 33145.204, 33498.4226, 33847.6502, 34209.006, 34560.849, 34919.4838, 35274.9778, 35635.1322, 35996.3266, 36359.1394, 36722.8266, 37082.8516, 37447.7354, 37815.9606, 38191.0692, 38559.4106, 38924.8112, 39294.6726, 39663.973, 40042.261, 40416.2036, 40779.2036, 41161.6436, 41540.9014, 41921.1998, 42294.7698, 42678.5264, 43061.3464, 43432.375, 43818.432, 44198.6598, 44583.0138, 44970.4794, 45353.924, 45729.858, 46118.2224, 46511.5724, 46900.7386, 47280.6964, 47668.1472, 48055.6796, 48446.9436, 48838.7146, 49217.7296, 49613.7796, 50010.7508, 50410.0208, 50793.7886, 51190.2456, 51583.1882, 51971.0796, 52376.5338, 52763.319, 53165.5534, 
53556.5594, 53948.2702, 54346.352, 54748.7914, 55138.577, 55543.4824, 55941.1748, 56333.7746, 56745.1552, 57142.7944, 57545.2236, 57935.9956, 58348.5268, 58737.5474, 59158.5962, 59542.6896, 59958.8004, 60349.3788, 60755.0212, 61147.6144, 61548.194, 61946.0696, 62348.6042, 62763.603, 63162.781, 63560.635, 63974.3482, 64366.4908, 64771.5876, 65176.7346, 65597.3916, 65995.915, 66394.0384, 66822.9396, 67203.6336, 67612.2032, 68019.0078, 68420.0388, 68821.22, 69235.8388, 69640.0724, 70055.155, 70466.357, 70863.4266, 71276.2482, 71677.0306, 72080.2006, 72493.0214, 72893.5952, 73314.5856, 73714.9852, 74125.3022, 74521.2122, 74933.6814, 75341.5904, 75743.0244, 76166.0278, 76572.1322, 76973.1028, 77381.6284, 77800.6092, 78189.328, 78607.0962, 79012.2508, 79407.8358, 79825.725, 80238.701, 80646.891, 81035.6436, 81460.0448, 81876.3884}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p15{23635.0036, 24030.8034, 24431.4744, 24837.1524, 25246.7928, 25661.326, 26081.3532, 26505.2806, 26933.9892, 27367.7098, 27805.318, 28248.799, 28696.4382, 29148.8244, 29605.5138, 30066.8668, 30534.2344, 31006.32, 31480.778, 31962.2418, 32447.3324, 32938.0232, 33432.731, 33930.728, 34433.9896, 34944.1402, 35457.5588, 35974.5958, 36497.3296, 37021.9096, 37554.326, 38088.0826, 38628.8816, 39171.3192, 39723.2326, 40274.5554, 40832.3142, 41390.613, 41959.5908, 42532.5466, 43102.0344, 43683.5072, 44266.694, 44851.2822, 45440.7862, 46038.0586, 46640.3164, 47241.064, 47846.155, 48454.7396, 49076.9168, 49692.542, 50317.4778, 50939.65, 51572.5596, 52210.2906, 52843.7396, 53481.3996, 54127.236, 54770.406, 55422.6598, 56078.7958, 56736.7174, 57397.6784, 58064.5784, 58730.308, 59404.9784, 60077.0864, 60751.9158, 61444.1386, 62115.817, 62808.7742, 63501.4774, 64187.5454, 64883.6622, 65582.7468, 66274.5318, 66976.9276, 67688.7764, 68402.138, 69109.6274, 69822.9706, 70543.6108, 71265.5202, 71983.3848, 72708.4656, 73433.384, 74158.4664, 74896.4868, 75620.9564, 76362.1434, 77098.3204, 77835.7662, 78582.6114, 
79323.9902, 80067.8658, 80814.9246, 81567.0136, 82310.8536, 83061.9952, 83821.4096, 84580.8608, 85335.547, 86092.5802, 86851.6506, 87612.311, 88381.2016, 89146.3296, 89907.8974, 90676.846, 91451.4152, 92224.5518, 92995.8686, 93763.5066, 94551.2796, 95315.1944, 96096.1806, 96881.0918, 97665.679, 98442.68, 99229.3002, 100011.0994, 100790.6386, 101580.1564, 102377.7484, 103152.1392, 103944.2712, 104730.216, 105528.6336, 106324.9398, 107117.6706, 107890.3988, 108695.2266, 109485.238, 110294.7876, 111075.0958, 111878.0496, 112695.2864, 113464.5486, 114270.0474, 115068.608, 115884.3626, 116673.2588, 117483.3716, 118275.097, 119085.4092, 119879.2808, 120687.5868, 121499.9944, 122284.916, 123095.9254, 123912.5038, 124709.0454, 125503.7182, 126323.259, 127138.9412, 127943.8294, 128755.646, 129556.5354, 130375.3298, 131161.4734, 131971.1962, 132787.5458, 133588.1056, 134431.351, 135220.2906, 136023.398, 136846.6558, 137667.0004, 138463.663, 139283.7154, 140074.6146, 140901.3072, 141721.8548, 142543.2322, 143356.1096, 144173.7412, 144973.0948, 145794.3162, 146609.5714, 147420.003, 148237.9784, 149050.5696, 149854.761, 150663.1966, 151494.0754, 152313.1416, 153112.6902, 153935.7206, 154746.9262, 155559.547, 156401.9746, 157228.7036, 158008.7254, 158820.75, 159646.9184, 160470.4458, 161279.5348, 162093.3114, 162918.542, 163729.2842}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p16{47271.0, 48062.3584, 48862.7074, 49673.152, 50492.8416, 51322.9514, 52161.03, 53009.407, 53867.6348, 54734.206, 55610.5144, 56496.2096, 57390.795, 58297.268, 59210.6448, 60134.665, 61068.0248, 62010.4472, 62962.5204, 63923.5742, 64895.0194, 65876.4182, 66862.6136, 67862.6968, 68868.8908, 69882.8544, 70911.271, 71944.0924, 72990.0326, 74040.692, 75100.6336, 76174.7826, 77252.5998, 78340.2974, 79438.2572, 80545.4976, 81657.2796, 82784.6336, 83915.515, 85059.7362, 86205.9368, 87364.4424, 88530.3358, 89707.3744, 90885.9638, 92080.197, 93275.5738, 94479.391, 95695.918, 96919.2236, 98148.4602, 99382.3474, 
100625.6974, 101878.0284, 103141.6278, 104409.4588, 105686.2882, 106967.5402, 108261.6032, 109548.1578, 110852.0728, 112162.231, 113479.0072, 114806.2626, 116137.9072, 117469.5048, 118813.5186, 120165.4876, 121516.2556, 122875.766, 124250.5444, 125621.2222, 127003.2352, 128387.848, 129775.2644, 131181.7776, 132577.3086, 133979.9458, 135394.1132, 136800.9078, 138233.217, 139668.5308, 141085.212, 142535.2122, 143969.0684, 145420.2872, 146878.1542, 148332.7572, 149800.3202, 151269.66, 152743.6104, 154213.0948, 155690.288, 157169.4246, 158672.1756, 160160.059, 161650.6854, 163145.7772, 164645.6726, 166159.1952, 167682.1578, 169177.3328, 170700.0118, 172228.8964, 173732.6664, 175265.5556, 176787.799, 178317.111, 179856.6914, 181400.865, 182943.4612, 184486.742, 186033.4698, 187583.7886, 189148.1868, 190688.4526, 192250.1926, 193810.9042, 195354.2972, 196938.7682, 198493.5898, 200079.2824, 201618.912, 203205.5492, 204765.5798, 206356.1124, 207929.3064, 209498.7196, 211086.229, 212675.1324, 214256.7892, 215826.2392, 217412.8474, 218995.6724, 220618.6038, 222207.1166, 223781.0364, 225387.4332, 227005.7928, 228590.4336, 230217.8738, 231805.1054, 233408.9, 234995.3432, 236601.4956, 238190.7904, 239817.2548, 241411.2832, 243002.4066, 244640.1884, 246255.3128, 247849.3508, 249479.9734, 251106.8822, 252705.027, 254332.9242, 255935.129, 257526.9014, 259154.772, 260777.625, 262390.253, 264004.4906, 265643.59, 267255.4076, 268873.426, 270470.7252, 272106.4804, 273722.4456, 275337.794, 276945.7038, 278592.9154, 280204.3726, 281841.1606, 283489.171, 285130.1716, 286735.3362, 288364.7164, 289961.1814, 291595.5524, 293285.683, 294899.6668, 296499.3434, 298128.0462, 299761.8946, 301394.2424, 302997.6748, 304615.1478, 306269.7724, 307886.114, 309543.1028, 311153.2862, 312782.8546, 314421.2008, 316033.2438, 317692.9636, 319305.2648, 320948.7406, 322566.3364, 324228.4224, 325847.1542}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p17{94542.0, 96125.811, 97728.019, 99348.558, 100987.9705, 
102646.7565, 104324.5125, 106021.7435, 107736.7865, 109469.272, 111223.9465, 112995.219, 114787.432, 116593.152, 118422.71, 120267.2345, 122134.6765, 124020.937, 125927.2705, 127851.255, 129788.9485, 131751.016, 133726.8225, 135722.592, 137736.789, 139770.568, 141821.518, 143891.343, 145982.1415, 148095.387, 150207.526, 152355.649, 154515.6415, 156696.05, 158887.7575, 161098.159, 163329.852, 165569.053, 167837.4005, 170121.6165, 172420.4595, 174732.6265, 177062.77, 179412.502, 181774.035, 184151.939, 186551.6895, 188965.691, 191402.8095, 193857.949, 196305.0775, 198774.6715, 201271.2585, 203764.78, 206299.3695, 208818.1365, 211373.115, 213946.7465, 216532.076, 219105.541, 221714.5375, 224337.5135, 226977.5125, 229613.0655, 232270.2685, 234952.2065, 237645.3555, 240331.1925, 243034.517, 245756.0725, 248517.6865, 251232.737, 254011.3955, 256785.995, 259556.44, 262368.335, 265156.911, 267965.266, 270785.583, 273616.0495, 276487.4835, 279346.639, 282202.509, 285074.3885, 287942.2855, 290856.018, 293774.0345, 296678.5145, 299603.6355, 302552.6575, 305492.9785, 308466.8605, 311392.581, 314347.538, 317319.4295, 320285.9785, 323301.7325, 326298.3235, 329301.3105, 332301.987, 335309.791, 338370.762, 341382.923, 344431.1265, 347464.1545, 350507.28, 353619.2345, 356631.2005, 359685.203, 362776.7845, 365886.488, 368958.2255, 372060.6825, 375165.4335, 378237.935, 381328.311, 384430.5225, 387576.425, 390683.242, 393839.648, 396977.8425, 400101.9805, 403271.296, 406409.8425, 409529.5485, 412678.7, 415847.423, 419020.8035, 422157.081, 425337.749, 428479.6165, 431700.902, 434893.1915, 438049.582, 441210.5415, 444379.2545, 447577.356, 450741.931, 453959.548, 457137.0935, 460329.846, 463537.4815, 466732.3345, 469960.5615, 473164.681, 476347.6345, 479496.173, 482813.1645, 486025.6995, 489249.4885, 492460.1945, 495675.8805, 498908.0075, 502131.802, 505374.3855, 508550.9915, 511806.7305, 515026.776, 518217.0005, 521523.9855, 524705.9855, 527950.997, 531210.0265, 534472.497, 537750.7315, 
540926.922, 544207.094, 547429.4345, 550666.3745, 553975.3475, 557150.7185, 560399.6165, 563662.697, 566916.7395, 570146.1215, 573447.425, 576689.6245, 579874.5745, 583202.337, 586503.0255, 589715.635, 592910.161, 596214.3885, 599488.035, 602740.92, 605983.0685, 609248.67, 612491.3605, 615787.912, 619107.5245, 622307.9555, 625577.333, 628840.4385, 632085.2155, 635317.6135, 638691.7195, 641887.467, 645139.9405, 648441.546, 651666.252, 654941.845}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p18{189084.0, 192250.913, 195456.774, 198696.946, 201977.762, 205294.444, 208651.754, 212042.099, 215472.269, 218941.91, 222443.912, 225996.845, 229568.199, 233193.568, 236844.457, 240543.233, 244279.475, 248044.27, 251854.588, 255693.2, 259583.619, 263494.621, 267445.385, 271454.061, 275468.769, 279549.456, 283646.446, 287788.198, 291966.099, 296181.164, 300431.469, 304718.618, 309024.004, 313393.508, 317760.803, 322209.731, 326675.061, 331160.627, 335654.47, 340241.442, 344841.833, 349467.132, 354130.629, 358819.432, 363574.626, 368296.587, 373118.482, 377914.93, 382782.301, 387680.669, 392601.981, 397544.323, 402529.115, 407546.018, 412593.658, 417638.657, 422762.865, 427886.169, 433017.167, 438213.273, 443441.254, 448692.421, 453937.533, 459239.049, 464529.569, 469910.083, 475274.03, 480684.473, 486070.26, 491515.237, 496995.651, 502476.617, 507973.609, 513497.19, 519083.233, 524726.509, 530305.505, 535945.728, 541584.404, 547274.055, 552967.236, 558667.862, 564360.216, 570128.148, 575965.08, 581701.952, 587532.523, 593361.144, 599246.128, 605033.418, 610958.779, 616837.117, 622772.818, 628672.04, 634675.369, 640574.831, 646585.739, 652574.547, 658611.217, 664642.684, 670713.914, 676737.681, 682797.313, 688837.897, 694917.874, 701009.882, 707173.648, 713257.254, 719415.392, 725636.761, 731710.697, 737906.209, 744103.074, 750313.39, 756504.185, 762712.579, 768876.985, 775167.859, 781359.0, 787615.959, 793863.597, 800245.477, 806464.582, 812785.294, 819005.925, 825403.057, 
831676.197, 837936.284, 844266.968, 850642.711, 856959.756, 863322.774, 869699.931, 876102.478, 882355.787, 888694.463, 895159.952, 901536.143, 907872.631, 914293.672, 920615.14, 927130.974, 933409.404, 939922.178, 946331.47, 952745.93, 959209.264, 965590.224, 972077.284, 978501.961, 984953.19, 991413.271, 997817.479, 1004222.658, 1010725.676, 1017177.138, 1023612.529, 1030098.236, 1036493.719, 1043112.207, 1049537.036, 1056008.096, 1062476.184, 1068942.337, 1075524.95, 1081932.864, 1088426.025, 1094776.005, 1101327.448, 1107901.673, 1114423.639, 1120884.602, 1127324.923, 1133794.24, 1140328.886, 1146849.376, 1153346.682, 1159836.502, 1166478.703, 1172953.304, 1179391.502, 1185950.982, 1192544.052, 1198913.41, 1205430.994, 1212015.525, 1218674.042, 1225121.683, 1231551.101, 1238126.379, 1244673.795, 1251260.649, 1257697.86, 1264320.983, 1270736.319, 1277274.694, 1283804.95, 1290211.514, 1296858.568, 1303455.691}; // Meta array storing interpolation points for estimates for Precision=4..18 -__device__ static cuda::std::array constexpr raw_estimate_data{raw_estimate_data_p4.data(), - raw_estimate_data_p5.data(), - raw_estimate_data_p6.data(), - raw_estimate_data_p7.data(), - raw_estimate_data_p8.data(), - raw_estimate_data_p9.data(), - raw_estimate_data_p10.data(), - raw_estimate_data_p11.data(), - raw_estimate_data_p12.data(), - raw_estimate_data_p13.data(), - raw_estimate_data_p14.data(), - raw_estimate_data_p15.data(), - raw_estimate_data_p16.data(), - raw_estimate_data_p17.data(), - raw_estimate_data_p18.data()}; +CUCO_HLL_TUNING_ARR_DECL raw_estimate_data{raw_estimate_data_p4.data(), raw_estimate_data_p5.data(), raw_estimate_data_p6.data(), raw_estimate_data_p7.data(), raw_estimate_data_p8.data(), raw_estimate_data_p9.data(), raw_estimate_data_p10.data(), raw_estimate_data_p11.data(), raw_estimate_data_p12.data(), raw_estimate_data_p13.data(), raw_estimate_data_p14.data(), raw_estimate_data_p15.data(), raw_estimate_data_p16.data(), raw_estimate_data_p17.data(), 
raw_estimate_data_p18.data()}; -CUCO_HLL_TUNING_ARR_DECL bias_data_p4{10.0, - 9.717, - 9.207, - 8.7896, - 8.2882, - 7.8204, - 7.3772, - 6.9342, - 6.5202, - 6.161, - 5.7722, - 5.4636, - 5.0396, - 4.6766, - 4.3566, - 4.0454, - 3.7936, - 3.4856, - 3.2666, - 2.9946, - 2.766, - 2.4692, - 2.3638, - 2.0764, - 1.7864, - 1.7602, - 1.4814, - 1.433, - 1.2926, - 1.0664, - 0.999600000000001, - 0.7956, - 0.5366, - 0.589399999999998, - 0.573799999999999, - 0.269799999999996, - 0.368200000000002, - 0.0544000000000011, - 0.234200000000001, - 0.0108000000000033, - -0.203400000000002, - -0.0701999999999998, - -0.129600000000003, - -0.364199999999997, - -0.480600000000003, - -0.226999999999997, - -0.322800000000001, - -0.382599999999996, - -0.511200000000002, - -0.669600000000003, - -0.749400000000001, - -0.500399999999999, - -0.617600000000003, - -0.6922, - -0.601599999999998, - -0.416200000000003, - -0.338200000000001, - -0.782600000000002, - -0.648600000000002, - -0.919800000000002, - -0.851799999999997, - -0.962400000000002, - -0.6402, - -1.1922, - -1.0256, - -1.086, - -1.21899999999999, - -0.819400000000002, - -0.940600000000003, - -1.1554, - -1.2072, - -1.1752, - -1.16759999999999, - -1.14019999999999, - -1.3754, - -1.29859999999999, - -1.607, - -1.3292, - -1.7606}; -CUCO_HLL_TUNING_ARR_DECL bias_data_p5{22.0, - 21.1194, - 20.8208, - 20.2318, - 19.77, - 19.2436, - 18.7774, - 18.2848, - 17.8224, - 17.3742, - 16.9336, - 16.503, - 16.0494, - 15.6292, - 15.2124, - 14.798, - 14.367, - 13.9728, - 13.5944, - 13.217, - 12.8438, - 12.3696, - 12.0956, - 11.7044, - 11.324, - 11.0668, - 10.6698, - 10.3644, - 10.049, - 9.6918, - 9.4146, - 9.082, - 8.687, - 8.5398, - 8.2462, - 7.857, - 7.6606, - 7.4168, - 7.1248, - 6.9222, - 6.6804, - 6.447, - 6.3454, - 5.9594, - 5.7636, - 5.5776, - 5.331, - 5.19, - 4.9676, - 4.7564, - 4.5314, - 4.4442, - 4.3708, - 3.9774, - 3.9624, - 3.8796, - 3.755, - 3.472, - 3.2076, - 3.1024, - 2.8908, - 2.7338, - 2.7728, - 2.629, - 2.413, - 2.3266, - 2.1524, - 2.2642, - 
2.1806, - 2.0566, - 1.9192, - 1.7598, - 1.3516, - 1.5802, - 1.43859999999999, - 1.49160000000001, - 1.1524, - 1.1892, - 0.841399999999993, - 0.879800000000003, - 0.837599999999995, - 0.469800000000006, - 0.765600000000006, - 0.331000000000003, - 0.591399999999993, - 0.601200000000006, - 0.701599999999999, - 0.558199999999999, - 0.339399999999998, - 0.354399999999998, - 0.491200000000006, - 0.308000000000007, - 0.355199999999996, - -0.0254000000000048, - 0.205200000000005, - -0.272999999999996, - 0.132199999999997, - 0.394400000000005, - -0.241200000000006, - 0.242000000000004, - 0.191400000000002, - 0.253799999999998, - -0.122399999999999, - -0.370800000000003, - 0.193200000000004, - -0.0848000000000013, - 0.0867999999999967, - -0.327200000000005, - -0.285600000000002, - 0.311400000000006, - -0.128399999999999, - -0.754999999999995, - -0.209199999999996, - -0.293599999999998, - -0.364000000000004, - -0.253600000000006, - -0.821200000000005, - -0.253600000000006, - -0.510400000000004, - -0.383399999999995, - -0.491799999999998, - -0.220200000000006, - -0.0972000000000008, - -0.557400000000001, - -0.114599999999996, - -0.295000000000002, - -0.534800000000004, - 0.346399999999988, - -0.65379999999999, - 0.0398000000000138, - 0.0341999999999985, - -0.995800000000003, - -0.523400000000009, - -0.489000000000004, - -0.274799999999999, - -0.574999999999989, - -0.482799999999997, - 0.0571999999999946, - -0.330600000000004, - -0.628800000000012, - -0.140199999999993, - -0.540600000000012, - -0.445999999999998, - -0.599400000000003, - -0.262599999999992, - 0.163399999999996, - -0.100599999999986, - -0.39500000000001, - -1.06960000000001, - -0.836399999999998, - -0.753199999999993, - -0.412399999999991, - -0.790400000000005, - -0.29679999999999, - -0.28540000000001, - -0.193000000000012, - -0.0772000000000048, - -0.962799999999987, - -0.414800000000014}; -CUCO_HLL_TUNING_ARR_DECL bias_data_p6{45.0, - 44.1902, - 43.271, - 42.8358, - 41.8142, - 41.2854, - 40.317, - 39.354, - 
38.8924, - 37.9436, - 37.4596, - 36.5262, - 35.6248, - 35.1574, - 34.2822, - 33.837, - 32.9636, - 32.074, - 31.7042, - 30.7976, - 30.4772, - 29.6564, - 28.7942, - 28.5004, - 27.686, - 27.291, - 26.5672, - 25.8556, - 25.4982, - 24.8204, - 24.4252, - 23.7744, - 23.0786, - 22.8344, - 22.0294, - 21.8098, - 21.0794, - 20.5732, - 20.1878, - 19.5648, - 19.2902, - 18.6784, - 18.3352, - 17.8946, - 17.3712, - 17.0852, - 16.499, - 16.2686, - 15.6844, - 15.2234, - 14.9732, - 14.3356, - 14.2286, - 13.7262, - 13.3284, - 13.1048, - 12.5962, - 12.3562, - 12.1272, - 11.4184, - 11.4974, - 11.0822, - 10.856, - 10.48, - 10.2834, - 10.0208, - 9.637, - 9.51739999999999, - 9.05759999999999, - 8.74760000000001, - 8.42700000000001, - 8.1326, - 8.2372, - 8.2788, - 7.6776, - 7.79259999999999, - 7.1952, - 6.9564, - 6.6454, - 6.87, - 6.5428, - 6.19999999999999, - 6.02940000000001, - 5.62780000000001, - 5.6782, - 5.792, - 5.35159999999999, - 5.28319999999999, - 5.0394, - 5.07480000000001, - 4.49119999999999, - 4.84899999999999, - 4.696, - 4.54040000000001, - 4.07300000000001, - 4.37139999999999, - 3.7216, - 3.7328, - 3.42080000000001, - 3.41839999999999, - 3.94239999999999, - 3.27719999999999, - 3.411, - 3.13079999999999, - 2.76900000000001, - 2.92580000000001, - 2.68279999999999, - 2.75020000000001, - 2.70599999999999, - 2.3886, - 3.01859999999999, - 2.45179999999999, - 2.92699999999999, - 2.41720000000001, - 2.41139999999999, - 2.03299999999999, - 2.51240000000001, - 2.5564, - 2.60079999999999, - 2.41720000000001, - 1.80439999999999, - 1.99700000000001, - 2.45480000000001, - 1.8948, - 2.2346, - 2.30860000000001, - 2.15479999999999, - 1.88419999999999, - 1.6508, - 0.677199999999999, - 1.72540000000001, - 1.4752, - 1.72280000000001, - 1.66139999999999, - 1.16759999999999, - 1.79300000000001, - 1.00059999999999, - 0.905200000000008, - 0.659999999999997, - 1.55879999999999, - 1.1636, - 0.688199999999995, - 0.712600000000009, - 0.450199999999995, - 1.1978, - 0.975599999999986, - 0.165400000000005, 
- 1.727, - 1.19739999999999, - -0.252600000000001, - 1.13460000000001, - 1.3048, - 1.19479999999999, - 0.313400000000001, - 0.878999999999991, - 1.12039999999999, - 0.853000000000009, - 1.67920000000001, - 0.856999999999999, - 0.448599999999999, - 1.2362, - 0.953399999999988, - 1.02859999999998, - 0.563199999999995, - 0.663000000000011, - 0.723000000000013, - 0.756599999999992, - 0.256599999999992, - -0.837600000000009, - 0.620000000000005, - 0.821599999999989, - 0.216600000000028, - 0.205600000000004, - 0.220199999999977, - 0.372599999999977, - 0.334400000000016, - 0.928400000000011, - 0.972800000000007, - 0.192400000000021, - 0.487199999999973, - -0.413000000000011, - 0.807000000000016, - 0.120600000000024, - 0.769000000000005, - 0.870799999999974, - 0.66500000000002, - 0.118200000000002, - 0.401200000000017, - 0.635199999999998, - 0.135400000000004, - 0.175599999999974, - 1.16059999999999, - 0.34620000000001, - 0.521400000000028, - -0.586599999999976, - -1.16480000000001, - 0.968399999999974, - 0.836999999999989, - 0.779600000000016, - 0.985799999999983}; -CUCO_HLL_TUNING_ARR_DECL bias_data_p7{91.0, - 89.4934, - 87.9758, - 86.4574, - 84.9718, - 83.4954, - 81.5302, - 80.0756, - 78.6374, - 77.1782, - 75.7888, - 73.9522, - 72.592, - 71.2532, - 69.9086, - 68.5938, - 66.9474, - 65.6796, - 64.4394, - 63.2176, - 61.9768, - 60.4214, - 59.2528, - 58.0102, - 56.8658, - 55.7278, - 54.3044, - 53.1316, - 52.093, - 51.0032, - 49.9092, - 48.6306, - 47.5294, - 46.5756, - 45.6508, - 44.662, - 43.552, - 42.3724, - 41.617, - 40.5754, - 39.7872, - 38.8444, - 37.7988, - 36.8606, - 36.2118, - 35.3566, - 34.4476, - 33.5882, - 32.6816, - 32.0824, - 31.0258, - 30.6048, - 29.4436, - 28.7274, - 27.957, - 27.147, - 26.4364, - 25.7592, - 25.3386, - 24.781, - 23.8028, - 23.656, - 22.6544, - 21.996, - 21.4718, - 21.1544, - 20.6098, - 19.5956, - 19.0616, - 18.5758, - 18.4878, - 17.5244, - 17.2146, - 16.724, - 15.8722, - 15.5198, - 15.0414, - 14.941, - 14.9048, - 13.87, - 13.4304, - 13.028, - 
12.4708, - 12.37, - 12.0624, - 11.4668, - 11.5532, - 11.4352, - 11.2564, - 10.2744, - 10.2118, - 9.74720000000002, - 10.1456, - 9.2928, - 8.75040000000001, - 8.55279999999999, - 8.97899999999998, - 8.21019999999999, - 8.18340000000001, - 7.3494, - 7.32499999999999, - 7.66140000000001, - 6.90300000000002, - 7.25439999999998, - 6.9042, - 7.21499999999997, - 6.28640000000001, - 6.08139999999997, - 6.6764, - 6.30099999999999, - 5.13900000000001, - 5.65800000000002, - 5.17320000000001, - 4.59019999999998, - 4.9538, - 5.08280000000002, - 4.92200000000003, - 4.99020000000002, - 4.7328, - 5.4538, - 4.11360000000002, - 4.22340000000003, - 4.08780000000002, - 3.70800000000003, - 4.15559999999999, - 4.18520000000001, - 3.63720000000001, - 3.68220000000002, - 3.77960000000002, - 3.6078, - 2.49160000000001, - 3.13099999999997, - 2.5376, - 3.19880000000001, - 3.21100000000001, - 2.4502, - 3.52820000000003, - 2.91199999999998, - 3.04480000000001, - 2.7432, - 2.85239999999999, - 2.79880000000003, - 2.78579999999999, - 1.88679999999999, - 2.98860000000002, - 2.50639999999999, - 1.91239999999999, - 2.66160000000002, - 2.46820000000002, - 1.58199999999999, - 1.30399999999997, - 2.27379999999999, - 2.68939999999998, - 1.32900000000001, - 3.10599999999999, - 1.69080000000002, - 2.13740000000001, - 2.53219999999999, - 1.88479999999998, - 1.33240000000001, - 1.45119999999997, - 1.17899999999997, - 2.44119999999998, - 1.60659999999996, - 2.16700000000003, - 0.77940000000001, - 2.37900000000002, - 2.06700000000001, - 1.46000000000004, - 2.91160000000002, - 1.69200000000001, - 0.954600000000028, - 2.49300000000005, - 2.2722, - 1.33500000000004, - 2.44899999999996, - 1.20140000000004, - 3.07380000000001, - 2.09739999999999, - 2.85640000000001, - 2.29960000000005, - 2.40899999999999, - 1.97040000000004, - 0.809799999999996, - 1.65279999999996, - 2.59979999999996, - 0.95799999999997, - 2.06799999999998, - 2.32780000000002, - 4.20159999999998, - 1.96320000000003, - 1.86400000000003, - 
1.42999999999995, - 3.77940000000001, - 1.27200000000005, - 1.86440000000005, - 2.20600000000002, - 3.21900000000005, - 1.5154, - 2.61019999999996}; -CUCO_HLL_TUNING_ARR_DECL bias_data_p8{183.2152, - 180.2454, - 177.2096, - 173.6652, - 170.6312, - 167.6822, - 164.249, - 161.3296, - 158.0038, - 155.2074, - 152.4612, - 149.27, - 146.5178, - 143.4412, - 140.8032, - 138.1634, - 135.1688, - 132.6074, - 129.6946, - 127.2664, - 124.8228, - 122.0432, - 119.6824, - 116.9464, - 114.6268, - 112.2626, - 109.8376, - 107.4034, - 104.8956, - 102.8522, - 100.7638, - 98.3552, - 96.3556, - 93.7526, - 91.9292, - 89.8954, - 87.8198, - 85.7668, - 83.298, - 81.6688, - 79.9466, - 77.9746, - 76.1672, - 74.3474, - 72.3028, - 70.8912, - 69.114, - 67.4646, - 65.9744, - 64.4092, - 62.6022, - 60.843, - 59.5684, - 58.1652, - 56.5426, - 55.4152, - 53.5388, - 52.3592, - 51.1366, - 49.486, - 48.3918, - 46.5076, - 45.509, - 44.3834, - 43.3498, - 42.0668, - 40.7346, - 40.1228, - 38.4528, - 37.7, - 36.644, - 36.0518, - 34.5774, - 33.9068, - 32.432, - 32.1666, - 30.434, - 29.6644, - 28.4894, - 27.6312, - 26.3804, - 26.292, - 25.5496000000001, - 25.0234, - 24.8206, - 22.6146, - 22.4188, - 22.117, - 20.6762, - 20.6576, - 19.7864, - 19.509, - 18.5334, - 17.9204, - 17.772, - 16.2924, - 16.8654, - 15.1836, - 15.745, - 15.1316, - 15.0386, - 14.0136, - 13.6342, - 12.6196, - 12.1866, - 12.4281999999999, - 11.3324, - 10.4794000000001, - 11.5038, - 10.129, - 9.52800000000002, - 10.3203999999999, - 9.46299999999997, - 9.79280000000006, - 9.12300000000005, - 8.74180000000001, - 9.2192, - 7.51020000000005, - 7.60659999999996, - 7.01840000000004, - 7.22239999999999, - 7.40139999999997, - 6.76179999999999, - 7.14359999999999, - 5.65060000000005, - 5.63779999999997, - 5.76599999999996, - 6.75139999999999, - 5.57759999999996, - 3.73220000000003, - 5.8048, - 5.63019999999995, - 4.93359999999996, - 3.47979999999995, - 4.33879999999999, - 3.98940000000005, - 3.81960000000004, - 3.31359999999995, - 3.23080000000004, - 
3.4588, - 3.08159999999998, - 3.4076, - 3.00639999999999, - 2.38779999999997, - 2.61900000000003, - 1.99800000000005, - 3.34820000000002, - 2.95060000000001, - 0.990999999999985, - 2.11440000000005, - 2.20299999999997, - 2.82219999999995, - 2.73239999999998, - 2.7826, - 3.76660000000004, - 2.26480000000004, - 2.31280000000004, - 2.40819999999997, - 2.75360000000001, - 3.33759999999995, - 2.71559999999999, - 1.7478000000001, - 1.42920000000004, - 2.39300000000003, - 2.22779999999989, - 2.34339999999997, - 0.87259999999992, - 3.88400000000001, - 1.80600000000004, - 1.91759999999999, - 1.16779999999994, - 1.50320000000011, - 2.52500000000009, - 0.226400000000012, - 2.31500000000005, - 0.930000000000064, - 1.25199999999995, - 2.14959999999996, - 0.0407999999999902, - 2.5447999999999, - 1.32960000000003, - 0.197400000000016, - 2.52620000000002, - 3.33279999999991, - -1.34300000000007, - 0.422199999999975, - 0.917200000000093, - 1.12920000000008, - 1.46060000000011, - 1.45779999999991, - 2.8728000000001, - 3.33359999999993, - -1.34079999999994, - 1.57680000000005, - 0.363000000000056, - 1.40740000000005, - 0.656600000000026, - 0.801400000000058, - -0.454600000000028, - 1.51919999999996}; -CUCO_HLL_TUNING_ARR_DECL bias_data_p9{368.0, - 361.8294, - 355.2452, - 348.6698, - 342.1464, - 336.2024, - 329.8782, - 323.6598, - 317.462, - 311.2826, - 305.7102, - 299.7416, - 293.9366, - 288.1046, - 282.285, - 277.0668, - 271.306, - 265.8448, - 260.301, - 254.9886, - 250.2422, - 244.8138, - 239.7074, - 234.7428, - 229.8402, - 225.1664, - 220.3534, - 215.594, - 210.6886, - 205.7876, - 201.65, - 197.228, - 192.8036, - 188.1666, - 184.0818, - 180.0824, - 176.2574, - 172.302, - 168.1644, - 164.0056, - 160.3802, - 156.7192, - 152.5234, - 149.2084, - 145.831, - 142.485, - 139.1112, - 135.4764, - 131.76, - 129.3368, - 126.5538, - 122.5058, - 119.2646, - 116.5902, - 113.3818, - 110.8998, - 107.9532, - 105.2062, - 102.2798, - 99.4728, - 96.9582, - 94.3292, - 92.171, - 89.7809999999999, - 
87.5716, - 84.7048, - 82.5322, - 79.875, - 78.3972, - 75.3464, - 73.7274, - 71.2834, - 70.1444, - 68.4263999999999, - 66.0166, - 64.018, - 62.0437999999999, - 60.3399999999999, - 58.6856, - 57.9836, - 55.0311999999999, - 54.6769999999999, - 52.3188, - 51.4846, - 49.4423999999999, - 47.739, - 46.1487999999999, - 44.9202, - 43.4059999999999, - 42.5342000000001, - 41.2834, - 38.8954000000001, - 38.3286000000001, - 36.2146, - 36.6684, - 35.9946, - 33.123, - 33.4338, - 31.7378000000001, - 29.076, - 28.9692, - 27.4964, - 27.0998, - 25.9864, - 26.7754, - 24.3208, - 23.4838, - 22.7388000000001, - 24.0758000000001, - 21.9097999999999, - 20.9728, - 19.9228000000001, - 19.9292, - 16.617, - 17.05, - 18.2996000000001, - 15.6128000000001, - 15.7392, - 14.5174, - 13.6322, - 12.2583999999999, - 13.3766000000001, - 11.423, - 13.1232, - 9.51639999999998, - 10.5938000000001, - 9.59719999999993, - 8.12220000000002, - 9.76739999999995, - 7.50440000000003, - 7.56999999999994, - 6.70440000000008, - 6.41419999999994, - 6.71019999999999, - 5.60940000000005, - 4.65219999999999, - 6.84099999999989, - 3.4072000000001, - 3.97859999999991, - 3.32760000000007, - 5.52160000000003, - 3.31860000000006, - 2.06940000000009, - 4.35400000000004, - 1.57500000000005, - 0.280799999999999, - 2.12879999999996, - -0.214799999999968, - -0.0378000000000611, - -0.658200000000079, - 0.654800000000023, - -0.0697999999999865, - 0.858400000000074, - -2.52700000000004, - -2.1751999999999, - -3.35539999999992, - -1.04019999999991, - -0.651000000000067, - -2.14439999999991, - -1.96659999999997, - -3.97939999999994, - -0.604400000000169, - -3.08260000000018, - -3.39159999999993, - -5.29640000000018, - -5.38920000000007, - -5.08759999999984, - -4.69900000000007, - -5.23720000000003, - -3.15779999999995, - -4.97879999999986, - -4.89899999999989, - -7.48880000000008, - -5.94799999999987, - -5.68060000000014, - -6.67180000000008, - -4.70499999999993, - -7.27779999999984, - -4.6579999999999, - -4.4362000000001, - 
-4.32139999999981, - -5.18859999999995, - -6.66879999999992, - -6.48399999999992, - -5.1260000000002, - -4.4032000000002, - -6.13500000000022, - -5.80819999999994, - -4.16719999999987, - -4.15039999999999, - -7.45600000000013, - -7.24080000000004, - -9.83179999999993, - -5.80420000000004, - -8.6561999999999, - -6.99940000000015, - -10.5473999999999, - -7.34139999999979, - -6.80999999999995, - -6.29719999999998, - -6.23199999999997}; -CUCO_HLL_TUNING_ARR_DECL bias_data_p10{737.1256, - 724.4234, - 711.1064, - 698.4732, - 685.4636, - 673.0644, - 660.488, - 647.9654, - 636.0832, - 623.7864, - 612.1992, - 600.2176, - 588.5228, - 577.1716, - 565.7752, - 554.899, - 543.6126, - 532.6492, - 521.9474, - 511.5214, - 501.1064, - 490.6364, - 480.2468, - 470.4588, - 460.3832, - 451.0584, - 440.8606, - 431.3868, - 422.5062, - 413.1862, - 404.463, - 395.339, - 386.1936, - 378.1292, - 369.1854, - 361.2908, - 353.3324, - 344.8518, - 337.5204, - 329.4854, - 321.9318, - 314.552, - 306.4658, - 299.4256, - 292.849, - 286.152, - 278.8956, - 271.8792, - 265.118, - 258.62, - 252.5132, - 245.9322, - 239.7726, - 233.6086, - 227.5332, - 222.5918, - 216.4294, - 210.7662, - 205.4106, - 199.7338, - 194.9012, - 188.4486, - 183.1556, - 178.6338, - 173.7312, - 169.6264, - 163.9526, - 159.8742, - 155.8326, - 151.1966, - 147.5594, - 143.07, - 140.037, - 134.1804, - 131.071, - 127.4884, - 124.0848, - 120.2944, - 117.333, - 112.9626, - 110.2902, - 107.0814, - 103.0334, - 99.4832000000001, - 96.3899999999999, - 93.7202000000002, - 90.1714000000002, - 87.2357999999999, - 85.9346, - 82.8910000000001, - 80.0264000000002, - 78.3834000000002, - 75.1543999999999, - 73.8683999999998, - 70.9895999999999, - 69.4367999999999, - 64.8701999999998, - 65.0408000000002, - 61.6738, - 59.5207999999998, - 57.0158000000001, - 54.2302, - 53.0962, - 50.4985999999999, - 52.2588000000001, - 47.3914, - 45.6244000000002, - 42.8377999999998, - 43.0072, - 40.6516000000001, - 40.2453999999998, - 35.2136, - 36.4546, - 
33.7849999999999, - 33.2294000000002, - 32.4679999999998, - 30.8670000000002, - 28.6507999999999, - 28.9099999999999, - 27.5983999999999, - 26.1619999999998, - 24.5563999999999, - 23.2328000000002, - 21.9484000000002, - 21.5902000000001, - 21.3346000000001, - 17.7031999999999, - 20.6111999999998, - 19.5545999999999, - 15.7375999999999, - 17.0720000000001, - 16.9517999999998, - 15.326, - 13.1817999999998, - 14.6925999999999, - 13.0859999999998, - 13.2754, - 10.8697999999999, - 11.248, - 7.3768, - 4.72339999999986, - 7.97899999999981, - 8.7503999999999, - 7.68119999999999, - 9.7199999999998, - 7.73919999999998, - 5.6224000000002, - 7.44560000000001, - 6.6601999999998, - 5.9058, - 4.00199999999995, - 4.51699999999983, - 4.68240000000014, - 3.86220000000003, - 5.13639999999987, - 5.98500000000013, - 2.47719999999981, - 2.61999999999989, - 1.62800000000016, - 4.65000000000009, - 0.225599999999758, - 0.831000000000131, - -0.359400000000278, - 1.27599999999984, - -2.92559999999958, - -0.0303999999996449, - 2.37079999999969, - -2.0033999999996, - 0.804600000000391, - 0.30199999999968, - 1.1247999999996, - -2.6880000000001, - 0.0321999999996478, - -1.18099999999959, - -3.9402, - -1.47940000000017, - -0.188400000000001, - -2.10720000000038, - -2.04159999999956, - -3.12880000000041, - -4.16160000000036, - -0.612799999999879, - -3.48719999999958, - -8.17900000000009, - -5.37780000000021, - -4.01379999999972, - -5.58259999999973, - -5.73719999999958, - -7.66799999999967, - -5.69520000000011, - -1.1247999999996, - -5.58520000000044, - -8.04560000000038, - -4.64840000000004, - -11.6468000000004, - -7.97519999999986, - -5.78300000000036, - -7.67420000000038, - -10.6328000000003, - -9.81720000000041}; -CUCO_HLL_TUNING_ARR_DECL bias_data_p11{1476.0, - 1449.6014, - 1423.5802, - 1397.7942, - 1372.3042, - 1347.2062, - 1321.8402, - 1297.2292, - 1272.9462, - 1248.9926, - 1225.3026, - 1201.4252, - 1178.0578, - 1155.6092, - 1132.626, - 1110.5568, - 1088.527, - 1066.5154, - 1045.1874, - 
1024.3878, - 1003.37, - 982.1972, - 962.5728, - 942.1012, - 922.9668, - 903.292, - 884.0772, - 864.8578, - 846.6562, - 828.041, - 809.714, - 792.3112, - 775.1806, - 757.9854, - 740.656, - 724.346, - 707.5154, - 691.8378, - 675.7448, - 659.6722, - 645.5722, - 630.1462, - 614.4124, - 600.8728, - 585.898, - 572.408, - 558.4926, - 544.4938, - 531.6776, - 517.282, - 505.7704, - 493.1012, - 480.7388, - 467.6876, - 456.1872, - 445.5048, - 433.0214, - 420.806, - 411.409, - 400.4144, - 389.4294, - 379.2286, - 369.651, - 360.6156, - 350.337, - 342.083, - 332.1538, - 322.5094, - 315.01, - 305.6686, - 298.1678, - 287.8116, - 280.9978, - 271.9204, - 265.3286, - 257.5706, - 249.6014, - 242.544, - 235.5976, - 229.583, - 220.9438, - 214.672, - 208.2786, - 201.8628, - 195.1834, - 191.505, - 186.1816, - 178.5188, - 172.2294, - 167.8908, - 161.0194, - 158.052, - 151.4588, - 148.1596, - 143.4344, - 138.5238, - 133.13, - 127.6374, - 124.8162, - 118.7894, - 117.3984, - 114.6078, - 109.0858, - 105.1036, - 103.6258, - 98.6018000000004, - 95.7618000000002, - 93.5821999999998, - 88.5900000000001, - 86.9992000000002, - 82.8800000000001, - 80.4539999999997, - 74.6981999999998, - 74.3644000000004, - 73.2914000000001, - 65.5709999999999, - 66.9232000000002, - 65.1913999999997, - 62.5882000000001, - 61.5702000000001, - 55.7035999999998, - 56.1764000000003, - 52.7596000000003, - 53.0302000000001, - 49.0609999999997, - 48.4694, - 44.933, - 46.0474000000004, - 44.7165999999997, - 41.9416000000001, - 39.9207999999999, - 35.6328000000003, - 35.5276000000003, - 33.1934000000001, - 33.2371999999996, - 33.3864000000003, - 33.9228000000003, - 30.2371999999996, - 29.1373999999996, - 25.2272000000003, - 24.2942000000003, - 19.8338000000003, - 18.9005999999999, - 23.0907999999999, - 21.8544000000002, - 19.5176000000001, - 15.4147999999996, - 16.9314000000004, - 18.6737999999996, - 12.9877999999999, - 14.3688000000002, - 12.0447999999997, - 15.5219999999999, - 12.5299999999997, - 14.5940000000001, - 
14.3131999999996, - 9.45499999999993, - 12.9441999999999, - 3.91139999999996, - 13.1373999999996, - 5.44720000000052, - 9.82779999999912, - 7.87279999999919, - 3.67760000000089, - 5.46980000000076, - 5.55099999999948, - 5.65979999999945, - 3.89439999999922, - 3.1275999999998, - 5.65140000000065, - 6.3062000000009, - 3.90799999999945, - 1.87060000000019, - 5.17020000000048, - 2.46680000000015, - 0.770000000000437, - -3.72340000000077, - 1.16400000000067, - 8.05340000000069, - 0.135399999999208, - 2.15940000000046, - 0.766999999999825, - 1.0594000000001, - 3.15500000000065, - -0.287399999999252, - 2.37219999999979, - -2.86620000000039, - -1.63199999999961, - -2.22979999999916, - -0.15519999999924, - -1.46039999999994, - -0.262199999999211, - -2.34460000000036, - -2.8078000000005, - -3.22179999999935, - -5.60159999999996, - -8.42200000000048, - -9.43740000000071, - 0.161799999999857, - -10.4755999999998, - -10.0823999999993}; -CUCO_HLL_TUNING_ARR_DECL bias_data_p12{2953.0, - 2900.4782, - 2848.3568, - 2796.3666, - 2745.324, - 2694.9598, - 2644.648, - 2595.539, - 2546.1474, - 2498.2576, - 2450.8376, - 2403.6076, - 2357.451, - 2311.38, - 2266.4104, - 2221.5638, - 2176.9676, - 2134.193, - 2090.838, - 2048.8548, - 2007.018, - 1966.1742, - 1925.4482, - 1885.1294, - 1846.4776, - 1807.4044, - 1768.8724, - 1731.3732, - 1693.4304, - 1657.5326, - 1621.949, - 1586.5532, - 1551.7256, - 1517.6182, - 1483.5186, - 1450.4528, - 1417.865, - 1385.7164, - 1352.6828, - 1322.6708, - 1291.8312, - 1260.9036, - 1231.476, - 1201.8652, - 1173.6718, - 1145.757, - 1119.2072, - 1092.2828, - 1065.0434, - 1038.6264, - 1014.3192, - 988.5746, - 965.0816, - 940.1176, - 917.9796, - 894.5576, - 871.1858, - 849.9144, - 827.1142, - 805.0818, - 783.9664, - 763.9096, - 742.0816, - 724.3962, - 706.3454, - 688.018, - 667.4214, - 650.3106, - 633.0686, - 613.8094, - 597.818, - 581.4248, - 563.834, - 547.363, - 531.5066, - 520.455400000001, - 505.583199999999, - 488.366, - 476.480799999999, - 459.7682, - 
450.0522, - 434.328799999999, - 423.952799999999, - 408.727000000001, - 399.079400000001, - 387.252200000001, - 373.987999999999, - 360.852000000001, - 351.6394, - 339.642, - 330.902400000001, - 322.661599999999, - 311.662200000001, - 301.3254, - 291.7484, - 279.939200000001, - 276.7508, - 263.215200000001, - 254.811400000001, - 245.5494, - 242.306399999999, - 234.8734, - 223.787200000001, - 217.7156, - 212.0196, - 200.793, - 195.9748, - 189.0702, - 182.449199999999, - 177.2772, - 170.2336, - 164.741, - 158.613600000001, - 155.311, - 147.5964, - 142.837, - 137.3724, - 132.0162, - 130.0424, - 121.9804, - 120.451800000001, - 114.8968, - 111.585999999999, - 105.933199999999, - 101.705, - 98.5141999999996, - 95.0488000000005, - 89.7880000000005, - 91.4750000000004, - 83.7764000000006, - 80.9698000000008, - 72.8574000000008, - 73.1615999999995, - 67.5838000000003, - 62.6263999999992, - 63.2638000000006, - 66.0977999999996, - 52.0843999999997, - 58.9956000000002, - 47.0912000000008, - 46.4956000000002, - 48.4383999999991, - 47.1082000000006, - 43.2392, - 37.2759999999998, - 40.0283999999992, - 35.1864000000005, - 35.8595999999998, - 32.0998, - 28.027, - 23.6694000000007, - 33.8266000000003, - 26.3736000000008, - 27.2008000000005, - 21.3245999999999, - 26.4115999999995, - 23.4521999999997, - 19.5013999999992, - 19.8513999999996, - 10.7492000000002, - 18.6424000000006, - 13.1265999999996, - 18.2436000000016, - 6.71860000000015, - 3.39459999999963, - 6.33759999999893, - 7.76719999999841, - 0.813999999998487, - 3.82819999999992, - 0.826199999999517, - 8.07440000000133, - -1.59080000000176, - 5.01780000000144, - 0.455399999998917, - -0.24199999999837, - 0.174800000000687, - -9.07640000000174, - -4.20160000000033, - -3.77520000000004, - -4.75179999999818, - -5.3724000000002, - -8.90680000000066, - -6.10239999999976, - -5.74120000000039, - -9.95339999999851, - -3.86339999999836, - -13.7304000000004, - -16.2710000000006, - -7.51359999999841, - -3.30679999999847, - 
-13.1339999999982, - -10.0551999999989, - -6.72019999999975, - -8.59660000000076, - -10.9307999999983, - -1.8775999999998, - -4.82259999999951, - -13.7788, - -21.6470000000008, - -10.6735999999983, - -15.7799999999988}; -CUCO_HLL_TUNING_ARR_DECL bias_data_p13{5907.5052, 5802.2672, - 5697.347, 5593.5794, - 5491.2622, 5390.5514, - 5290.3376, 5191.6952, - 5093.5988, 4997.3552, - 4902.5972, 4808.3082, - 4715.5646, 4624.109, - 4533.8216, 4444.4344, - 4356.3802, 4269.2962, - 4183.3784, 4098.292, - 4014.79, 3932.4574, - 3850.6036, 3771.2712, - 3691.7708, 3615.099, - 3538.1858, 3463.4746, - 3388.8496, 3315.6794, - 3244.5448, 3173.7516, - 3103.3106, 3033.6094, - 2966.5642, 2900.794, - 2833.7256, 2769.81, - 2707.3196, 2644.0778, - 2583.9916, 2523.4662, - 2464.124, 2406.073, - 2347.0362, 2292.1006, - 2238.1716, 2182.7514, - 2128.4884, 2077.1314, - 2025.037, 1975.3756, - 1928.933, 1879.311, - 1831.0006, 1783.2144, - 1738.3096, 1694.5144, - 1649.024, 1606.847, - 1564.7528, 1525.3168, - 1482.5372, 1443.9668, - 1406.5074, 1365.867, - 1329.2186, 1295.4186, - 1257.9716, 1225.339, - 1193.2972, 1156.3578, - 1125.8686, 1091.187, - 1061.4094, 1029.4188, - 1000.9126, 972.3272, - 944.004199999999, 915.7592, - 889.965, 862.834200000001, - 840.4254, 812.598399999999, - 785.924200000001, 763.050999999999, - 741.793799999999, 721.466, - 699.040799999999, 677.997200000002, - 649.866999999998, 634.911800000002, - 609.8694, 591.981599999999, - 570.2922, 557.129199999999, - 538.3858, 521.872599999999, - 502.951400000002, 495.776399999999, - 475.171399999999, 459.751, - 439.995200000001, 426.708999999999, - 413.7016, 402.3868, - 387.262599999998, 372.0524, - 357.050999999999, 342.5098, - 334.849200000001, 322.529399999999, - 311.613799999999, 295.848000000002, - 289.273000000001, 274.093000000001, - 263.329600000001, 251.389599999999, - 245.7392, 231.9614, - 229.7952, 217.155200000001, - 208.9588, 199.016599999999, - 190.839199999999, 180.6976, - 176.272799999999, 166.976999999999, - 162.5252, 
151.196400000001, - 149.386999999999, 133.981199999998, - 130.0586, 130.164000000001, - 122.053400000001, 110.7428, - 108.1276, 106.232400000001, - 100.381600000001, 98.7668000000012, - 86.6440000000002, 79.9768000000004, - 82.4722000000002, 68.7026000000005, - 70.1186000000016, 71.9948000000004, - 58.998599999999, 59.0492000000013, - 56.9818000000014, 47.5338000000011, - 42.9928, 51.1591999999982, - 37.2740000000013, 42.7220000000016, - 31.3734000000004, 26.8090000000011, - 25.8934000000008, 26.5286000000015, - 29.5442000000003, 19.3503999999994, - 26.0760000000009, 17.9527999999991, - 14.8419999999969, 10.4683999999979, - 8.65899999999965, 9.86720000000059, - 4.34139999999752, -0.907800000000861, - -3.32080000000133, -0.936199999996461, - -11.9916000000012, -8.87000000000262, - -6.33099999999831, -11.3366000000024, - -15.9207999999999, -9.34659999999712, - -15.5034000000014, -19.2097999999969, - -15.357799999998, -28.2235999999975, - -30.6898000000001, -19.3271999999997, - -25.6083999999973, -24.409599999999, - -13.6385999999984, -33.4473999999973, - -32.6949999999997, -28.9063999999998, - -31.7483999999968, -32.2935999999972, - -35.8329999999987, -47.620600000002, - -39.0855999999985, -33.1434000000008, - -46.1371999999974, -37.5892000000022, - -46.8164000000033, -47.3142000000007, - -60.2914000000019, -37.7575999999972}; -CUCO_HLL_TUNING_ARR_DECL bias_data_p14{ - 11816.475, 11605.0046, 11395.3792, 11188.7504, 10984.1814, - 10782.0086, 10582.0072, 10384.503, 10189.178, 9996.2738, - 9806.0344, 9617.9798, 9431.394, 9248.7784, 9067.6894, - 8889.6824, 8712.9134, 8538.8624, 8368.4944, 8197.7956, - 8031.8916, 7866.6316, 7703.733, 7544.5726, 7386.204, - 7230.666, 7077.8516, 6926.7886, 6778.6902, 6631.9632, - 6487.304, 6346.7486, 6206.4408, 6070.202, 5935.2576, - 5799.924, 5671.0324, 5541.9788, 5414.6112, 5290.0274, - 5166.723, 5047.6906, 4929.162, 4815.1406, 4699.127, - 4588.5606, 4477.7394, 4369.4014, 4264.2728, 4155.9224, - 4055.581, 3955.505, 3856.9618, 3761.3828, 
3666.9702, - 3575.7764, 3482.4132, 3395.0186, 3305.8852, 3221.415, - 3138.6024, 3056.296, 2970.4494, 2896.1526, 2816.8008, - 2740.2156, 2670.497, 2594.1458, 2527.111, 2460.8168, - 2387.5114, 2322.9498, 2260.6752, 2194.2686, 2133.7792, - 2074.767, 2015.204, 1959.4226, 1898.6502, 1850.006, - 1792.849, 1741.4838, 1687.9778, 1638.1322, 1589.3266, - 1543.1394, 1496.8266, 1447.8516, 1402.7354, 1361.9606, - 1327.0692, 1285.4106, 1241.8112, 1201.6726, 1161.973, - 1130.261, 1094.2036, 1048.2036, 1020.6436, 990.901400000002, - 961.199800000002, 924.769800000002, 899.526400000002, 872.346400000002, 834.375, - 810.432000000001, 780.659800000001, 756.013800000001, 733.479399999997, 707.923999999999, - 673.858, 652.222399999999, 636.572399999997, 615.738599999997, 586.696400000001, - 564.147199999999, 541.679600000003, 523.943599999999, 505.714599999999, 475.729599999999, - 461.779600000002, 449.750800000002, 439.020799999998, 412.7886, 400.245600000002, - 383.188199999997, 362.079599999997, 357.533799999997, 334.319000000003, 327.553399999997, - 308.559399999998, 291.270199999999, 279.351999999999, 271.791400000002, 252.576999999997, - 247.482400000001, 236.174800000001, 218.774599999997, 220.155200000001, 208.794399999999, - 201.223599999998, 182.995600000002, 185.5268, 164.547400000003, 176.5962, - 150.689599999998, 157.8004, 138.378799999999, 134.021200000003, 117.614399999999, - 108.194000000003, 97.0696000000025, 89.6042000000016, 95.6030000000028, 84.7810000000027, - 72.635000000002, 77.3482000000004, 59.4907999999996, 55.5875999999989, 50.7346000000034, - 61.3916000000027, 50.9149999999936, 39.0384000000049, 58.9395999999979, 29.633600000001, - 28.2032000000036, 26.0078000000067, 17.0387999999948, 9.22000000000116, 13.8387999999977, - 8.07240000000456, 14.1549999999988, 15.3570000000036, 3.42660000000615, 6.24820000000182, - -2.96940000000177, -8.79940000000352, -5.97860000000219, -14.4048000000039, -3.4143999999942, - -13.0148000000045, -11.6977999999945, 
-25.7878000000055, -22.3185999999987, -24.409599999999, - -31.9756000000052, -18.9722000000038, -22.8678000000073, -30.8972000000067, -32.3715999999986, - -22.3907999999938, -43.6720000000059, -35.9038, -39.7492000000057, -54.1641999999993, - -45.2749999999942, -42.2989999999991, -44.1089999999967, -64.3564000000042, -49.9551999999967, - -42.6116000000038}; -CUCO_HLL_TUNING_ARR_DECL bias_data_p15{ - 23634.0036, 23210.8034, 22792.4744, 22379.1524, - 21969.7928, 21565.326, 21165.3532, 20770.2806, - 20379.9892, 19994.7098, 19613.318, 19236.799, - 18865.4382, 18498.8244, 18136.5138, 17778.8668, - 17426.2344, 17079.32, 16734.778, 16397.2418, - 16063.3324, 15734.0232, 15409.731, 15088.728, - 14772.9896, 14464.1402, 14157.5588, 13855.5958, - 13559.3296, 13264.9096, 12978.326, 12692.0826, - 12413.8816, 12137.3192, 11870.2326, 11602.5554, - 11340.3142, 11079.613, 10829.5908, 10583.5466, - 10334.0344, 10095.5072, 9859.694, 9625.2822, - 9395.7862, 9174.0586, 8957.3164, 8738.064, - 8524.155, 8313.7396, 8116.9168, 7913.542, - 7718.4778, 7521.65, 7335.5596, 7154.2906, - 6968.7396, 6786.3996, 6613.236, 6437.406, - 6270.6598, 6107.7958, 5945.7174, 5787.6784, - 5635.5784, 5482.308, 5337.9784, 5190.0864, - 5045.9158, 4919.1386, 4771.817, 4645.7742, - 4518.4774, 4385.5454, 4262.6622, 4142.74679999999, - 4015.5318, 3897.9276, 3790.7764, 3685.13800000001, - 3573.6274, 3467.9706, 3368.61079999999, 3271.5202, - 3170.3848, 3076.4656, 2982.38400000001, 2888.4664, - 2806.4868, 2711.9564, 2634.1434, 2551.3204, - 2469.7662, 2396.61139999999, 2318.9902, 2243.8658, - 2171.9246, 2105.01360000001, 2028.8536, 1960.9952, - 1901.4096, 1841.86079999999, 1777.54700000001, 1714.5802, - 1654.65059999999, 1596.311, 1546.2016, 1492.3296, - 1433.8974, 1383.84600000001, 1339.4152, 1293.5518, - 1245.8686, 1193.50659999999, 1162.27959999999, 1107.19439999999, - 1069.18060000001, 1035.09179999999, 999.679000000004, 957.679999999993, - 925.300199999998, 888.099400000006, 848.638600000006, 818.156400000007, - 
796.748399999997, 752.139200000005, 725.271200000003, 692.216, - 671.633600000001, 647.939799999993, 621.670599999998, 575.398799999995, - 561.226599999995, 532.237999999998, 521.787599999996, 483.095799999996, - 467.049599999998, 465.286399999997, 415.548599999995, 401.047399999996, - 380.607999999993, 377.362599999993, 347.258799999996, 338.371599999999, - 310.096999999994, 301.409199999995, 276.280799999993, 265.586800000005, - 258.994399999996, 223.915999999997, 215.925399999993, 213.503800000006, - 191.045400000003, 166.718200000003, 166.259000000005, 162.941200000001, - 148.829400000002, 141.645999999993, 123.535399999993, 122.329800000007, - 89.473399999988, 80.1962000000058, 77.5457999999926, 59.1056000000099, - 83.3509999999951, 52.2906000000075, 36.3979999999865, 40.6558000000077, - 42.0003999999899, 19.6630000000005, 19.7153999999864, -8.38539999999921, - -0.692799999989802, 0.854800000000978, 3.23219999999856, -3.89040000000386, - -5.25880000001052, -24.9052000000083, -22.6837999999989, -26.4286000000138, - -34.997000000003, -37.0216000000073, -43.430400000012, -58.2390000000014, - -68.8034000000043, -56.9245999999985, -57.8583999999973, -77.3097999999882, - -73.2793999999994, -81.0738000000129, -87.4530000000086, -65.0254000000132, - -57.296399999992, -96.2746000000043, -103.25, -96.081600000005, - -91.5542000000132, -102.465200000006, -107.688599999994, -101.458000000013, - -109.715800000005}; -CUCO_HLL_TUNING_ARR_DECL bias_data_p16{ - 47270.0, 46423.3584, 45585.7074, 44757.152, 43938.8416, - 43130.9514, 42330.03, 41540.407, 40759.6348, 39988.206, - 39226.5144, 38473.2096, 37729.795, 36997.268, 36272.6448, - 35558.665, 34853.0248, 34157.4472, 33470.5204, 32793.5742, - 32127.0194, 31469.4182, 30817.6136, 30178.6968, 29546.8908, - 28922.8544, 28312.271, 27707.0924, 27114.0326, 26526.692, - 25948.6336, 25383.7826, 24823.5998, 24272.2974, 23732.2572, - 23201.4976, 22674.2796, 22163.6336, 21656.515, 21161.7362, - 20669.9368, 20189.4424, 19717.3358, 
19256.3744, 18795.9638, - 18352.197, 17908.5738, 17474.391, 17052.918, 16637.2236, - 16228.4602, 15823.3474, 15428.6974, 15043.0284, 14667.6278, - 14297.4588, 13935.2882, 13578.5402, 13234.6032, 12882.1578, - 12548.0728, 12219.231, 11898.0072, 11587.2626, 11279.9072, - 10973.5048, 10678.5186, 10392.4876, 10105.2556, 9825.766, - 9562.5444, 9294.2222, 9038.2352, 8784.848, 8533.2644, - 8301.7776, 8058.30859999999, 7822.94579999999, 7599.11319999999, 7366.90779999999, - 7161.217, 6957.53080000001, 6736.212, 6548.21220000001, 6343.06839999999, - 6156.28719999999, 5975.15419999999, 5791.75719999999, 5621.32019999999, 5451.66, - 5287.61040000001, 5118.09479999999, 4957.288, 4798.4246, 4662.17559999999, - 4512.05900000001, 4364.68539999999, 4220.77720000001, 4082.67259999999, 3957.19519999999, - 3842.15779999999, 3699.3328, 3583.01180000001, 3473.8964, 3338.66639999999, - 3233.55559999999, 3117.799, 3008.111, 2909.69140000001, 2814.86499999999, - 2719.46119999999, 2624.742, 2532.46979999999, 2444.7886, 2370.1868, - 2272.45259999999, 2196.19260000001, 2117.90419999999, 2023.2972, 1969.76819999999, - 1885.58979999999, 1833.2824, 1733.91200000001, 1682.54920000001, 1604.57980000001, - 1556.11240000001, 1491.3064, 1421.71960000001, 1371.22899999999, 1322.1324, - 1264.7892, 1196.23920000001, 1143.8474, 1088.67240000001, 1073.60380000001, - 1023.11660000001, 959.036400000012, 927.433199999999, 906.792799999996, 853.433599999989, - 841.873800000001, 791.1054, 756.899999999994, 704.343200000003, 672.495599999995, - 622.790399999998, 611.254799999995, 567.283200000005, 519.406599999988, 519.188400000014, - 495.312800000014, 451.350799999986, 443.973399999988, 431.882199999993, 392.027000000002, - 380.924200000009, 345.128999999986, 298.901400000002, 287.771999999997, 272.625, - 247.253000000026, 222.490600000019, 223.590000000026, 196.407599999977, 176.425999999978, - 134.725199999986, 132.4804, 110.445599999977, 86.7939999999944, 56.7038000000175, - 64.915399999998, 
38.3726000000024, 37.1606000000029, 46.170999999973, 49.1716000000015, - 15.3362000000197, 6.71639999997569, -34.8185999999987, -39.4476000000141, 12.6830000000191, - -12.3331999999937, -50.6565999999875, -59.9538000000175, -65.1054000000004, -70.7576000000117, - -106.325200000021, -126.852200000023, -110.227599999984, -132.885999999999, -113.897200000007, - -142.713800000027, -151.145399999979, -150.799200000009, -177.756200000003, -156.036399999983, - -182.735199999996, -177.259399999981, -198.663600000029, -174.577600000019, -193.84580000001}; -CUCO_HLL_TUNING_ARR_DECL bias_data_p17{ - 94541.0, 92848.811, 91174.019, 89517.558, 87879.9705, - 86262.7565, 84663.5125, 83083.7435, 81521.7865, 79977.272, - 78455.9465, 76950.219, 75465.432, 73994.152, 72546.71, - 71115.2345, 69705.6765, 68314.937, 66944.2705, 65591.255, - 64252.9485, 62938.016, 61636.8225, 60355.592, 59092.789, - 57850.568, 56624.518, 55417.343, 54231.1415, 53067.387, - 51903.526, 50774.649, 49657.6415, 48561.05, 47475.7575, - 46410.159, 45364.852, 44327.053, 43318.4005, 42325.6165, - 41348.4595, 40383.6265, 39436.77, 38509.502, 37594.035, - 36695.939, 35818.6895, 34955.691, 34115.8095, 33293.949, - 32465.0775, 31657.6715, 30877.2585, 30093.78, 29351.3695, - 28594.1365, 27872.115, 27168.7465, 26477.076, 25774.541, - 25106.5375, 24452.5135, 23815.5125, 23174.0655, 22555.2685, - 21960.2065, 21376.3555, 20785.1925, 20211.517, 19657.0725, - 19141.6865, 18579.737, 18081.3955, 17578.995, 17073.44, - 16608.335, 16119.911, 15651.266, 15194.583, 14749.0495, - 14343.4835, 13925.639, 13504.509, 13099.3885, 12691.2855, - 12328.018, 11969.0345, 11596.5145, 11245.6355, 10917.6575, - 10580.9785, 10277.8605, 9926.58100000001, 9605.538, 9300.42950000003, - 8989.97850000003, 8728.73249999998, 8448.3235, 8175.31050000002, 7898.98700000002, - 7629.79100000003, 7413.76199999999, 7149.92300000001, 6921.12650000001, 6677.1545, - 6443.28000000003, 6278.23450000002, 6014.20049999998, 5791.20299999998, 5605.78450000001, - 
5438.48800000001, 5234.2255, 5059.6825, 4887.43349999998, 4682.935, - 4496.31099999999, 4322.52250000002, 4191.42499999999, 4021.24200000003, 3900.64799999999, - 3762.84250000003, 3609.98050000001, 3502.29599999997, 3363.84250000003, 3206.54849999998, - 3079.70000000001, 2971.42300000001, 2867.80349999998, 2727.08100000001, 2630.74900000001, - 2496.6165, 2440.902, 2356.19150000002, 2235.58199999999, 2120.54149999999, - 2012.25449999998, 1933.35600000003, 1820.93099999998, 1761.54800000001, 1663.09350000002, - 1578.84600000002, 1509.48149999999, 1427.3345, 1379.56150000001, 1306.68099999998, - 1212.63449999999, 1084.17300000001, 1124.16450000001, 1060.69949999999, 1007.48849999998, - 941.194499999983, 879.880500000028, 836.007500000007, 782.802000000025, 748.385499999975, - 647.991500000004, 626.730500000005, 570.776000000013, 484.000500000024, 513.98550000001, - 418.985499999952, 386.996999999974, 370.026500000036, 355.496999999974, 356.731499999994, - 255.92200000002, 259.094000000041, 205.434499999974, 165.374500000034, 197.347500000033, - 95.718499999959, 67.6165000000037, 54.6970000000438, 31.7395000000251, -15.8784999999916, - 8.42500000004657, -26.3754999999655, -118.425500000012, -66.6629999999423, -42.9745000000112, - -107.364999999991, -189.839000000036, -162.611499999999, -164.964999999967, -189.079999999958, - -223.931499999948, -235.329999999958, -269.639500000048, -249.087999999989, -206.475499999942, - -283.04449999996, -290.667000000016, -304.561499999953, -336.784499999951, -380.386500000022, - -283.280499999993, -364.533000000054, -389.059499999974, -364.454000000027, -415.748000000021, - -417.155000000028}; -CUCO_HLL_TUNING_ARR_DECL bias_data_p18{189083.0, - 185696.913, - 182348.774, - 179035.946, - 175762.762, - 172526.444, - 169329.754, - 166166.099, - 163043.269, - 159958.91, - 156907.912, - 153906.845, - 150924.199, - 147996.568, - 145093.457, - 142239.233, - 139421.475, - 136632.27, - 133889.588, - 131174.2, - 128511.619, - 125868.621, - 
123265.385, - 120721.061, - 118181.769, - 115709.456, - 113252.446, - 110840.198, - 108465.099, - 106126.164, - 103823.469, - 101556.618, - 99308.004, - 97124.508, - 94937.803, - 92833.731, - 90745.061, - 88677.627, - 86617.47, - 84650.442, - 82697.833, - 80769.132, - 78879.629, - 77014.432, - 75215.626, - 73384.587, - 71652.482, - 69895.93, - 68209.301, - 66553.669, - 64921.981, - 63310.323, - 61742.115, - 60205.018, - 58698.658, - 57190.657, - 55760.865, - 54331.169, - 52908.167, - 51550.273, - 50225.254, - 48922.421, - 47614.533, - 46362.049, - 45098.569, - 43926.083, - 42736.03, - 41593.473, - 40425.26, - 39316.237, - 38243.651, - 37170.617, - 36114.609, - 35084.19, - 34117.233, - 33206.509, - 32231.505, - 31318.728, - 30403.404, - 29540.0550000001, - 28679.236, - 27825.862, - 26965.216, - 26179.148, - 25462.08, - 24645.952, - 23922.523, - 23198.144, - 22529.128, - 21762.4179999999, - 21134.779, - 20459.117, - 19840.818, - 19187.04, - 18636.3689999999, - 17982.831, - 17439.7389999999, - 16874.547, - 16358.2169999999, - 15835.684, - 15352.914, - 14823.681, - 14329.313, - 13816.897, - 13342.874, - 12880.882, - 12491.648, - 12021.254, - 11625.392, - 11293.7610000001, - 10813.697, - 10456.209, - 10099.074, - 9755.39000000001, - 9393.18500000006, - 9047.57900000003, - 8657.98499999999, - 8395.85900000005, - 8033.0, - 7736.95900000003, - 7430.59699999995, - 7258.47699999996, - 6924.58200000005, - 6691.29399999999, - 6357.92500000005, - 6202.05700000003, - 5921.19700000004, - 5628.28399999999, - 5404.96799999999, - 5226.71100000001, - 4990.75600000005, - 4799.77399999998, - 4622.93099999998, - 4472.478, - 4171.78700000001, - 3957.46299999999, - 3868.95200000005, - 3691.14300000004, - 3474.63100000005, - 3341.67200000002, - 3109.14000000001, - 3071.97400000005, - 2796.40399999998, - 2756.17799999996, - 2611.46999999997, - 2471.93000000005, - 2382.26399999997, - 2209.22400000005, - 2142.28399999999, - 2013.96100000001, - 1911.18999999994, - 1818.27099999995, - 
1668.47900000005, - 1519.65800000005, - 1469.67599999998, - 1367.13800000004, - 1248.52899999998, - 1181.23600000003, - 1022.71900000004, - 1088.20700000005, - 959.03600000008, - 876.095999999903, - 791.183999999892, - 703.337000000058, - 731.949999999953, - 586.86400000006, - 526.024999999907, - 323.004999999888, - 320.448000000091, - 340.672999999952, - 309.638999999966, - 216.601999999955, - 102.922999999952, - 19.2399999999907, - -0.114000000059605, - -32.6240000000689, - -89.3179999999702, - -153.497999999905, - -64.2970000000205, - -143.695999999996, - -259.497999999905, - -253.017999999924, - -213.948000000091, - -397.590000000084, - -434.006000000052, - -403.475000000093, - -297.958000000101, - -404.317000000039, - -528.898999999976, - -506.621000000043, - -513.205000000075, - -479.351000000024, - -596.139999999898, - -527.016999999993, - -664.681000000099, - -680.306000000099, - -704.050000000047, - -850.486000000034, - -757.43200000003, - -713.308999999892}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p4{10.0, 9.717, 9.207, 8.7896, 8.2882, 7.8204, 7.3772, 6.9342, 6.5202, 6.161, 5.7722, 5.4636, 5.0396, 4.6766, 4.3566, 4.0454, 3.7936, 3.4856, 3.2666, 2.9946, 2.766, 2.4692, 2.3638, 2.0764, 1.7864, 1.7602, 1.4814, 1.433, 1.2926, 1.0664, 0.999600000000001, 0.7956, 0.5366, 0.589399999999998, 0.573799999999999, 0.269799999999996, 0.368200000000002, 0.0544000000000011, 0.234200000000001, 0.0108000000000033, -0.203400000000002, -0.0701999999999998, -0.129600000000003, -0.364199999999997, -0.480600000000003, -0.226999999999997, -0.322800000000001, -0.382599999999996, -0.511200000000002, -0.669600000000003, -0.749400000000001, -0.500399999999999, -0.617600000000003, -0.6922, -0.601599999999998, -0.416200000000003, -0.338200000000001, -0.782600000000002, -0.648600000000002, -0.919800000000002, -0.851799999999997, -0.962400000000002, -0.6402, -1.1922, -1.0256, -1.086, -1.21899999999999, -0.819400000000002, -0.940600000000003, -1.1554, -1.2072, -1.1752, -1.16759999999999, 
-1.14019999999999, -1.3754, -1.29859999999999, -1.607, -1.3292, -1.7606}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p5{22.0, 21.1194, 20.8208, 20.2318, 19.77, 19.2436, 18.7774, 18.2848, 17.8224, 17.3742, 16.9336, 16.503, 16.0494, 15.6292, 15.2124, 14.798, 14.367, 13.9728, 13.5944, 13.217, 12.8438, 12.3696, 12.0956, 11.7044, 11.324, 11.0668, 10.6698, 10.3644, 10.049, 9.6918, 9.4146, 9.082, 8.687, 8.5398, 8.2462, 7.857, 7.6606, 7.4168, 7.1248, 6.9222, 6.6804, 6.447, 6.3454, 5.9594, 5.7636, 5.5776, 5.331, 5.19, 4.9676, 4.7564, 4.5314, 4.4442, 4.3708, 3.9774, 3.9624, 3.8796, 3.755, 3.472, 3.2076, 3.1024, 2.8908, 2.7338, 2.7728, 2.629, 2.413, 2.3266, 2.1524, 2.2642, 2.1806, 2.0566, 1.9192, 1.7598, 1.3516, 1.5802, 1.43859999999999, 1.49160000000001, 1.1524, 1.1892, 0.841399999999993, 0.879800000000003, 0.837599999999995, 0.469800000000006, 0.765600000000006, 0.331000000000003, 0.591399999999993, 0.601200000000006, 0.701599999999999, 0.558199999999999, 0.339399999999998, 0.354399999999998, 0.491200000000006, 0.308000000000007, 0.355199999999996, -0.0254000000000048, 0.205200000000005, -0.272999999999996, 0.132199999999997, 0.394400000000005, -0.241200000000006, 0.242000000000004, 0.191400000000002, 0.253799999999998, -0.122399999999999, -0.370800000000003, 0.193200000000004, -0.0848000000000013, 0.0867999999999967, -0.327200000000005, -0.285600000000002, 0.311400000000006, -0.128399999999999, -0.754999999999995, -0.209199999999996, -0.293599999999998, -0.364000000000004, -0.253600000000006, -0.821200000000005, -0.253600000000006, -0.510400000000004, -0.383399999999995, -0.491799999999998, -0.220200000000006, -0.0972000000000008, -0.557400000000001, -0.114599999999996, -0.295000000000002, -0.534800000000004, 0.346399999999988, -0.65379999999999, 0.0398000000000138, 0.0341999999999985, -0.995800000000003, -0.523400000000009, -0.489000000000004, -0.274799999999999, -0.574999999999989, -0.482799999999997, 0.0571999999999946, -0.330600000000004, -0.628800000000012, 
-0.140199999999993, -0.540600000000012, -0.445999999999998, -0.599400000000003, -0.262599999999992, 0.163399999999996, -0.100599999999986, -0.39500000000001, -1.06960000000001, -0.836399999999998, -0.753199999999993, -0.412399999999991, -0.790400000000005, -0.29679999999999, -0.28540000000001, -0.193000000000012, -0.0772000000000048, -0.962799999999987, -0.414800000000014}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p6{45.0, 44.1902, 43.271, 42.8358, 41.8142, 41.2854, 40.317, 39.354, 38.8924, 37.9436, 37.4596, 36.5262, 35.6248, 35.1574, 34.2822, 33.837, 32.9636, 32.074, 31.7042, 30.7976, 30.4772, 29.6564, 28.7942, 28.5004, 27.686, 27.291, 26.5672, 25.8556, 25.4982, 24.8204, 24.4252, 23.7744, 23.0786, 22.8344, 22.0294, 21.8098, 21.0794, 20.5732, 20.1878, 19.5648, 19.2902, 18.6784, 18.3352, 17.8946, 17.3712, 17.0852, 16.499, 16.2686, 15.6844, 15.2234, 14.9732, 14.3356, 14.2286, 13.7262, 13.3284, 13.1048, 12.5962, 12.3562, 12.1272, 11.4184, 11.4974, 11.0822, 10.856, 10.48, 10.2834, 10.0208, 9.637, 9.51739999999999, 9.05759999999999, 8.74760000000001, 8.42700000000001, 8.1326, 8.2372, 8.2788, 7.6776, 7.79259999999999, 7.1952, 6.9564, 6.6454, 6.87, 6.5428, 6.19999999999999, 6.02940000000001, 5.62780000000001, 5.6782, 5.792, 5.35159999999999, 5.28319999999999, 5.0394, 5.07480000000001, 4.49119999999999, 4.84899999999999, 4.696, 4.54040000000001, 4.07300000000001, 4.37139999999999, 3.7216, 3.7328, 3.42080000000001, 3.41839999999999, 3.94239999999999, 3.27719999999999, 3.411, 3.13079999999999, 2.76900000000001, 2.92580000000001, 2.68279999999999, 2.75020000000001, 2.70599999999999, 2.3886, 3.01859999999999, 2.45179999999999, 2.92699999999999, 2.41720000000001, 2.41139999999999, 2.03299999999999, 2.51240000000001, 2.5564, 2.60079999999999, 2.41720000000001, 1.80439999999999, 1.99700000000001, 2.45480000000001, 1.8948, 2.2346, 2.30860000000001, 2.15479999999999, 1.88419999999999, 1.6508, 0.677199999999999, 1.72540000000001, 1.4752, 1.72280000000001, 1.66139999999999, 
1.16759999999999, 1.79300000000001, 1.00059999999999, 0.905200000000008, 0.659999999999997, 1.55879999999999, 1.1636, 0.688199999999995, 0.712600000000009, 0.450199999999995, 1.1978, 0.975599999999986, 0.165400000000005, 1.727, 1.19739999999999, -0.252600000000001, 1.13460000000001, 1.3048, 1.19479999999999, 0.313400000000001, 0.878999999999991, 1.12039999999999, 0.853000000000009, 1.67920000000001, 0.856999999999999, 0.448599999999999, 1.2362, 0.953399999999988, 1.02859999999998, 0.563199999999995, 0.663000000000011, 0.723000000000013, 0.756599999999992, 0.256599999999992, -0.837600000000009, 0.620000000000005, 0.821599999999989, 0.216600000000028, 0.205600000000004, 0.220199999999977, 0.372599999999977, 0.334400000000016, 0.928400000000011, 0.972800000000007, 0.192400000000021, 0.487199999999973, -0.413000000000011, 0.807000000000016, 0.120600000000024, 0.769000000000005, 0.870799999999974, 0.66500000000002, 0.118200000000002, 0.401200000000017, 0.635199999999998, 0.135400000000004, 0.175599999999974, 1.16059999999999, 0.34620000000001, 0.521400000000028, -0.586599999999976, -1.16480000000001, 0.968399999999974, 0.836999999999989, 0.779600000000016, 0.985799999999983}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p7{91.0, 89.4934, 87.9758, 86.4574, 84.9718, 83.4954, 81.5302, 80.0756, 78.6374, 77.1782, 75.7888, 73.9522, 72.592, 71.2532, 69.9086, 68.5938, 66.9474, 65.6796, 64.4394, 63.2176, 61.9768, 60.4214, 59.2528, 58.0102, 56.8658, 55.7278, 54.3044, 53.1316, 52.093, 51.0032, 49.9092, 48.6306, 47.5294, 46.5756, 45.6508, 44.662, 43.552, 42.3724, 41.617, 40.5754, 39.7872, 38.8444, 37.7988, 36.8606, 36.2118, 35.3566, 34.4476, 33.5882, 32.6816, 32.0824, 31.0258, 30.6048, 29.4436, 28.7274, 27.957, 27.147, 26.4364, 25.7592, 25.3386, 24.781, 23.8028, 23.656, 22.6544, 21.996, 21.4718, 21.1544, 20.6098, 19.5956, 19.0616, 18.5758, 18.4878, 17.5244, 17.2146, 16.724, 15.8722, 15.5198, 15.0414, 14.941, 14.9048, 13.87, 13.4304, 13.028, 12.4708, 12.37, 12.0624, 11.4668, 11.5532, 
11.4352, 11.2564, 10.2744, 10.2118, 9.74720000000002, 10.1456, 9.2928, 8.75040000000001, 8.55279999999999, 8.97899999999998, 8.21019999999999, 8.18340000000001, 7.3494, 7.32499999999999, 7.66140000000001, 6.90300000000002, 7.25439999999998, 6.9042, 7.21499999999997, 6.28640000000001, 6.08139999999997, 6.6764, 6.30099999999999, 5.13900000000001, 5.65800000000002, 5.17320000000001, 4.59019999999998, 4.9538, 5.08280000000002, 4.92200000000003, 4.99020000000002, 4.7328, 5.4538, 4.11360000000002, 4.22340000000003, 4.08780000000002, 3.70800000000003, 4.15559999999999, 4.18520000000001, 3.63720000000001, 3.68220000000002, 3.77960000000002, 3.6078, 2.49160000000001, 3.13099999999997, 2.5376, 3.19880000000001, 3.21100000000001, 2.4502, 3.52820000000003, 2.91199999999998, 3.04480000000001, 2.7432, 2.85239999999999, 2.79880000000003, 2.78579999999999, 1.88679999999999, 2.98860000000002, 2.50639999999999, 1.91239999999999, 2.66160000000002, 2.46820000000002, 1.58199999999999, 1.30399999999997, 2.27379999999999, 2.68939999999998, 1.32900000000001, 3.10599999999999, 1.69080000000002, 2.13740000000001, 2.53219999999999, 1.88479999999998, 1.33240000000001, 1.45119999999997, 1.17899999999997, 2.44119999999998, 1.60659999999996, 2.16700000000003, 0.77940000000001, 2.37900000000002, 2.06700000000001, 1.46000000000004, 2.91160000000002, 1.69200000000001, 0.954600000000028, 2.49300000000005, 2.2722, 1.33500000000004, 2.44899999999996, 1.20140000000004, 3.07380000000001, 2.09739999999999, 2.85640000000001, 2.29960000000005, 2.40899999999999, 1.97040000000004, 0.809799999999996, 1.65279999999996, 2.59979999999996, 0.95799999999997, 2.06799999999998, 2.32780000000002, 4.20159999999998, 1.96320000000003, 1.86400000000003, 1.42999999999995, 3.77940000000001, 1.27200000000005, 1.86440000000005, 2.20600000000002, 3.21900000000005, 1.5154, 2.61019999999996}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p8{183.2152, 180.2454, 177.2096, 173.6652, 170.6312, 167.6822, 164.249, 161.3296, 158.0038, 155.2074, 
152.4612, 149.27, 146.5178, 143.4412, 140.8032, 138.1634, 135.1688, 132.6074, 129.6946, 127.2664, 124.8228, 122.0432, 119.6824, 116.9464, 114.6268, 112.2626, 109.8376, 107.4034, 104.8956, 102.8522, 100.7638, 98.3552, 96.3556, 93.7526, 91.9292, 89.8954, 87.8198, 85.7668, 83.298, 81.6688, 79.9466, 77.9746, 76.1672, 74.3474, 72.3028, 70.8912, 69.114, 67.4646, 65.9744, 64.4092, 62.6022, 60.843, 59.5684, 58.1652, 56.5426, 55.4152, 53.5388, 52.3592, 51.1366, 49.486, 48.3918, 46.5076, 45.509, 44.3834, 43.3498, 42.0668, 40.7346, 40.1228, 38.4528, 37.7, 36.644, 36.0518, 34.5774, 33.9068, 32.432, 32.1666, 30.434, 29.6644, 28.4894, 27.6312, 26.3804, 26.292, 25.5496000000001, 25.0234, 24.8206, 22.6146, 22.4188, 22.117, 20.6762, 20.6576, 19.7864, 19.509, 18.5334, 17.9204, 17.772, 16.2924, 16.8654, 15.1836, 15.745, 15.1316, 15.0386, 14.0136, 13.6342, 12.6196, 12.1866, 12.4281999999999, 11.3324, 10.4794000000001, 11.5038, 10.129, 9.52800000000002, 10.3203999999999, 9.46299999999997, 9.79280000000006, 9.12300000000005, 8.74180000000001, 9.2192, 7.51020000000005, 7.60659999999996, 7.01840000000004, 7.22239999999999, 7.40139999999997, 6.76179999999999, 7.14359999999999, 5.65060000000005, 5.63779999999997, 5.76599999999996, 6.75139999999999, 5.57759999999996, 3.73220000000003, 5.8048, 5.63019999999995, 4.93359999999996, 3.47979999999995, 4.33879999999999, 3.98940000000005, 3.81960000000004, 3.31359999999995, 3.23080000000004, 3.4588, 3.08159999999998, 3.4076, 3.00639999999999, 2.38779999999997, 2.61900000000003, 1.99800000000005, 3.34820000000002, 2.95060000000001, 0.990999999999985, 2.11440000000005, 2.20299999999997, 2.82219999999995, 2.73239999999998, 2.7826, 3.76660000000004, 2.26480000000004, 2.31280000000004, 2.40819999999997, 2.75360000000001, 3.33759999999995, 2.71559999999999, 1.7478000000001, 1.42920000000004, 2.39300000000003, 2.22779999999989, 2.34339999999997, 0.87259999999992, 3.88400000000001, 1.80600000000004, 1.91759999999999, 1.16779999999994, 1.50320000000011, 
2.52500000000009, 0.226400000000012, 2.31500000000005, 0.930000000000064, 1.25199999999995, 2.14959999999996, 0.0407999999999902, 2.5447999999999, 1.32960000000003, 0.197400000000016, 2.52620000000002, 3.33279999999991, -1.34300000000007, 0.422199999999975, 0.917200000000093, 1.12920000000008, 1.46060000000011, 1.45779999999991, 2.8728000000001, 3.33359999999993, -1.34079999999994, 1.57680000000005, 0.363000000000056, 1.40740000000005, 0.656600000000026, 0.801400000000058, -0.454600000000028, 1.51919999999996}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p9{368.0, 361.8294, 355.2452, 348.6698, 342.1464, 336.2024, 329.8782, 323.6598, 317.462, 311.2826, 305.7102, 299.7416, 293.9366, 288.1046, 282.285, 277.0668, 271.306, 265.8448, 260.301, 254.9886, 250.2422, 244.8138, 239.7074, 234.7428, 229.8402, 225.1664, 220.3534, 215.594, 210.6886, 205.7876, 201.65, 197.228, 192.8036, 188.1666, 184.0818, 180.0824, 176.2574, 172.302, 168.1644, 164.0056, 160.3802, 156.7192, 152.5234, 149.2084, 145.831, 142.485, 139.1112, 135.4764, 131.76, 129.3368, 126.5538, 122.5058, 119.2646, 116.5902, 113.3818, 110.8998, 107.9532, 105.2062, 102.2798, 99.4728, 96.9582, 94.3292, 92.171, 89.7809999999999, 87.5716, 84.7048, 82.5322, 79.875, 78.3972, 75.3464, 73.7274, 71.2834, 70.1444, 68.4263999999999, 66.0166, 64.018, 62.0437999999999, 60.3399999999999, 58.6856, 57.9836, 55.0311999999999, 54.6769999999999, 52.3188, 51.4846, 49.4423999999999, 47.739, 46.1487999999999, 44.9202, 43.4059999999999, 42.5342000000001, 41.2834, 38.8954000000001, 38.3286000000001, 36.2146, 36.6684, 35.9946, 33.123, 33.4338, 31.7378000000001, 29.076, 28.9692, 27.4964, 27.0998, 25.9864, 26.7754, 24.3208, 23.4838, 22.7388000000001, 24.0758000000001, 21.9097999999999, 20.9728, 19.9228000000001, 19.9292, 16.617, 17.05, 18.2996000000001, 15.6128000000001, 15.7392, 14.5174, 13.6322, 12.2583999999999, 13.3766000000001, 11.423, 13.1232, 9.51639999999998, 10.5938000000001, 9.59719999999993, 8.12220000000002, 9.76739999999995, 
7.50440000000003, 7.56999999999994, 6.70440000000008, 6.41419999999994, 6.71019999999999, 5.60940000000005, 4.65219999999999, 6.84099999999989, 3.4072000000001, 3.97859999999991, 3.32760000000007, 5.52160000000003, 3.31860000000006, 2.06940000000009, 4.35400000000004, 1.57500000000005, 0.280799999999999, 2.12879999999996, -0.214799999999968, -0.0378000000000611, -0.658200000000079, 0.654800000000023, -0.0697999999999865, 0.858400000000074, -2.52700000000004, -2.1751999999999, -3.35539999999992, -1.04019999999991, -0.651000000000067, -2.14439999999991, -1.96659999999997, -3.97939999999994, -0.604400000000169, -3.08260000000018, -3.39159999999993, -5.29640000000018, -5.38920000000007, -5.08759999999984, -4.69900000000007, -5.23720000000003, -3.15779999999995, -4.97879999999986, -4.89899999999989, -7.48880000000008, -5.94799999999987, -5.68060000000014, -6.67180000000008, -4.70499999999993, -7.27779999999984, -4.6579999999999, -4.4362000000001, -4.32139999999981, -5.18859999999995, -6.66879999999992, -6.48399999999992, -5.1260000000002, -4.4032000000002, -6.13500000000022, -5.80819999999994, -4.16719999999987, -4.15039999999999, -7.45600000000013, -7.24080000000004, -9.83179999999993, -5.80420000000004, -8.6561999999999, -6.99940000000015, -10.5473999999999, -7.34139999999979, -6.80999999999995, -6.29719999999998, -6.23199999999997}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p10{737.1256, 724.4234, 711.1064, 698.4732, 685.4636, 673.0644, 660.488, 647.9654, 636.0832, 623.7864, 612.1992, 600.2176, 588.5228, 577.1716, 565.7752, 554.899, 543.6126, 532.6492, 521.9474, 511.5214, 501.1064, 490.6364, 480.2468, 470.4588, 460.3832, 451.0584, 440.8606, 431.3868, 422.5062, 413.1862, 404.463, 395.339, 386.1936, 378.1292, 369.1854, 361.2908, 353.3324, 344.8518, 337.5204, 329.4854, 321.9318, 314.552, 306.4658, 299.4256, 292.849, 286.152, 278.8956, 271.8792, 265.118, 258.62, 252.5132, 245.9322, 239.7726, 233.6086, 227.5332, 222.5918, 216.4294, 210.7662, 205.4106, 199.7338, 194.9012, 
188.4486, 183.1556, 178.6338, 173.7312, 169.6264, 163.9526, 159.8742, 155.8326, 151.1966, 147.5594, 143.07, 140.037, 134.1804, 131.071, 127.4884, 124.0848, 120.2944, 117.333, 112.9626, 110.2902, 107.0814, 103.0334, 99.4832000000001, 96.3899999999999, 93.7202000000002, 90.1714000000002, 87.2357999999999, 85.9346, 82.8910000000001, 80.0264000000002, 78.3834000000002, 75.1543999999999, 73.8683999999998, 70.9895999999999, 69.4367999999999, 64.8701999999998, 65.0408000000002, 61.6738, 59.5207999999998, 57.0158000000001, 54.2302, 53.0962, 50.4985999999999, 52.2588000000001, 47.3914, 45.6244000000002, 42.8377999999998, 43.0072, 40.6516000000001, 40.2453999999998, 35.2136, 36.4546, 33.7849999999999, 33.2294000000002, 32.4679999999998, 30.8670000000002, 28.6507999999999, 28.9099999999999, 27.5983999999999, 26.1619999999998, 24.5563999999999, 23.2328000000002, 21.9484000000002, 21.5902000000001, 21.3346000000001, 17.7031999999999, 20.6111999999998, 19.5545999999999, 15.7375999999999, 17.0720000000001, 16.9517999999998, 15.326, 13.1817999999998, 14.6925999999999, 13.0859999999998, 13.2754, 10.8697999999999, 11.248, 7.3768, 4.72339999999986, 7.97899999999981, 8.7503999999999, 7.68119999999999, 9.7199999999998, 7.73919999999998, 5.6224000000002, 7.44560000000001, 6.6601999999998, 5.9058, 4.00199999999995, 4.51699999999983, 4.68240000000014, 3.86220000000003, 5.13639999999987, 5.98500000000013, 2.47719999999981, 2.61999999999989, 1.62800000000016, 4.65000000000009, 0.225599999999758, 0.831000000000131, -0.359400000000278, 1.27599999999984, -2.92559999999958, -0.0303999999996449, 2.37079999999969, -2.0033999999996, 0.804600000000391, 0.30199999999968, 1.1247999999996, -2.6880000000001, 0.0321999999996478, -1.18099999999959, -3.9402, -1.47940000000017, -0.188400000000001, -2.10720000000038, -2.04159999999956, -3.12880000000041, -4.16160000000036, -0.612799999999879, -3.48719999999958, -8.17900000000009, -5.37780000000021, -4.01379999999972, -5.58259999999973, -5.73719999999958, 
-7.66799999999967, -5.69520000000011, -1.1247999999996, -5.58520000000044, -8.04560000000038, -4.64840000000004, -11.6468000000004, -7.97519999999986, -5.78300000000036, -7.67420000000038, -10.6328000000003, -9.81720000000041}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p11{1476.0, 1449.6014, 1423.5802, 1397.7942, 1372.3042, 1347.2062, 1321.8402, 1297.2292, 1272.9462, 1248.9926, 1225.3026, 1201.4252, 1178.0578, 1155.6092, 1132.626, 1110.5568, 1088.527, 1066.5154, 1045.1874, 1024.3878, 1003.37, 982.1972, 962.5728, 942.1012, 922.9668, 903.292, 884.0772, 864.8578, 846.6562, 828.041, 809.714, 792.3112, 775.1806, 757.9854, 740.656, 724.346, 707.5154, 691.8378, 675.7448, 659.6722, 645.5722, 630.1462, 614.4124, 600.8728, 585.898, 572.408, 558.4926, 544.4938, 531.6776, 517.282, 505.7704, 493.1012, 480.7388, 467.6876, 456.1872, 445.5048, 433.0214, 420.806, 411.409, 400.4144, 389.4294, 379.2286, 369.651, 360.6156, 350.337, 342.083, 332.1538, 322.5094, 315.01, 305.6686, 298.1678, 287.8116, 280.9978, 271.9204, 265.3286, 257.5706, 249.6014, 242.544, 235.5976, 229.583, 220.9438, 214.672, 208.2786, 201.8628, 195.1834, 191.505, 186.1816, 178.5188, 172.2294, 167.8908, 161.0194, 158.052, 151.4588, 148.1596, 143.4344, 138.5238, 133.13, 127.6374, 124.8162, 118.7894, 117.3984, 114.6078, 109.0858, 105.1036, 103.6258, 98.6018000000004, 95.7618000000002, 93.5821999999998, 88.5900000000001, 86.9992000000002, 82.8800000000001, 80.4539999999997, 74.6981999999998, 74.3644000000004, 73.2914000000001, 65.5709999999999, 66.9232000000002, 65.1913999999997, 62.5882000000001, 61.5702000000001, 55.7035999999998, 56.1764000000003, 52.7596000000003, 53.0302000000001, 49.0609999999997, 48.4694, 44.933, 46.0474000000004, 44.7165999999997, 41.9416000000001, 39.9207999999999, 35.6328000000003, 35.5276000000003, 33.1934000000001, 33.2371999999996, 33.3864000000003, 33.9228000000003, 30.2371999999996, 29.1373999999996, 25.2272000000003, 24.2942000000003, 19.8338000000003, 18.9005999999999, 23.0907999999999, 
21.8544000000002, 19.5176000000001, 15.4147999999996, 16.9314000000004, 18.6737999999996, 12.9877999999999, 14.3688000000002, 12.0447999999997, 15.5219999999999, 12.5299999999997, 14.5940000000001, 14.3131999999996, 9.45499999999993, 12.9441999999999, 3.91139999999996, 13.1373999999996, 5.44720000000052, 9.82779999999912, 7.87279999999919, 3.67760000000089, 5.46980000000076, 5.55099999999948, 5.65979999999945, 3.89439999999922, 3.1275999999998, 5.65140000000065, 6.3062000000009, 3.90799999999945, 1.87060000000019, 5.17020000000048, 2.46680000000015, 0.770000000000437, -3.72340000000077, 1.16400000000067, 8.05340000000069, 0.135399999999208, 2.15940000000046, 0.766999999999825, 1.0594000000001, 3.15500000000065, -0.287399999999252, 2.37219999999979, -2.86620000000039, -1.63199999999961, -2.22979999999916, -0.15519999999924, -1.46039999999994, -0.262199999999211, -2.34460000000036, -2.8078000000005, -3.22179999999935, -5.60159999999996, -8.42200000000048, -9.43740000000071, 0.161799999999857, -10.4755999999998, -10.0823999999993}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p12{2953.0, 2900.4782, 2848.3568, 2796.3666, 2745.324, 2694.9598, 2644.648, 2595.539, 2546.1474, 2498.2576, 2450.8376, 2403.6076, 2357.451, 2311.38, 2266.4104, 2221.5638, 2176.9676, 2134.193, 2090.838, 2048.8548, 2007.018, 1966.1742, 1925.4482, 1885.1294, 1846.4776, 1807.4044, 1768.8724, 1731.3732, 1693.4304, 1657.5326, 1621.949, 1586.5532, 1551.7256, 1517.6182, 1483.5186, 1450.4528, 1417.865, 1385.7164, 1352.6828, 1322.6708, 1291.8312, 1260.9036, 1231.476, 1201.8652, 1173.6718, 1145.757, 1119.2072, 1092.2828, 1065.0434, 1038.6264, 1014.3192, 988.5746, 965.0816, 940.1176, 917.9796, 894.5576, 871.1858, 849.9144, 827.1142, 805.0818, 783.9664, 763.9096, 742.0816, 724.3962, 706.3454, 688.018, 667.4214, 650.3106, 633.0686, 613.8094, 597.818, 581.4248, 563.834, 547.363, 531.5066, 520.455400000001, 505.583199999999, 488.366, 476.480799999999, 459.7682, 450.0522, 434.328799999999, 423.952799999999, 
408.727000000001, 399.079400000001, 387.252200000001, 373.987999999999, 360.852000000001, 351.6394, 339.642, 330.902400000001, 322.661599999999, 311.662200000001, 301.3254, 291.7484, 279.939200000001, 276.7508, 263.215200000001, 254.811400000001, 245.5494, 242.306399999999, 234.8734, 223.787200000001, 217.7156, 212.0196, 200.793, 195.9748, 189.0702, 182.449199999999, 177.2772, 170.2336, 164.741, 158.613600000001, 155.311, 147.5964, 142.837, 137.3724, 132.0162, 130.0424, 121.9804, 120.451800000001, 114.8968, 111.585999999999, 105.933199999999, 101.705, 98.5141999999996, 95.0488000000005, 89.7880000000005, 91.4750000000004, 83.7764000000006, 80.9698000000008, 72.8574000000008, 73.1615999999995, 67.5838000000003, 62.6263999999992, 63.2638000000006, 66.0977999999996, 52.0843999999997, 58.9956000000002, 47.0912000000008, 46.4956000000002, 48.4383999999991, 47.1082000000006, 43.2392, 37.2759999999998, 40.0283999999992, 35.1864000000005, 35.8595999999998, 32.0998, 28.027, 23.6694000000007, 33.8266000000003, 26.3736000000008, 27.2008000000005, 21.3245999999999, 26.4115999999995, 23.4521999999997, 19.5013999999992, 19.8513999999996, 10.7492000000002, 18.6424000000006, 13.1265999999996, 18.2436000000016, 6.71860000000015, 3.39459999999963, 6.33759999999893, 7.76719999999841, 0.813999999998487, 3.82819999999992, 0.826199999999517, 8.07440000000133, -1.59080000000176, 5.01780000000144, 0.455399999998917, -0.24199999999837, 0.174800000000687, -9.07640000000174, -4.20160000000033, -3.77520000000004, -4.75179999999818, -5.3724000000002, -8.90680000000066, -6.10239999999976, -5.74120000000039, -9.95339999999851, -3.86339999999836, -13.7304000000004, -16.2710000000006, -7.51359999999841, -3.30679999999847, -13.1339999999982, -10.0551999999989, -6.72019999999975, -8.59660000000076, -10.9307999999983, -1.8775999999998, -4.82259999999951, -13.7788, -21.6470000000008, -10.6735999999983, -15.7799999999988}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p13{5907.5052, 5802.2672, 5697.347, 
5593.5794, 5491.2622, 5390.5514, 5290.3376, 5191.6952, 5093.5988, 4997.3552, 4902.5972, 4808.3082, 4715.5646, 4624.109, 4533.8216, 4444.4344, 4356.3802, 4269.2962, 4183.3784, 4098.292, 4014.79, 3932.4574, 3850.6036, 3771.2712, 3691.7708, 3615.099, 3538.1858, 3463.4746, 3388.8496, 3315.6794, 3244.5448, 3173.7516, 3103.3106, 3033.6094, 2966.5642, 2900.794, 2833.7256, 2769.81, 2707.3196, 2644.0778, 2583.9916, 2523.4662, 2464.124, 2406.073, 2347.0362, 2292.1006, 2238.1716, 2182.7514, 2128.4884, 2077.1314, 2025.037, 1975.3756, 1928.933, 1879.311, 1831.0006, 1783.2144, 1738.3096, 1694.5144, 1649.024, 1606.847, 1564.7528, 1525.3168, 1482.5372, 1443.9668, 1406.5074, 1365.867, 1329.2186, 1295.4186, 1257.9716, 1225.339, 1193.2972, 1156.3578, 1125.8686, 1091.187, 1061.4094, 1029.4188, 1000.9126, 972.3272, 944.004199999999, 915.7592, 889.965, 862.834200000001, 840.4254, 812.598399999999, 785.924200000001, 763.050999999999, 741.793799999999, 721.466, 699.040799999999, 677.997200000002, 649.866999999998, 634.911800000002, 609.8694, 591.981599999999, 570.2922, 557.129199999999, 538.3858, 521.872599999999, 502.951400000002, 495.776399999999, 475.171399999999, 459.751, 439.995200000001, 426.708999999999, 413.7016, 402.3868, 387.262599999998, 372.0524, 357.050999999999, 342.5098, 334.849200000001, 322.529399999999, 311.613799999999, 295.848000000002, 289.273000000001, 274.093000000001, 263.329600000001, 251.389599999999, 245.7392, 231.9614, 229.7952, 217.155200000001, 208.9588, 199.016599999999, 190.839199999999, 180.6976, 176.272799999999, 166.976999999999, 162.5252, 151.196400000001, 149.386999999999, 133.981199999998, 130.0586, 130.164000000001, 122.053400000001, 110.7428, 108.1276, 106.232400000001, 100.381600000001, 98.7668000000012, 86.6440000000002, 79.9768000000004, 82.4722000000002, 68.7026000000005, 70.1186000000016, 71.9948000000004, 58.998599999999, 59.0492000000013, 56.9818000000014, 47.5338000000011, 42.9928, 51.1591999999982, 37.2740000000013, 42.7220000000016, 
31.3734000000004, 26.8090000000011, 25.8934000000008, 26.5286000000015, 29.5442000000003, 19.3503999999994, 26.0760000000009, 17.9527999999991, 14.8419999999969, 10.4683999999979, 8.65899999999965, 9.86720000000059, 4.34139999999752, -0.907800000000861, -3.32080000000133, -0.936199999996461, -11.9916000000012, -8.87000000000262, -6.33099999999831, -11.3366000000024, -15.9207999999999, -9.34659999999712, -15.5034000000014, -19.2097999999969, -15.357799999998, -28.2235999999975, -30.6898000000001, -19.3271999999997, -25.6083999999973, -24.409599999999, -13.6385999999984, -33.4473999999973, -32.6949999999997, -28.9063999999998, -31.7483999999968, -32.2935999999972, -35.8329999999987, -47.620600000002, -39.0855999999985, -33.1434000000008, -46.1371999999974, -37.5892000000022, -46.8164000000033, -47.3142000000007, -60.2914000000019, -37.7575999999972}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p14{11816.475, 11605.0046, 11395.3792, 11188.7504, 10984.1814, 10782.0086, 10582.0072, 10384.503, 10189.178, 9996.2738, 9806.0344, 9617.9798, 9431.394, 9248.7784, 9067.6894, 8889.6824, 8712.9134, 8538.8624, 8368.4944, 8197.7956, 8031.8916, 7866.6316, 7703.733, 7544.5726, 7386.204, 7230.666, 7077.8516, 6926.7886, 6778.6902, 6631.9632, 6487.304, 6346.7486, 6206.4408, 6070.202, 5935.2576, 5799.924, 5671.0324, 5541.9788, 5414.6112, 5290.0274, 5166.723, 5047.6906, 4929.162, 4815.1406, 4699.127, 4588.5606, 4477.7394, 4369.4014, 4264.2728, 4155.9224, 4055.581, 3955.505, 3856.9618, 3761.3828, 3666.9702, 3575.7764, 3482.4132, 3395.0186, 3305.8852, 3221.415, 3138.6024, 3056.296, 2970.4494, 2896.1526, 2816.8008, 2740.2156, 2670.497, 2594.1458, 2527.111, 2460.8168, 2387.5114, 2322.9498, 2260.6752, 2194.2686, 2133.7792, 2074.767, 2015.204, 1959.4226, 1898.6502, 1850.006, 1792.849, 1741.4838, 1687.9778, 1638.1322, 1589.3266, 1543.1394, 1496.8266, 1447.8516, 1402.7354, 1361.9606, 1327.0692, 1285.4106, 1241.8112, 1201.6726, 1161.973, 1130.261, 1094.2036, 1048.2036, 1020.6436, 990.901400000002, 
961.199800000002, 924.769800000002, 899.526400000002, 872.346400000002, 834.375, 810.432000000001, 780.659800000001, 756.013800000001, 733.479399999997, 707.923999999999, 673.858, 652.222399999999, 636.572399999997, 615.738599999997, 586.696400000001, 564.147199999999, 541.679600000003, 523.943599999999, 505.714599999999, 475.729599999999, 461.779600000002, 449.750800000002, 439.020799999998, 412.7886, 400.245600000002, 383.188199999997, 362.079599999997, 357.533799999997, 334.319000000003, 327.553399999997, 308.559399999998, 291.270199999999, 279.351999999999, 271.791400000002, 252.576999999997, 247.482400000001, 236.174800000001, 218.774599999997, 220.155200000001, 208.794399999999, 201.223599999998, 182.995600000002, 185.5268, 164.547400000003, 176.5962, 150.689599999998, 157.8004, 138.378799999999, 134.021200000003, 117.614399999999, 108.194000000003, 97.0696000000025, 89.6042000000016, 95.6030000000028, 84.7810000000027, 72.635000000002, 77.3482000000004, 59.4907999999996, 55.5875999999989, 50.7346000000034, 61.3916000000027, 50.9149999999936, 39.0384000000049, 58.9395999999979, 29.633600000001, 28.2032000000036, 26.0078000000067, 17.0387999999948, 9.22000000000116, 13.8387999999977, 8.07240000000456, 14.1549999999988, 15.3570000000036, 3.42660000000615, 6.24820000000182, -2.96940000000177, -8.79940000000352, -5.97860000000219, -14.4048000000039, -3.4143999999942, -13.0148000000045, -11.6977999999945, -25.7878000000055, -22.3185999999987, -24.409599999999, -31.9756000000052, -18.9722000000038, -22.8678000000073, -30.8972000000067, -32.3715999999986, -22.3907999999938, -43.6720000000059, -35.9038, -39.7492000000057, -54.1641999999993, -45.2749999999942, -42.2989999999991, -44.1089999999967, -64.3564000000042, -49.9551999999967, -42.6116000000038}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p15{23634.0036, 23210.8034, 22792.4744, 22379.1524, 21969.7928, 21565.326, 21165.3532, 20770.2806, 20379.9892, 19994.7098, 19613.318, 19236.799, 18865.4382, 18498.8244, 18136.5138, 
17778.8668, 17426.2344, 17079.32, 16734.778, 16397.2418, 16063.3324, 15734.0232, 15409.731, 15088.728, 14772.9896, 14464.1402, 14157.5588, 13855.5958, 13559.3296, 13264.9096, 12978.326, 12692.0826, 12413.8816, 12137.3192, 11870.2326, 11602.5554, 11340.3142, 11079.613, 10829.5908, 10583.5466, 10334.0344, 10095.5072, 9859.694, 9625.2822, 9395.7862, 9174.0586, 8957.3164, 8738.064, 8524.155, 8313.7396, 8116.9168, 7913.542, 7718.4778, 7521.65, 7335.5596, 7154.2906, 6968.7396, 6786.3996, 6613.236, 6437.406, 6270.6598, 6107.7958, 5945.7174, 5787.6784, 5635.5784, 5482.308, 5337.9784, 5190.0864, 5045.9158, 4919.1386, 4771.817, 4645.7742, 4518.4774, 4385.5454, 4262.6622, 4142.74679999999, 4015.5318, 3897.9276, 3790.7764, 3685.13800000001, 3573.6274, 3467.9706, 3368.61079999999, 3271.5202, 3170.3848, 3076.4656, 2982.38400000001, 2888.4664, 2806.4868, 2711.9564, 2634.1434, 2551.3204, 2469.7662, 2396.61139999999, 2318.9902, 2243.8658, 2171.9246, 2105.01360000001, 2028.8536, 1960.9952, 1901.4096, 1841.86079999999, 1777.54700000001, 1714.5802, 1654.65059999999, 1596.311, 1546.2016, 1492.3296, 1433.8974, 1383.84600000001, 1339.4152, 1293.5518, 1245.8686, 1193.50659999999, 1162.27959999999, 1107.19439999999, 1069.18060000001, 1035.09179999999, 999.679000000004, 957.679999999993, 925.300199999998, 888.099400000006, 848.638600000006, 818.156400000007, 796.748399999997, 752.139200000005, 725.271200000003, 692.216, 671.633600000001, 647.939799999993, 621.670599999998, 575.398799999995, 561.226599999995, 532.237999999998, 521.787599999996, 483.095799999996, 467.049599999998, 465.286399999997, 415.548599999995, 401.047399999996, 380.607999999993, 377.362599999993, 347.258799999996, 338.371599999999, 310.096999999994, 301.409199999995, 276.280799999993, 265.586800000005, 258.994399999996, 223.915999999997, 215.925399999993, 213.503800000006, 191.045400000003, 166.718200000003, 166.259000000005, 162.941200000001, 148.829400000002, 141.645999999993, 123.535399999993, 122.329800000007, 
89.473399999988, 80.1962000000058, 77.5457999999926, 59.1056000000099, 83.3509999999951, 52.2906000000075, 36.3979999999865, 40.6558000000077, 42.0003999999899, 19.6630000000005, 19.7153999999864, -8.38539999999921, -0.692799999989802, 0.854800000000978, 3.23219999999856, -3.89040000000386, -5.25880000001052, -24.9052000000083, -22.6837999999989, -26.4286000000138, -34.997000000003, -37.0216000000073, -43.430400000012, -58.2390000000014, -68.8034000000043, -56.9245999999985, -57.8583999999973, -77.3097999999882, -73.2793999999994, -81.0738000000129, -87.4530000000086, -65.0254000000132, -57.296399999992, -96.2746000000043, -103.25, -96.081600000005, -91.5542000000132, -102.465200000006, -107.688599999994, -101.458000000013, -109.715800000005}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p16{47270.0, 46423.3584, 45585.7074, 44757.152, 43938.8416, 43130.9514, 42330.03, 41540.407, 40759.6348, 39988.206, 39226.5144, 38473.2096, 37729.795, 36997.268, 36272.6448, 35558.665, 34853.0248, 34157.4472, 33470.5204, 32793.5742, 32127.0194, 31469.4182, 30817.6136, 30178.6968, 29546.8908, 28922.8544, 28312.271, 27707.0924, 27114.0326, 26526.692, 25948.6336, 25383.7826, 24823.5998, 24272.2974, 23732.2572, 23201.4976, 22674.2796, 22163.6336, 21656.515, 21161.7362, 20669.9368, 20189.4424, 19717.3358, 19256.3744, 18795.9638, 18352.197, 17908.5738, 17474.391, 17052.918, 16637.2236, 16228.4602, 15823.3474, 15428.6974, 15043.0284, 14667.6278, 14297.4588, 13935.2882, 13578.5402, 13234.6032, 12882.1578, 12548.0728, 12219.231, 11898.0072, 11587.2626, 11279.9072, 10973.5048, 10678.5186, 10392.4876, 10105.2556, 9825.766, 9562.5444, 9294.2222, 9038.2352, 8784.848, 8533.2644, 8301.7776, 8058.30859999999, 7822.94579999999, 7599.11319999999, 7366.90779999999, 7161.217, 6957.53080000001, 6736.212, 6548.21220000001, 6343.06839999999, 6156.28719999999, 5975.15419999999, 5791.75719999999, 5621.32019999999, 5451.66, 5287.61040000001, 5118.09479999999, 4957.288, 4798.4246, 4662.17559999999, 4512.05900000001, 
4364.68539999999, 4220.77720000001, 4082.67259999999, 3957.19519999999, 3842.15779999999, 3699.3328, 3583.01180000001, 3473.8964, 3338.66639999999, 3233.55559999999, 3117.799, 3008.111, 2909.69140000001, 2814.86499999999, 2719.46119999999, 2624.742, 2532.46979999999, 2444.7886, 2370.1868, 2272.45259999999, 2196.19260000001, 2117.90419999999, 2023.2972, 1969.76819999999, 1885.58979999999, 1833.2824, 1733.91200000001, 1682.54920000001, 1604.57980000001, 1556.11240000001, 1491.3064, 1421.71960000001, 1371.22899999999, 1322.1324, 1264.7892, 1196.23920000001, 1143.8474, 1088.67240000001, 1073.60380000001, 1023.11660000001, 959.036400000012, 927.433199999999, 906.792799999996, 853.433599999989, 841.873800000001, 791.1054, 756.899999999994, 704.343200000003, 672.495599999995, 622.790399999998, 611.254799999995, 567.283200000005, 519.406599999988, 519.188400000014, 495.312800000014, 451.350799999986, 443.973399999988, 431.882199999993, 392.027000000002, 380.924200000009, 345.128999999986, 298.901400000002, 287.771999999997, 272.625, 247.253000000026, 222.490600000019, 223.590000000026, 196.407599999977, 176.425999999978, 134.725199999986, 132.4804, 110.445599999977, 86.7939999999944, 56.7038000000175, 64.915399999998, 38.3726000000024, 37.1606000000029, 46.170999999973, 49.1716000000015, 15.3362000000197, 6.71639999997569, -34.8185999999987, -39.4476000000141, 12.6830000000191, -12.3331999999937, -50.6565999999875, -59.9538000000175, -65.1054000000004, -70.7576000000117, -106.325200000021, -126.852200000023, -110.227599999984, -132.885999999999, -113.897200000007, -142.713800000027, -151.145399999979, -150.799200000009, -177.756200000003, -156.036399999983, -182.735199999996, -177.259399999981, -198.663600000029, -174.577600000019, -193.84580000001}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p17{94541.0, 92848.811, 91174.019, 89517.558, 87879.9705, 86262.7565, 84663.5125, 83083.7435, 81521.7865, 79977.272, 78455.9465, 76950.219, 75465.432, 73994.152, 72546.71, 71115.2345, 
69705.6765, 68314.937, 66944.2705, 65591.255, 64252.9485, 62938.016, 61636.8225, 60355.592, 59092.789, 57850.568, 56624.518, 55417.343, 54231.1415, 53067.387, 51903.526, 50774.649, 49657.6415, 48561.05, 47475.7575, 46410.159, 45364.852, 44327.053, 43318.4005, 42325.6165, 41348.4595, 40383.6265, 39436.77, 38509.502, 37594.035, 36695.939, 35818.6895, 34955.691, 34115.8095, 33293.949, 32465.0775, 31657.6715, 30877.2585, 30093.78, 29351.3695, 28594.1365, 27872.115, 27168.7465, 26477.076, 25774.541, 25106.5375, 24452.5135, 23815.5125, 23174.0655, 22555.2685, 21960.2065, 21376.3555, 20785.1925, 20211.517, 19657.0725, 19141.6865, 18579.737, 18081.3955, 17578.995, 17073.44, 16608.335, 16119.911, 15651.266, 15194.583, 14749.0495, 14343.4835, 13925.639, 13504.509, 13099.3885, 12691.2855, 12328.018, 11969.0345, 11596.5145, 11245.6355, 10917.6575, 10580.9785, 10277.8605, 9926.58100000001, 9605.538, 9300.42950000003, 8989.97850000003, 8728.73249999998, 8448.3235, 8175.31050000002, 7898.98700000002, 7629.79100000003, 7413.76199999999, 7149.92300000001, 6921.12650000001, 6677.1545, 6443.28000000003, 6278.23450000002, 6014.20049999998, 5791.20299999998, 5605.78450000001, 5438.48800000001, 5234.2255, 5059.6825, 4887.43349999998, 4682.935, 4496.31099999999, 4322.52250000002, 4191.42499999999, 4021.24200000003, 3900.64799999999, 3762.84250000003, 3609.98050000001, 3502.29599999997, 3363.84250000003, 3206.54849999998, 3079.70000000001, 2971.42300000001, 2867.80349999998, 2727.08100000001, 2630.74900000001, 2496.6165, 2440.902, 2356.19150000002, 2235.58199999999, 2120.54149999999, 2012.25449999998, 1933.35600000003, 1820.93099999998, 1761.54800000001, 1663.09350000002, 1578.84600000002, 1509.48149999999, 1427.3345, 1379.56150000001, 1306.68099999998, 1212.63449999999, 1084.17300000001, 1124.16450000001, 1060.69949999999, 1007.48849999998, 941.194499999983, 879.880500000028, 836.007500000007, 782.802000000025, 748.385499999975, 647.991500000004, 626.730500000005, 570.776000000013, 
484.000500000024, 513.98550000001, 418.985499999952, 386.996999999974, 370.026500000036, 355.496999999974, 356.731499999994, 255.92200000002, 259.094000000041, 205.434499999974, 165.374500000034, 197.347500000033, 95.718499999959, 67.6165000000037, 54.6970000000438, 31.7395000000251, -15.8784999999916, 8.42500000004657, -26.3754999999655, -118.425500000012, -66.6629999999423, -42.9745000000112, -107.364999999991, -189.839000000036, -162.611499999999, -164.964999999967, -189.079999999958, -223.931499999948, -235.329999999958, -269.639500000048, -249.087999999989, -206.475499999942, -283.04449999996, -290.667000000016, -304.561499999953, -336.784499999951, -380.386500000022, -283.280499999993, -364.533000000054, -389.059499999974, -364.454000000027, -415.748000000021, -417.155000000028}; +CUCO_HLL_TUNING_ARR_DECL bias_data_p18{189083.0, 185696.913, 182348.774, 179035.946, 175762.762, 172526.444, 169329.754, 166166.099, 163043.269, 159958.91, 156907.912, 153906.845, 150924.199, 147996.568, 145093.457, 142239.233, 139421.475, 136632.27, 133889.588, 131174.2, 128511.619, 125868.621, 123265.385, 120721.061, 118181.769, 115709.456, 113252.446, 110840.198, 108465.099, 106126.164, 103823.469, 101556.618, 99308.004, 97124.508, 94937.803, 92833.731, 90745.061, 88677.627, 86617.47, 84650.442, 82697.833, 80769.132, 78879.629, 77014.432, 75215.626, 73384.587, 71652.482, 69895.93, 68209.301, 66553.669, 64921.981, 63310.323, 61742.115, 60205.018, 58698.658, 57190.657, 55760.865, 54331.169, 52908.167, 51550.273, 50225.254, 48922.421, 47614.533, 46362.049, 45098.569, 43926.083, 42736.03, 41593.473, 40425.26, 39316.237, 38243.651, 37170.617, 36114.609, 35084.19, 34117.233, 33206.509, 32231.505, 31318.728, 30403.404, 29540.0550000001, 28679.236, 27825.862, 26965.216, 26179.148, 25462.08, 24645.952, 23922.523, 23198.144, 22529.128, 21762.4179999999, 21134.779, 20459.117, 19840.818, 19187.04, 18636.3689999999, 17982.831, 17439.7389999999, 16874.547, 16358.2169999999, 15835.684, 
15352.914, 14823.681, 14329.313, 13816.897, 13342.874, 12880.882, 12491.648, 12021.254, 11625.392, 11293.7610000001, 10813.697, 10456.209, 10099.074, 9755.39000000001, 9393.18500000006, 9047.57900000003, 8657.98499999999, 8395.85900000005, 8033.0, 7736.95900000003, 7430.59699999995, 7258.47699999996, 6924.58200000005, 6691.29399999999, 6357.92500000005, 6202.05700000003, 5921.19700000004, 5628.28399999999, 5404.96799999999, 5226.71100000001, 4990.75600000005, 4799.77399999998, 4622.93099999998, 4472.478, 4171.78700000001, 3957.46299999999, 3868.95200000005, 3691.14300000004, 3474.63100000005, 3341.67200000002, 3109.14000000001, 3071.97400000005, 2796.40399999998, 2756.17799999996, 2611.46999999997, 2471.93000000005, 2382.26399999997, 2209.22400000005, 2142.28399999999, 2013.96100000001, 1911.18999999994, 1818.27099999995, 1668.47900000005, 1519.65800000005, 1469.67599999998, 1367.13800000004, 1248.52899999998, 1181.23600000003, 1022.71900000004, 1088.20700000005, 959.03600000008, 876.095999999903, 791.183999999892, 703.337000000058, 731.949999999953, 586.86400000006, 526.024999999907, 323.004999999888, 320.448000000091, 340.672999999952, 309.638999999966, 216.601999999955, 102.922999999952, 19.2399999999907, -0.114000000059605, -32.6240000000689, -89.3179999999702, -153.497999999905, -64.2970000000205, -143.695999999996, -259.497999999905, -253.017999999924, -213.948000000091, -397.590000000084, -434.006000000052, -403.475000000093, -297.958000000101, -404.317000000039, -528.898999999976, -506.621000000043, -513.205000000075, -479.351000000024, -596.139999999898, -527.016999999993, -664.681000000099, -680.306000000099, -704.050000000047, -850.486000000034, -757.43200000003, -713.308999999892}; // Meta array storing interpolation points for biases for Precision=4..18 -__device__ static cuda::std::array constexpr bias_data{bias_data_p4.data(), - bias_data_p5.data(), - bias_data_p6.data(), - bias_data_p7.data(), - bias_data_p8.data(), - bias_data_p9.data(), - 
bias_data_p10.data(), - bias_data_p11.data(), - bias_data_p12.data(), - bias_data_p13.data(), - bias_data_p14.data(), - bias_data_p15.data(), - bias_data_p16.data(), - bias_data_p17.data(), - bias_data_p18.data()}; +CUCO_HLL_TUNING_ARR_DECL bias_data{bias_data_p4.data(), bias_data_p5.data(), bias_data_p6.data(), bias_data_p7.data(), bias_data_p8.data(), bias_data_p9.data(), bias_data_p10.data(), bias_data_p11.data(), bias_data_p12.data(), bias_data_p13.data(), bias_data_p14.data(), bias_data_p15.data(), bias_data_p16.data(), bias_data_p17.data(), bias_data_p18.data()}; +// clang-format on } // namespace cuco::hyperloglog_ns::detail \ No newline at end of file From 799284e6940024295a4d04a37dae0889919d9afe Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 1 Feb 2024 00:49:41 +0000 Subject: [PATCH 13/78] Implement HLL++ bias correction step --- include/cuco/detail/hyperloglog/finalizer.cuh | 77 ++++++++++++- include/cuco/detail/hyperloglog/tuning.cuh | 102 ++++++++++++++++-- 2 files changed, 168 insertions(+), 11 deletions(-) diff --git a/include/cuco/detail/hyperloglog/finalizer.cuh b/include/cuco/detail/hyperloglog/finalizer.cuh index 3aca44fdf..cd69ffa0a 100644 --- a/include/cuco/detail/hyperloglog/finalizer.cuh +++ b/include/cuco/detail/hyperloglog/finalizer.cuh @@ -17,7 +17,9 @@ #include +#include #include +#include namespace cuco::hyperloglog_ns::detail { @@ -31,6 +33,9 @@ namespace cuco::hyperloglog_ns::detail { */ template class finalizer { + // Note: Most of the types in this implementation are explicit instead of relying on `auto` to + // avoid confusion with the reference implementation. 
+ // this minimum number of registers is required by HLL++ static_assert(Precision >= 4, "Precision must be greater or equal to 4"); @@ -43,11 +48,9 @@ class finalizer { * * @return Bias-corrected cardinality estimate */ - __host__ __device__ static double constexpr finalize(double z, int v) noexcept + __host__ __device__ static std::size_t constexpr finalize(double z, int v) noexcept { auto e = alpha_mm() / z; - // TODO remove test code - // printf("raw e: %lf\n", e); if (v > 0) { // Use linear counting for small cardinality estimates. @@ -68,6 +71,7 @@ class finalizer { private: static auto constexpr m = (1 << Precision); ///< Number of registers + static auto constexpr k = 6; ///< Number of interpolation points to consider __host__ __device__ static double constexpr alpha_mm() noexcept { @@ -90,7 +94,70 @@ class finalizer { return e; } - // TODO implement HLL++ bias correction - __host__ __device__ static double constexpr bias(double e) noexcept { return e * 0; } + __host__ __device__ static double constexpr bias(double e) noexcept + { + auto const anchor_index = interpolation_anchor_index(e); + int const n = raw_estimate_data().size(); + + auto low = cuda::std::max(anchor_index - k + 1, 0); + auto high = cuda::std::min(low + k, n); + // Keep moving bounds as long as the (exclusive) high bound is closer to the estimate than + // the lower (inclusive) bound. 
+ while (high < n and distance(e, high) < distance(e, low)) { + low += 1; + high += 1; + } + + auto const& biases = bias_data(); + double bias_sum = 0.0; + for (int i = low; i < high; ++i) { + bias_sum += biases[i]; + } + + return bias_sum / (high - low); + } + + __host__ __device__ static double distance(double e, int i) noexcept + { + auto const diff = e - raw_estimate_data()[i]; + return diff * diff; + } + + __host__ __device__ static int interpolation_anchor_index(double e) noexcept + { + auto const& estimates = raw_estimate_data(); + int left = 0; + int right = static_cast(estimates.size()) - 1; + int mid; + int candidate_index = 0; // Index of the closest element found + + while (left <= right) { + mid = left + (right - left) / 2; + + if (estimates[mid] < e) { + left = mid + 1; + } else if (estimates[mid] > e) { + right = mid - 1; + } else { + // Exact match found, no need to look further + return mid; + } + } + + // At this point, 'left' is the insertion point. We need to compare the elements at 'left' and + // 'left - 1' to find the closest one, taking care of boundary conditions. + + // Distance from 'e' to the element at 'left', if within bounds + double const dist_lhs = left < static_cast(estimates.size()) + ? cuda::std::abs(estimates[left] - e) + : cuda::std::numeric_limits::max(); + // Distance from 'e' to the element at 'left - 1', if within bounds + double const dist_rhs = left - 1 >= 0 ? cuda::std::abs(estimates[left - 1] - e) + : cuda::std::numeric_limits::max(); + + candidate_index = (dist_lhs < dist_rhs) ? 
left : left - 1; + + return candidate_index; + } }; } // namespace cuco::hyperloglog_ns::detail \ No newline at end of file diff --git a/include/cuco/detail/hyperloglog/tuning.cuh b/include/cuco/detail/hyperloglog/tuning.cuh index 4d4a69067..c10ef6950 100644 --- a/include/cuco/detail/hyperloglog/tuning.cuh +++ b/include/cuco/detail/hyperloglog/tuning.cuh @@ -20,10 +20,9 @@ namespace cuco::hyperloglog_ns::detail { // TODO this will spawn one copy of each array in every TU :( -// TODO use float instead of double? // TODO use __constant__? #ifndef CUCO_HLL_TUNING_ARR_DECL -#define CUCO_HLL_TUNING_ARR_DECL __device__ static cuda::std::array constexpr +#define CUCO_HLL_TUNING_ARR_DECL __device__ static constexpr cuda::std::array #endif // clang-format off @@ -48,8 +47,54 @@ CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p16{47271.0, 48062.3584, 48862.7074, CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p17{94542.0, 96125.811, 97728.019, 99348.558, 100987.9705, 102646.7565, 104324.5125, 106021.7435, 107736.7865, 109469.272, 111223.9465, 112995.219, 114787.432, 116593.152, 118422.71, 120267.2345, 122134.6765, 124020.937, 125927.2705, 127851.255, 129788.9485, 131751.016, 133726.8225, 135722.592, 137736.789, 139770.568, 141821.518, 143891.343, 145982.1415, 148095.387, 150207.526, 152355.649, 154515.6415, 156696.05, 158887.7575, 161098.159, 163329.852, 165569.053, 167837.4005, 170121.6165, 172420.4595, 174732.6265, 177062.77, 179412.502, 181774.035, 184151.939, 186551.6895, 188965.691, 191402.8095, 193857.949, 196305.0775, 198774.6715, 201271.2585, 203764.78, 206299.3695, 208818.1365, 211373.115, 213946.7465, 216532.076, 219105.541, 221714.5375, 224337.5135, 226977.5125, 229613.0655, 232270.2685, 234952.2065, 237645.3555, 240331.1925, 243034.517, 245756.0725, 248517.6865, 251232.737, 254011.3955, 256785.995, 259556.44, 262368.335, 265156.911, 267965.266, 270785.583, 273616.0495, 276487.4835, 279346.639, 282202.509, 285074.3885, 287942.2855, 290856.018, 293774.0345, 296678.5145, 
299603.6355, 302552.6575, 305492.9785, 308466.8605, 311392.581, 314347.538, 317319.4295, 320285.9785, 323301.7325, 326298.3235, 329301.3105, 332301.987, 335309.791, 338370.762, 341382.923, 344431.1265, 347464.1545, 350507.28, 353619.2345, 356631.2005, 359685.203, 362776.7845, 365886.488, 368958.2255, 372060.6825, 375165.4335, 378237.935, 381328.311, 384430.5225, 387576.425, 390683.242, 393839.648, 396977.8425, 400101.9805, 403271.296, 406409.8425, 409529.5485, 412678.7, 415847.423, 419020.8035, 422157.081, 425337.749, 428479.6165, 431700.902, 434893.1915, 438049.582, 441210.5415, 444379.2545, 447577.356, 450741.931, 453959.548, 457137.0935, 460329.846, 463537.4815, 466732.3345, 469960.5615, 473164.681, 476347.6345, 479496.173, 482813.1645, 486025.6995, 489249.4885, 492460.1945, 495675.8805, 498908.0075, 502131.802, 505374.3855, 508550.9915, 511806.7305, 515026.776, 518217.0005, 521523.9855, 524705.9855, 527950.997, 531210.0265, 534472.497, 537750.7315, 540926.922, 544207.094, 547429.4345, 550666.3745, 553975.3475, 557150.7185, 560399.6165, 563662.697, 566916.7395, 570146.1215, 573447.425, 576689.6245, 579874.5745, 583202.337, 586503.0255, 589715.635, 592910.161, 596214.3885, 599488.035, 602740.92, 605983.0685, 609248.67, 612491.3605, 615787.912, 619107.5245, 622307.9555, 625577.333, 628840.4385, 632085.2155, 635317.6135, 638691.7195, 641887.467, 645139.9405, 648441.546, 651666.252, 654941.845}; CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p18{189084.0, 192250.913, 195456.774, 198696.946, 201977.762, 205294.444, 208651.754, 212042.099, 215472.269, 218941.91, 222443.912, 225996.845, 229568.199, 233193.568, 236844.457, 240543.233, 244279.475, 248044.27, 251854.588, 255693.2, 259583.619, 263494.621, 267445.385, 271454.061, 275468.769, 279549.456, 283646.446, 287788.198, 291966.099, 296181.164, 300431.469, 304718.618, 309024.004, 313393.508, 317760.803, 322209.731, 326675.061, 331160.627, 335654.47, 340241.442, 344841.833, 349467.132, 354130.629, 358819.432, 363574.626, 
368296.587, 373118.482, 377914.93, 382782.301, 387680.669, 392601.981, 397544.323, 402529.115, 407546.018, 412593.658, 417638.657, 422762.865, 427886.169, 433017.167, 438213.273, 443441.254, 448692.421, 453937.533, 459239.049, 464529.569, 469910.083, 475274.03, 480684.473, 486070.26, 491515.237, 496995.651, 502476.617, 507973.609, 513497.19, 519083.233, 524726.509, 530305.505, 535945.728, 541584.404, 547274.055, 552967.236, 558667.862, 564360.216, 570128.148, 575965.08, 581701.952, 587532.523, 593361.144, 599246.128, 605033.418, 610958.779, 616837.117, 622772.818, 628672.04, 634675.369, 640574.831, 646585.739, 652574.547, 658611.217, 664642.684, 670713.914, 676737.681, 682797.313, 688837.897, 694917.874, 701009.882, 707173.648, 713257.254, 719415.392, 725636.761, 731710.697, 737906.209, 744103.074, 750313.39, 756504.185, 762712.579, 768876.985, 775167.859, 781359.0, 787615.959, 793863.597, 800245.477, 806464.582, 812785.294, 819005.925, 825403.057, 831676.197, 837936.284, 844266.968, 850642.711, 856959.756, 863322.774, 869699.931, 876102.478, 882355.787, 888694.463, 895159.952, 901536.143, 907872.631, 914293.672, 920615.14, 927130.974, 933409.404, 939922.178, 946331.47, 952745.93, 959209.264, 965590.224, 972077.284, 978501.961, 984953.19, 991413.271, 997817.479, 1004222.658, 1010725.676, 1017177.138, 1023612.529, 1030098.236, 1036493.719, 1043112.207, 1049537.036, 1056008.096, 1062476.184, 1068942.337, 1075524.95, 1081932.864, 1088426.025, 1094776.005, 1101327.448, 1107901.673, 1114423.639, 1120884.602, 1127324.923, 1133794.24, 1140328.886, 1146849.376, 1153346.682, 1159836.502, 1166478.703, 1172953.304, 1179391.502, 1185950.982, 1192544.052, 1198913.41, 1205430.994, 1212015.525, 1218674.042, 1225121.683, 1231551.101, 1238126.379, 1244673.795, 1251260.649, 1257697.86, 1264320.983, 1270736.319, 1277274.694, 1283804.95, 1290211.514, 1296858.568, 1303455.691}; -// Meta array storing interpolation points for estimates for Precision=4..18 -CUCO_HLL_TUNING_ARR_DECL 
raw_estimate_data{raw_estimate_data_p4.data(), raw_estimate_data_p5.data(), raw_estimate_data_p6.data(), raw_estimate_data_p7.data(), raw_estimate_data_p8.data(), raw_estimate_data_p9.data(), raw_estimate_data_p10.data(), raw_estimate_data_p11.data(), raw_estimate_data_p12.data(), raw_estimate_data_p13.data(), raw_estimate_data_p14.data(), raw_estimate_data_p15.data(), raw_estimate_data_p16.data(), raw_estimate_data_p17.data(), raw_estimate_data_p18.data()}; +// helpers for selecting the corresponding arrays for a given precision +template +__host__ __device__ auto const& raw_estimate_data() noexcept; + +template <> +__host__ __device__ auto const& raw_estimate_data<4>() noexcept { return raw_estimate_data_p4; }; + +template <> +__host__ __device__ auto const& raw_estimate_data<5>() noexcept { return raw_estimate_data_p5; }; + +template <> +__host__ __device__ auto const& raw_estimate_data<6>() noexcept { return raw_estimate_data_p6; }; + +template <> +__host__ __device__ auto const& raw_estimate_data<7>() noexcept { return raw_estimate_data_p7; }; + +template <> +__host__ __device__ auto const& raw_estimate_data<8>() noexcept { return raw_estimate_data_p8; }; + +template <> +__host__ __device__ auto const& raw_estimate_data<9>() noexcept { return raw_estimate_data_p9; }; + +template <> +__host__ __device__ auto const& raw_estimate_data<10>() noexcept { return raw_estimate_data_p10; }; + +template <> +__host__ __device__ auto const& raw_estimate_data<11>() noexcept { return raw_estimate_data_p11; }; + +template <> +__host__ __device__ auto const& raw_estimate_data<12>() noexcept { return raw_estimate_data_p12; }; + +template <> +__host__ __device__ auto const& raw_estimate_data<13>() noexcept { return raw_estimate_data_p13; }; + +template <> +__host__ __device__ auto const& raw_estimate_data<14>() noexcept { return raw_estimate_data_p14; }; + +template <> +__host__ __device__ auto const& raw_estimate_data<15>() noexcept { return raw_estimate_data_p15; }; + 
+template <> +__host__ __device__ auto const& raw_estimate_data<16>() noexcept { return raw_estimate_data_p16; }; + +template <> +__host__ __device__ auto const& raw_estimate_data<17>() noexcept { return raw_estimate_data_p17; }; + +template <> +__host__ __device__ auto const& raw_estimate_data<18>() noexcept { return raw_estimate_data_p18; }; CUCO_HLL_TUNING_ARR_DECL bias_data_p4{10.0, 9.717, 9.207, 8.7896, 8.2882, 7.8204, 7.3772, 6.9342, 6.5202, 6.161, 5.7722, 5.4636, 5.0396, 4.6766, 4.3566, 4.0454, 3.7936, 3.4856, 3.2666, 2.9946, 2.766, 2.4692, 2.3638, 2.0764, 1.7864, 1.7602, 1.4814, 1.433, 1.2926, 1.0664, 0.999600000000001, 0.7956, 0.5366, 0.589399999999998, 0.573799999999999, 0.269799999999996, 0.368200000000002, 0.0544000000000011, 0.234200000000001, 0.0108000000000033, -0.203400000000002, -0.0701999999999998, -0.129600000000003, -0.364199999999997, -0.480600000000003, -0.226999999999997, -0.322800000000001, -0.382599999999996, -0.511200000000002, -0.669600000000003, -0.749400000000001, -0.500399999999999, -0.617600000000003, -0.6922, -0.601599999999998, -0.416200000000003, -0.338200000000001, -0.782600000000002, -0.648600000000002, -0.919800000000002, -0.851799999999997, -0.962400000000002, -0.6402, -1.1922, -1.0256, -1.086, -1.21899999999999, -0.819400000000002, -0.940600000000003, -1.1554, -1.2072, -1.1752, -1.16759999999999, -1.14019999999999, -1.3754, -1.29859999999999, -1.607, -1.3292, -1.7606}; CUCO_HLL_TUNING_ARR_DECL bias_data_p5{22.0, 21.1194, 20.8208, 20.2318, 19.77, 19.2436, 18.7774, 18.2848, 17.8224, 17.3742, 16.9336, 16.503, 16.0494, 15.6292, 15.2124, 14.798, 14.367, 13.9728, 13.5944, 13.217, 12.8438, 12.3696, 12.0956, 11.7044, 11.324, 11.0668, 10.6698, 10.3644, 10.049, 9.6918, 9.4146, 9.082, 8.687, 8.5398, 8.2462, 7.857, 7.6606, 7.4168, 7.1248, 6.9222, 6.6804, 6.447, 6.3454, 5.9594, 5.7636, 5.5776, 5.331, 5.19, 4.9676, 4.7564, 4.5314, 4.4442, 4.3708, 3.9774, 3.9624, 3.8796, 3.755, 3.472, 3.2076, 3.1024, 2.8908, 2.7338, 2.7728, 2.629, 2.413, 
2.3266, 2.1524, 2.2642, 2.1806, 2.0566, 1.9192, 1.7598, 1.3516, 1.5802, 1.43859999999999, 1.49160000000001, 1.1524, 1.1892, 0.841399999999993, 0.879800000000003, 0.837599999999995, 0.469800000000006, 0.765600000000006, 0.331000000000003, 0.591399999999993, 0.601200000000006, 0.701599999999999, 0.558199999999999, 0.339399999999998, 0.354399999999998, 0.491200000000006, 0.308000000000007, 0.355199999999996, -0.0254000000000048, 0.205200000000005, -0.272999999999996, 0.132199999999997, 0.394400000000005, -0.241200000000006, 0.242000000000004, 0.191400000000002, 0.253799999999998, -0.122399999999999, -0.370800000000003, 0.193200000000004, -0.0848000000000013, 0.0867999999999967, -0.327200000000005, -0.285600000000002, 0.311400000000006, -0.128399999999999, -0.754999999999995, -0.209199999999996, -0.293599999999998, -0.364000000000004, -0.253600000000006, -0.821200000000005, -0.253600000000006, -0.510400000000004, -0.383399999999995, -0.491799999999998, -0.220200000000006, -0.0972000000000008, -0.557400000000001, -0.114599999999996, -0.295000000000002, -0.534800000000004, 0.346399999999988, -0.65379999999999, 0.0398000000000138, 0.0341999999999985, -0.995800000000003, -0.523400000000009, -0.489000000000004, -0.274799999999999, -0.574999999999989, -0.482799999999997, 0.0571999999999946, -0.330600000000004, -0.628800000000012, -0.140199999999993, -0.540600000000012, -0.445999999999998, -0.599400000000003, -0.262599999999992, 0.163399999999996, -0.100599999999986, -0.39500000000001, -1.06960000000001, -0.836399999999998, -0.753199999999993, -0.412399999999991, -0.790400000000005, -0.29679999999999, -0.28540000000001, -0.193000000000012, -0.0772000000000048, -0.962799999999987, -0.414800000000014}; @@ -67,8 +112,53 @@ CUCO_HLL_TUNING_ARR_DECL bias_data_p16{47270.0, 46423.3584, 45585.7074, 44757.15 CUCO_HLL_TUNING_ARR_DECL bias_data_p17{94541.0, 92848.811, 91174.019, 89517.558, 87879.9705, 86262.7565, 84663.5125, 83083.7435, 81521.7865, 79977.272, 78455.9465, 76950.219, 
75465.432, 73994.152, 72546.71, 71115.2345, 69705.6765, 68314.937, 66944.2705, 65591.255, 64252.9485, 62938.016, 61636.8225, 60355.592, 59092.789, 57850.568, 56624.518, 55417.343, 54231.1415, 53067.387, 51903.526, 50774.649, 49657.6415, 48561.05, 47475.7575, 46410.159, 45364.852, 44327.053, 43318.4005, 42325.6165, 41348.4595, 40383.6265, 39436.77, 38509.502, 37594.035, 36695.939, 35818.6895, 34955.691, 34115.8095, 33293.949, 32465.0775, 31657.6715, 30877.2585, 30093.78, 29351.3695, 28594.1365, 27872.115, 27168.7465, 26477.076, 25774.541, 25106.5375, 24452.5135, 23815.5125, 23174.0655, 22555.2685, 21960.2065, 21376.3555, 20785.1925, 20211.517, 19657.0725, 19141.6865, 18579.737, 18081.3955, 17578.995, 17073.44, 16608.335, 16119.911, 15651.266, 15194.583, 14749.0495, 14343.4835, 13925.639, 13504.509, 13099.3885, 12691.2855, 12328.018, 11969.0345, 11596.5145, 11245.6355, 10917.6575, 10580.9785, 10277.8605, 9926.58100000001, 9605.538, 9300.42950000003, 8989.97850000003, 8728.73249999998, 8448.3235, 8175.31050000002, 7898.98700000002, 7629.79100000003, 7413.76199999999, 7149.92300000001, 6921.12650000001, 6677.1545, 6443.28000000003, 6278.23450000002, 6014.20049999998, 5791.20299999998, 5605.78450000001, 5438.48800000001, 5234.2255, 5059.6825, 4887.43349999998, 4682.935, 4496.31099999999, 4322.52250000002, 4191.42499999999, 4021.24200000003, 3900.64799999999, 3762.84250000003, 3609.98050000001, 3502.29599999997, 3363.84250000003, 3206.54849999998, 3079.70000000001, 2971.42300000001, 2867.80349999998, 2727.08100000001, 2630.74900000001, 2496.6165, 2440.902, 2356.19150000002, 2235.58199999999, 2120.54149999999, 2012.25449999998, 1933.35600000003, 1820.93099999998, 1761.54800000001, 1663.09350000002, 1578.84600000002, 1509.48149999999, 1427.3345, 1379.56150000001, 1306.68099999998, 1212.63449999999, 1084.17300000001, 1124.16450000001, 1060.69949999999, 1007.48849999998, 941.194499999983, 879.880500000028, 836.007500000007, 782.802000000025, 748.385499999975, 
647.991500000004, 626.730500000005, 570.776000000013, 484.000500000024, 513.98550000001, 418.985499999952, 386.996999999974, 370.026500000036, 355.496999999974, 356.731499999994, 255.92200000002, 259.094000000041, 205.434499999974, 165.374500000034, 197.347500000033, 95.718499999959, 67.6165000000037, 54.6970000000438, 31.7395000000251, -15.8784999999916, 8.42500000004657, -26.3754999999655, -118.425500000012, -66.6629999999423, -42.9745000000112, -107.364999999991, -189.839000000036, -162.611499999999, -164.964999999967, -189.079999999958, -223.931499999948, -235.329999999958, -269.639500000048, -249.087999999989, -206.475499999942, -283.04449999996, -290.667000000016, -304.561499999953, -336.784499999951, -380.386500000022, -283.280499999993, -364.533000000054, -389.059499999974, -364.454000000027, -415.748000000021, -417.155000000028}; CUCO_HLL_TUNING_ARR_DECL bias_data_p18{189083.0, 185696.913, 182348.774, 179035.946, 175762.762, 172526.444, 169329.754, 166166.099, 163043.269, 159958.91, 156907.912, 153906.845, 150924.199, 147996.568, 145093.457, 142239.233, 139421.475, 136632.27, 133889.588, 131174.2, 128511.619, 125868.621, 123265.385, 120721.061, 118181.769, 115709.456, 113252.446, 110840.198, 108465.099, 106126.164, 103823.469, 101556.618, 99308.004, 97124.508, 94937.803, 92833.731, 90745.061, 88677.627, 86617.47, 84650.442, 82697.833, 80769.132, 78879.629, 77014.432, 75215.626, 73384.587, 71652.482, 69895.93, 68209.301, 66553.669, 64921.981, 63310.323, 61742.115, 60205.018, 58698.658, 57190.657, 55760.865, 54331.169, 52908.167, 51550.273, 50225.254, 48922.421, 47614.533, 46362.049, 45098.569, 43926.083, 42736.03, 41593.473, 40425.26, 39316.237, 38243.651, 37170.617, 36114.609, 35084.19, 34117.233, 33206.509, 32231.505, 31318.728, 30403.404, 29540.0550000001, 28679.236, 27825.862, 26965.216, 26179.148, 25462.08, 24645.952, 23922.523, 23198.144, 22529.128, 21762.4179999999, 21134.779, 20459.117, 19840.818, 19187.04, 18636.3689999999, 17982.831, 
17439.7389999999, 16874.547, 16358.2169999999, 15835.684, 15352.914, 14823.681, 14329.313, 13816.897, 13342.874, 12880.882, 12491.648, 12021.254, 11625.392, 11293.7610000001, 10813.697, 10456.209, 10099.074, 9755.39000000001, 9393.18500000006, 9047.57900000003, 8657.98499999999, 8395.85900000005, 8033.0, 7736.95900000003, 7430.59699999995, 7258.47699999996, 6924.58200000005, 6691.29399999999, 6357.92500000005, 6202.05700000003, 5921.19700000004, 5628.28399999999, 5404.96799999999, 5226.71100000001, 4990.75600000005, 4799.77399999998, 4622.93099999998, 4472.478, 4171.78700000001, 3957.46299999999, 3868.95200000005, 3691.14300000004, 3474.63100000005, 3341.67200000002, 3109.14000000001, 3071.97400000005, 2796.40399999998, 2756.17799999996, 2611.46999999997, 2471.93000000005, 2382.26399999997, 2209.22400000005, 2142.28399999999, 2013.96100000001, 1911.18999999994, 1818.27099999995, 1668.47900000005, 1519.65800000005, 1469.67599999998, 1367.13800000004, 1248.52899999998, 1181.23600000003, 1022.71900000004, 1088.20700000005, 959.03600000008, 876.095999999903, 791.183999999892, 703.337000000058, 731.949999999953, 586.86400000006, 526.024999999907, 323.004999999888, 320.448000000091, 340.672999999952, 309.638999999966, 216.601999999955, 102.922999999952, 19.2399999999907, -0.114000000059605, -32.6240000000689, -89.3179999999702, -153.497999999905, -64.2970000000205, -143.695999999996, -259.497999999905, -253.017999999924, -213.948000000091, -397.590000000084, -434.006000000052, -403.475000000093, -297.958000000101, -404.317000000039, -528.898999999976, -506.621000000043, -513.205000000075, -479.351000000024, -596.139999999898, -527.016999999993, -664.681000000099, -680.306000000099, -704.050000000047, -850.486000000034, -757.43200000003, -713.308999999892}; -// Meta array storing interpolation points for biases for Precision=4..18 -CUCO_HLL_TUNING_ARR_DECL bias_data{bias_data_p4.data(), bias_data_p5.data(), bias_data_p6.data(), bias_data_p7.data(), bias_data_p8.data(), 
bias_data_p9.data(), bias_data_p10.data(), bias_data_p11.data(), bias_data_p12.data(), bias_data_p13.data(), bias_data_p14.data(), bias_data_p15.data(), bias_data_p16.data(), bias_data_p17.data(), bias_data_p18.data()}; +template +__host__ __device__ auto const& bias_data() noexcept; + +template <> +__host__ __device__ auto const& bias_data<4>() noexcept { return bias_data_p4; }; + +template <> +__host__ __device__ auto const& bias_data<5>() noexcept { return bias_data_p5; }; + +template <> +__host__ __device__ auto const& bias_data<6>() noexcept { return bias_data_p6; }; + +template <> +__host__ __device__ auto const& bias_data<7>() noexcept { return bias_data_p7; }; + +template <> +__host__ __device__ auto const& bias_data<8>() noexcept { return bias_data_p8; }; + +template <> +__host__ __device__ auto const& bias_data<9>() noexcept { return bias_data_p9; }; + +template <> +__host__ __device__ auto const& bias_data<10>() noexcept { return bias_data_p10; }; + +template <> +__host__ __device__ auto const& bias_data<11>() noexcept { return bias_data_p11; }; + +template <> +__host__ __device__ auto const& bias_data<12>() noexcept { return bias_data_p12; }; + +template <> +__host__ __device__ auto const& bias_data<13>() noexcept { return bias_data_p13; }; + +template <> +__host__ __device__ auto const& bias_data<14>() noexcept { return bias_data_p14; }; + +template <> +__host__ __device__ auto const& bias_data<15>() noexcept { return bias_data_p15; }; + +template <> +__host__ __device__ auto const& bias_data<16>() noexcept { return bias_data_p16; }; + +template <> +__host__ __device__ auto const& bias_data<17>() noexcept { return bias_data_p17; }; + +template <> +__host__ __device__ auto const& bias_data<18>() noexcept { return bias_data_p18; }; // clang-format on } // namespace cuco::hyperloglog_ns::detail \ No newline at end of file From abbeffa0076cfa48d77b61b625fe58ee19cabcd5 Mon Sep 17 00:00:00 2001 From: Daniel Juenger 
<2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 1 Feb 2024 02:00:30 +0000 Subject: [PATCH 14/78] Extend examples and fix some bugs along the way --- examples/CMakeLists.txt | 1 + .../device_ref_example.cu | 116 ++++++++++++++++++ .../host_bulk_example.cu | 45 ++++--- .../distinct_count_estimator.inl | 2 +- .../distinct_count_estimator_ref.inl | 4 +- .../cuco/detail/hyperloglog/hyperloglog.cuh | 14 +++ include/cuco/distinct_count_estimator_ref.cuh | 3 + 7 files changed, 158 insertions(+), 27 deletions(-) create mode 100644 examples/distinct_count_estimator/device_ref_example.cu diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index f6e753cf2..9ee062690 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -42,3 +42,4 @@ ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/sta ConfigureExample(STATIC_MAP_COUNT_BY_KEY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/count_by_key_example.cu") ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/host_bulk_example.cu") ConfigureExample(DISTINCT_COUNT_ESTIMATOR_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/distinct_count_estimator/host_bulk_example.cu") +ConfigureExample(DISTINCT_COUNT_ESTIMATOR_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/distinct_count_estimator/device_ref_example.cu") diff --git a/examples/distinct_count_estimator/device_ref_example.cu b/examples/distinct_count_estimator/device_ref_example.cu new file mode 100644 index 000000000..82e34b5c9 --- /dev/null +++ b/examples/distinct_count_estimator/device_ref_example.cu @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include + +#include +#include + +/** + * @file device_reference_example.cu + * @brief Demonstrates usage of `cuco::distinct_count_estimator` device-side APIs. + * + * This example demonstrates how the non-owning reference type `cuco::distinct_count_estimator_ref` + * can be used to implement a custom kernel that fuses the cardinality estimation step with any + * other workload that traverses the input data. + */ + +template +__global__ void piggyback_kernel(RefType ref, InputIt first, std::size_t n) +{ + // Transform the reference type (with device scope) to a reference type with block scope + using local_ref_type = typename RefType::with_scope; + + // Shared memory storage for the block-local estimator + __shared__ typename local_ref_type::storage_type local_storage; + + auto const loop_stride = gridDim.x * blockDim.x; + auto idx = blockDim.x * blockIdx.x + threadIdx.x; + auto const block = cooperative_groups::this_thread_block(); + + // Create the local estimator with the shared memory storage + local_ref_type local_ref(local_storage, {}); + + // Initialize the local estimator + local_ref.clear(block); + block.sync(); + + while (idx < n) { + auto const& item = *(first + idx); + + // Add each item to the local estimator + local_ref.add(item); + + /* + Here we can add some custom workload that takes the input `item`. + + The idea is that cardinality estimation can be fused/piggy-backed with any other workload that + traverses the data. 
Since `local_ref.add` can run close to the SOL of the DRAM bandwidth, we get + the estimate "for free" while performing other computations over the data. + */ + + idx += loop_stride; + } + block.sync(); + + // We can also compute the local estimate on the device + auto const local_estimate = local_ref.estimate(block); + if (block.thread_rank() == 0) { + // The local estimate should approximately be `num_items`/`gridDim.x` + printf("Estimate for block %d = %llu\n", blockIdx.x, local_estimate); + } + + // In the end, we merge the shared memory estimator into the global estimator which gives us the + // final result + ref.merge(block, local_ref); +} + +int main(void) +{ + using T = int; + constexpr std::size_t num_items = 1ull << 28; // 1GB + + thrust::device_vector items(num_items); + + // Generate `num_items` distinct items + thrust::sequence(items.begin(), items.end(), 0); + + // Initialize the estimator + cuco::distinct_count_estimator estimator; + + // Add all items to the estimator + estimator.add(items.begin(), items.end()); + + // Calculate the cardinality estimate from the bulk operation + std::size_t const estimated_cardinality_bulk = estimator.estimate(); + + // Clear the estimator so it can be reused + estimator.clear(); + + // Call the custom kernel and pass a non-owning reference to the estimator to the GPU + piggyback_kernel<<<10, 512>>>(estimator.ref(), items.begin(), num_items); + + // Calculate the cardinality estimate from the custom kernel + std::size_t const estimated_cardinality_custom = estimator.estimate(); + + if (estimated_cardinality_bulk == estimated_cardinality_custom) { + std::cout << "Success! 
Cardinality estimates are identical" << std::endl; + } + + return 0; +} \ No newline at end of file diff --git a/examples/distinct_count_estimator/host_bulk_example.cu b/examples/distinct_count_estimator/host_bulk_example.cu index 18085e72f..9e60ae47b 100644 --- a/examples/distinct_count_estimator/host_bulk_example.cu +++ b/examples/distinct_count_estimator/host_bulk_example.cu @@ -21,41 +21,38 @@ #include #include -int main() +/** + * @file host_bulk_example.cu + * @brief Demonstrates usage of `cuco::distinct_count_estimator` "bulk" host APIs. + */ + +int main(void) { using T = int; - std::size_t constexpr num_items = 1ull << 30; // 4GB + constexpr std::size_t num_items = 1ull << 28; // 1GB thrust::device_vector items(num_items); - // create a vector of distinct items - thrust::sequence(items.begin(), items.end(), 0); - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); + // Generate `num_items` distinct items + thrust::sequence(items.begin(), items.end(), 0); + // Initialize the estimator cuco::distinct_count_estimator estimator; - cudaEventRecord(start); - // add all items to the estimator + + // Add all items to the estimator estimator.add(items.begin(), items.end()); - // after the estimator has seen all items, we can calculate the cardinality - std::size_t const estimated_cardinality = estimator.estimate(); - cudaEventRecord(stop); - cudaEventSynchronize(stop); - float milliseconds = 0; - cudaEventElapsedTime(&milliseconds, start, stop); - float input_size_gb = num_items * sizeof(T) / 1073741824.0f; - float throughput = input_size_gb / (milliseconds / 1000.0f); + // Adding the same items again will not affect the result + estimator.add(items.begin(), items.begin() + num_items / 2); + + // Calculate the cardinality estimate + std::size_t const estimated_cardinality = estimator.estimate(); - std::cout << "True cardinality:\t" << num_items << "\nEstimated cardinality:\t" - << estimated_cardinality << "\nRelative error:\t" + std::cout 
<< "True cardinality: " << num_items + << "\nEstimated cardinality: " << estimated_cardinality << "\nRelative error: " << abs(static_cast(num_items) - static_cast(estimated_cardinality)) / num_items - << "\nData size:\t" << input_size_gb << "GB" - << "\nElapsed time:\t" << milliseconds << "ms" - << "\nMemory throughput\t" << throughput << "GB/s" << std::endl; + << std::endl; - cudaEventDestroy(start); - cudaEventDestroy(stop); + return 0; } \ No newline at end of file diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl index 413d7ee7b..79488e0e1 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl @@ -98,6 +98,6 @@ template ::ref_type<> distinct_count_estimator::ref() const noexcept { - return this->impl_->ref(); + return ref_type<>{this->impl_->storage_ref(), this->impl_->hash()}; } } // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl index 26fc9bd99..3b940edfd 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl @@ -43,13 +43,13 @@ __device__ void distinct_count_estimator_ref::merge( CG const& group, distinct_count_estimator_ref const& other) noexcept { - this->impl_.merge(group, other); + this->impl_.merge(group, other.impl_); } template __device__ std::size_t distinct_count_estimator_ref::estimate( cooperative_groups::thread_block const& group) const noexcept { - this->impl_.estimate(group); + return this->impl_.estimate(group); } } // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh 
b/include/cuco/detail/hyperloglog/hyperloglog.cuh index af303a921..a08cfa942 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -299,6 +299,20 @@ class hyperloglog { return ref_type<>{*(this->storage_.get()), this->hash_}; } + /** + * @brief Get storage ref. + * + * @return Reference to storage + */ + [[nodiscard]] storage_type& storage_ref() const noexcept { return *(this->storage_.get()); } + + /** + * @brief Get hash function. + * + * @return The hash function + */ + [[nodiscard]] auto hash() const noexcept { return this->hash_; } + private: struct storage_deleter { using pointer = typename storage_allocator_type::value_type*; diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh index 256183082..d656d6e17 100644 --- a/include/cuco/distinct_count_estimator_ref.cuh +++ b/include/cuco/distinct_count_estimator_ref.cuh @@ -105,6 +105,9 @@ class distinct_count_estimator_ref { private: impl_type impl_; ///< Implementation object + + template + friend class distinct_count_estimator_ref; }; } // namespace cuco From 86d461801a5aad230e6143b2cca9cdbc89bf0bb8 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 1 Feb 2024 17:01:02 +0000 Subject: [PATCH 15/78] Refactor thresholds --- include/cuco/detail/hyperloglog/finalizer.cuh | 2 +- include/cuco/detail/hyperloglog/tuning.cuh | 52 +++++++++++++++++-- 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/include/cuco/detail/hyperloglog/finalizer.cuh b/include/cuco/detail/hyperloglog/finalizer.cuh index cd69ffa0a..18af4ca1b 100644 --- a/include/cuco/detail/hyperloglog/finalizer.cuh +++ b/include/cuco/detail/hyperloglog/finalizer.cuh @@ -57,7 +57,7 @@ class finalizer { double const h = m * log(static_cast(m) / v); // HLL++ is defined only when p < 19, otherwise we need to fallback to HLL. 
// The threshold `2.5 * m` is from the original HLL algorithm. - if ((Precision < 19 and h <= thresholds[Precision - 4]) or e <= 2.5 * m) { + if ((Precision < 19 and h <= threshold()) or e <= 2.5 * m) { e = h; } else { e = bias_corrected_estimate(e); diff --git a/include/cuco/detail/hyperloglog/tuning.cuh b/include/cuco/detail/hyperloglog/tuning.cuh index c10ef6950..05cacb067 100644 --- a/include/cuco/detail/hyperloglog/tuning.cuh +++ b/include/cuco/detail/hyperloglog/tuning.cuh @@ -26,7 +26,53 @@ namespace cuco::hyperloglog_ns::detail { #endif // clang-format off -CUCO_HLL_TUNING_ARR_DECL thresholds{10.0, 20.0, 40.0, 80.0, 220.0, 400.0, 900.0, 1800.0, 3100.0, 6500.0, 15500.0, 20000.0, 50000.0, 120000.0, 350000.0}; +template +__host__ __device__ constexpr auto threshold() noexcept; + +template <> +__host__ __device__ constexpr auto threshold<4>() noexcept { return 10.0; }; + +template <> +__host__ __device__ constexpr auto threshold<5>() noexcept { return 20.0; }; + +template <> +__host__ __device__ constexpr auto threshold<6>() noexcept { return 40.0; }; + +template <> +__host__ __device__ constexpr auto threshold<7>() noexcept { return 80.0; }; + +template <> +__host__ __device__ constexpr auto threshold<8>() noexcept { return 220.0; }; + +template <> +__host__ __device__ constexpr auto threshold<9>() noexcept { return 400.0; }; + +template <> +__host__ __device__ constexpr auto threshold<10>() noexcept { return 900.0; }; + +template <> +__host__ __device__ constexpr auto threshold<11>() noexcept { return 1800.0; }; + +template <> +__host__ __device__ constexpr auto threshold<12>() noexcept { return 3100.0; }; + +template <> +__host__ __device__ constexpr auto threshold<13>() noexcept { return 6500.0; }; + +template <> +__host__ __device__ constexpr auto threshold<14>() noexcept { return 15500.0; }; + +template <> +__host__ __device__ constexpr auto threshold<15>() noexcept { return 20000.0; }; + +template <> +__host__ __device__ constexpr auto threshold<16>() 
noexcept { return 50000.0; }; + +template <> +__host__ __device__ constexpr auto threshold<17>() noexcept { return 120000.0; }; + +template <> +__host__ __device__ constexpr auto threshold<18>() noexcept { return 350000.0; }; // HLL++ uses an interpolation method over the raw estimated cardinality to select the optimal bias. // Parameters/interpolation points taken from @@ -48,7 +94,7 @@ CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p17{94542.0, 96125.811, 97728.019, 99 CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p18{189084.0, 192250.913, 195456.774, 198696.946, 201977.762, 205294.444, 208651.754, 212042.099, 215472.269, 218941.91, 222443.912, 225996.845, 229568.199, 233193.568, 236844.457, 240543.233, 244279.475, 248044.27, 251854.588, 255693.2, 259583.619, 263494.621, 267445.385, 271454.061, 275468.769, 279549.456, 283646.446, 287788.198, 291966.099, 296181.164, 300431.469, 304718.618, 309024.004, 313393.508, 317760.803, 322209.731, 326675.061, 331160.627, 335654.47, 340241.442, 344841.833, 349467.132, 354130.629, 358819.432, 363574.626, 368296.587, 373118.482, 377914.93, 382782.301, 387680.669, 392601.981, 397544.323, 402529.115, 407546.018, 412593.658, 417638.657, 422762.865, 427886.169, 433017.167, 438213.273, 443441.254, 448692.421, 453937.533, 459239.049, 464529.569, 469910.083, 475274.03, 480684.473, 486070.26, 491515.237, 496995.651, 502476.617, 507973.609, 513497.19, 519083.233, 524726.509, 530305.505, 535945.728, 541584.404, 547274.055, 552967.236, 558667.862, 564360.216, 570128.148, 575965.08, 581701.952, 587532.523, 593361.144, 599246.128, 605033.418, 610958.779, 616837.117, 622772.818, 628672.04, 634675.369, 640574.831, 646585.739, 652574.547, 658611.217, 664642.684, 670713.914, 676737.681, 682797.313, 688837.897, 694917.874, 701009.882, 707173.648, 713257.254, 719415.392, 725636.761, 731710.697, 737906.209, 744103.074, 750313.39, 756504.185, 762712.579, 768876.985, 775167.859, 781359.0, 787615.959, 793863.597, 800245.477, 806464.582, 812785.294, 
819005.925, 825403.057, 831676.197, 837936.284, 844266.968, 850642.711, 856959.756, 863322.774, 869699.931, 876102.478, 882355.787, 888694.463, 895159.952, 901536.143, 907872.631, 914293.672, 920615.14, 927130.974, 933409.404, 939922.178, 946331.47, 952745.93, 959209.264, 965590.224, 972077.284, 978501.961, 984953.19, 991413.271, 997817.479, 1004222.658, 1010725.676, 1017177.138, 1023612.529, 1030098.236, 1036493.719, 1043112.207, 1049537.036, 1056008.096, 1062476.184, 1068942.337, 1075524.95, 1081932.864, 1088426.025, 1094776.005, 1101327.448, 1107901.673, 1114423.639, 1120884.602, 1127324.923, 1133794.24, 1140328.886, 1146849.376, 1153346.682, 1159836.502, 1166478.703, 1172953.304, 1179391.502, 1185950.982, 1192544.052, 1198913.41, 1205430.994, 1212015.525, 1218674.042, 1225121.683, 1231551.101, 1238126.379, 1244673.795, 1251260.649, 1257697.86, 1264320.983, 1270736.319, 1277274.694, 1283804.95, 1290211.514, 1296858.568, 1303455.691}; // helpers for selecting the corresponding arrays for a given precision -template +template __host__ __device__ auto const& raw_estimate_data() noexcept; template <> @@ -112,7 +158,7 @@ CUCO_HLL_TUNING_ARR_DECL bias_data_p16{47270.0, 46423.3584, 45585.7074, 44757.15 CUCO_HLL_TUNING_ARR_DECL bias_data_p17{94541.0, 92848.811, 91174.019, 89517.558, 87879.9705, 86262.7565, 84663.5125, 83083.7435, 81521.7865, 79977.272, 78455.9465, 76950.219, 75465.432, 73994.152, 72546.71, 71115.2345, 69705.6765, 68314.937, 66944.2705, 65591.255, 64252.9485, 62938.016, 61636.8225, 60355.592, 59092.789, 57850.568, 56624.518, 55417.343, 54231.1415, 53067.387, 51903.526, 50774.649, 49657.6415, 48561.05, 47475.7575, 46410.159, 45364.852, 44327.053, 43318.4005, 42325.6165, 41348.4595, 40383.6265, 39436.77, 38509.502, 37594.035, 36695.939, 35818.6895, 34955.691, 34115.8095, 33293.949, 32465.0775, 31657.6715, 30877.2585, 30093.78, 29351.3695, 28594.1365, 27872.115, 27168.7465, 26477.076, 25774.541, 25106.5375, 24452.5135, 23815.5125, 23174.0655, 22555.2685, 
21960.2065, 21376.3555, 20785.1925, 20211.517, 19657.0725, 19141.6865, 18579.737, 18081.3955, 17578.995, 17073.44, 16608.335, 16119.911, 15651.266, 15194.583, 14749.0495, 14343.4835, 13925.639, 13504.509, 13099.3885, 12691.2855, 12328.018, 11969.0345, 11596.5145, 11245.6355, 10917.6575, 10580.9785, 10277.8605, 9926.58100000001, 9605.538, 9300.42950000003, 8989.97850000003, 8728.73249999998, 8448.3235, 8175.31050000002, 7898.98700000002, 7629.79100000003, 7413.76199999999, 7149.92300000001, 6921.12650000001, 6677.1545, 6443.28000000003, 6278.23450000002, 6014.20049999998, 5791.20299999998, 5605.78450000001, 5438.48800000001, 5234.2255, 5059.6825, 4887.43349999998, 4682.935, 4496.31099999999, 4322.52250000002, 4191.42499999999, 4021.24200000003, 3900.64799999999, 3762.84250000003, 3609.98050000001, 3502.29599999997, 3363.84250000003, 3206.54849999998, 3079.70000000001, 2971.42300000001, 2867.80349999998, 2727.08100000001, 2630.74900000001, 2496.6165, 2440.902, 2356.19150000002, 2235.58199999999, 2120.54149999999, 2012.25449999998, 1933.35600000003, 1820.93099999998, 1761.54800000001, 1663.09350000002, 1578.84600000002, 1509.48149999999, 1427.3345, 1379.56150000001, 1306.68099999998, 1212.63449999999, 1084.17300000001, 1124.16450000001, 1060.69949999999, 1007.48849999998, 941.194499999983, 879.880500000028, 836.007500000007, 782.802000000025, 748.385499999975, 647.991500000004, 626.730500000005, 570.776000000013, 484.000500000024, 513.98550000001, 418.985499999952, 386.996999999974, 370.026500000036, 355.496999999974, 356.731499999994, 255.92200000002, 259.094000000041, 205.434499999974, 165.374500000034, 197.347500000033, 95.718499999959, 67.6165000000037, 54.6970000000438, 31.7395000000251, -15.8784999999916, 8.42500000004657, -26.3754999999655, -118.425500000012, -66.6629999999423, -42.9745000000112, -107.364999999991, -189.839000000036, -162.611499999999, -164.964999999967, -189.079999999958, -223.931499999948, -235.329999999958, -269.639500000048, 
-249.087999999989, -206.475499999942, -283.04449999996, -290.667000000016, -304.561499999953, -336.784499999951, -380.386500000022, -283.280499999993, -364.533000000054, -389.059499999974, -364.454000000027, -415.748000000021, -417.155000000028}; CUCO_HLL_TUNING_ARR_DECL bias_data_p18{189083.0, 185696.913, 182348.774, 179035.946, 175762.762, 172526.444, 169329.754, 166166.099, 163043.269, 159958.91, 156907.912, 153906.845, 150924.199, 147996.568, 145093.457, 142239.233, 139421.475, 136632.27, 133889.588, 131174.2, 128511.619, 125868.621, 123265.385, 120721.061, 118181.769, 115709.456, 113252.446, 110840.198, 108465.099, 106126.164, 103823.469, 101556.618, 99308.004, 97124.508, 94937.803, 92833.731, 90745.061, 88677.627, 86617.47, 84650.442, 82697.833, 80769.132, 78879.629, 77014.432, 75215.626, 73384.587, 71652.482, 69895.93, 68209.301, 66553.669, 64921.981, 63310.323, 61742.115, 60205.018, 58698.658, 57190.657, 55760.865, 54331.169, 52908.167, 51550.273, 50225.254, 48922.421, 47614.533, 46362.049, 45098.569, 43926.083, 42736.03, 41593.473, 40425.26, 39316.237, 38243.651, 37170.617, 36114.609, 35084.19, 34117.233, 33206.509, 32231.505, 31318.728, 30403.404, 29540.0550000001, 28679.236, 27825.862, 26965.216, 26179.148, 25462.08, 24645.952, 23922.523, 23198.144, 22529.128, 21762.4179999999, 21134.779, 20459.117, 19840.818, 19187.04, 18636.3689999999, 17982.831, 17439.7389999999, 16874.547, 16358.2169999999, 15835.684, 15352.914, 14823.681, 14329.313, 13816.897, 13342.874, 12880.882, 12491.648, 12021.254, 11625.392, 11293.7610000001, 10813.697, 10456.209, 10099.074, 9755.39000000001, 9393.18500000006, 9047.57900000003, 8657.98499999999, 8395.85900000005, 8033.0, 7736.95900000003, 7430.59699999995, 7258.47699999996, 6924.58200000005, 6691.29399999999, 6357.92500000005, 6202.05700000003, 5921.19700000004, 5628.28399999999, 5404.96799999999, 5226.71100000001, 4990.75600000005, 4799.77399999998, 4622.93099999998, 4472.478, 4171.78700000001, 3957.46299999999, 
3868.95200000005, 3691.14300000004, 3474.63100000005, 3341.67200000002, 3109.14000000001, 3071.97400000005, 2796.40399999998, 2756.17799999996, 2611.46999999997, 2471.93000000005, 2382.26399999997, 2209.22400000005, 2142.28399999999, 2013.96100000001, 1911.18999999994, 1818.27099999995, 1668.47900000005, 1519.65800000005, 1469.67599999998, 1367.13800000004, 1248.52899999998, 1181.23600000003, 1022.71900000004, 1088.20700000005, 959.03600000008, 876.095999999903, 791.183999999892, 703.337000000058, 731.949999999953, 586.86400000006, 526.024999999907, 323.004999999888, 320.448000000091, 340.672999999952, 309.638999999966, 216.601999999955, 102.922999999952, 19.2399999999907, -0.114000000059605, -32.6240000000689, -89.3179999999702, -153.497999999905, -64.2970000000205, -143.695999999996, -259.497999999905, -253.017999999924, -213.948000000091, -397.590000000084, -434.006000000052, -403.475000000093, -297.958000000101, -404.317000000039, -528.898999999976, -506.621000000043, -513.205000000075, -479.351000000024, -596.139999999898, -527.016999999993, -664.681000000099, -680.306000000099, -704.050000000047, -850.486000000034, -757.43200000003, -713.308999999892}; -template +template __host__ __device__ auto const& bias_data() noexcept; template <> From d6a9a4e9efdffff72cc9b516a2457bfe7e8d229f Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 1 Feb 2024 17:08:24 +0000 Subject: [PATCH 16/78] Initialize shmem storage using placement new --- examples/distinct_count_estimator/device_ref_example.cu | 4 ++++ include/cuco/detail/hyperloglog/kernels.cuh | 3 +++ 2 files changed, 7 insertions(+) diff --git a/examples/distinct_count_estimator/device_ref_example.cu b/examples/distinct_count_estimator/device_ref_example.cu index 82e34b5c9..8634e6b12 100644 --- a/examples/distinct_count_estimator/device_ref_example.cu +++ b/examples/distinct_count_estimator/device_ref_example.cu @@ -43,6 +43,10 @@ __global__ void 
piggyback_kernel(RefType ref, InputIt first, std::size_t n) auto idx = blockDim.x * blockIdx.x + threadIdx.x; auto const block = cooperative_groups::this_thread_block(); + // Initialize the local storage object + if (block.thread_rank() == 0) { new (&local_storage) typename local_ref_type::storage_type{}; } + block.sync(); + // Create the local estimator with the shared memory storage local_ref_type local_ref(local_storage, {}); diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh index fd3a2a877..07f16b097 100644 --- a/include/cuco/detail/hyperloglog/kernels.cuh +++ b/include/cuco/detail/hyperloglog/kernels.cuh @@ -43,6 +43,9 @@ CUCO_KERNEL void add_shmem(InputIt first, cuco::detail::index_type n, RefType re auto idx = cuco::detail::global_thread_id(); auto const block = cooperative_groups::this_thread_block(); + if (block.thread_rank() == 0) { new (&local_storage) typename local_ref_type::storage_type{}; } + block.sync(); + local_ref_type local_ref(local_storage, {}); local_ref.clear(block); block.sync(); From 891d6068b5deb342e726c0e5ab6483ae80be2c8a Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 1 Feb 2024 17:52:52 +0000 Subject: [PATCH 17/78] Add unit test --- tests/CMakeLists.txt | 5 + .../unique_sequence_test.cu | 105 ++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100644 tests/distinct_count_estimator/unique_sequence_test.cu diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e09efddb3..531556247 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -109,3 +109,8 @@ ConfigureTest(DYNAMIC_BITSET_TEST dynamic_bitset/rank_test.cu dynamic_bitset/select_test.cu dynamic_bitset/size_test.cu) + +################################################################################################### +# - distinct_count_estimator ---------------------------------------------------------------------- 
+ConfigureTest(DISTINCT_COUNT_ESTIMATOR_TEST + distinct_count_estimator/unique_sequence_test.cu) diff --git a/tests/distinct_count_estimator/unique_sequence_test.cu b/tests/distinct_count_estimator/unique_sequence_test.cu new file mode 100644 index 000000000..a4d07ba4d --- /dev/null +++ b/tests/distinct_count_estimator/unique_sequence_test.cu @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include + +TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence", + "", + ((typename T, int32_t Precision, typename Hash), T, Precision, Hash), + (int32_t, 9, cuco::xxhash_32), + (int32_t, 10, cuco::xxhash_32), + (int32_t, 11, cuco::xxhash_32), + (int32_t, 12, cuco::xxhash_32), + (int32_t, 13, cuco::xxhash_32), + (int32_t, 9, cuco::xxhash_64), + (int32_t, 10, cuco::xxhash_64), + (int32_t, 11, cuco::xxhash_64), + (int32_t, 12, cuco::xxhash_64), + (int32_t, 13, cuco::xxhash_64), + (int64_t, 9, cuco::xxhash_32), + (int64_t, 10, cuco::xxhash_32), + (int64_t, 11, cuco::xxhash_32), + (int64_t, 12, cuco::xxhash_32), + (int64_t, 13, cuco::xxhash_32), + (int64_t, 9, cuco::xxhash_64), + (int64_t, 10, cuco::xxhash_64), + (int64_t, 11, cuco::xxhash_64), + (int64_t, 12, cuco::xxhash_64), + (int64_t, 13, cuco::xxhash_64), + (__int128_t, 9, cuco::xxhash_32<__int128_t>), + (__int128_t, 10, 
cuco::xxhash_32<__int128_t>), + (__int128_t, 11, cuco::xxhash_32<__int128_t>), + (__int128_t, 12, cuco::xxhash_32<__int128_t>), + (__int128_t, 13, cuco::xxhash_32<__int128_t>), + (__int128_t, 9, cuco::xxhash_64<__int128_t>), + (__int128_t, 10, cuco::xxhash_64<__int128_t>), + (__int128_t, 11, cuco::xxhash_64<__int128_t>), + (__int128_t, 12, cuco::xxhash_64<__int128_t>), + (__int128_t, 13, cuco::xxhash_64<__int128_t>)) +{ + // This factor determines the error threshold for passing the test + // TODO might be too high + double constexpr tolerance_factor = 3.0; + // RSD for a given precision is given by the following formula + double const relative_standard_deviation = + 1.04 / std::sqrt(static_cast(1ull << Precision)); + + auto num_items_pow2 = GENERATE(25, 26, 28); + INFO("num_items=2^" << num_items_pow2); + auto num_items = 1ull << num_items_pow2; + + thrust::device_vector items(num_items); + + // Generate `num_items` distinct items + thrust::sequence(items.begin(), items.end(), 0); + + // Initialize the estimator + cuco::distinct_count_estimator estimator; + + REQUIRE(estimator.estimate() == 0); + + // Add all items to the estimator + estimator.add(items.begin(), items.end()); + + auto const estimate = estimator.estimate(); + + // Adding the same items again should not affect the result + estimator.add(items.begin(), items.begin() + num_items / 2); + REQUIRE(estimator.estimate() == estimate); + + // Clearing the estimator shoult reset the estimate + estimator.clear(); + REQUIRE(estimator.estimate() == 0); + + double const relative_error = + std::abs(static_cast(num_items) - static_cast(estimate)) / num_items; + + // Check if the error is acceptable + REQUIRE(relative_error < tolerance_factor * relative_standard_deviation); +} From 35441950b86b49e572c8dd1fe3932cc9f7c043d2 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 1 Feb 2024 23:17:35 +0000 Subject: [PATCH 18/78] Remove experimental cg async reduce since it 
is buggy --- include/cuco/detail/__config | 5 ----- include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 5 ----- 2 files changed, 10 deletions(-) diff --git a/include/cuco/detail/__config b/include/cuco/detail/__config index fd3b6fce4..812a4e631 100644 --- a/include/cuco/detail/__config +++ b/include/cuco/detail/__config @@ -51,11 +51,6 @@ #define CUCO_HAS_INT128 #endif -#if (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 8) -#define CUCO_HAS_CG_EXPERIMENTAL_REDUCE_UPDATE_ASYNC -#define _CG_ABI_EXPERIMENTAL -#endif - #if (__CUDACC_VER_MAJOR__ >= 12) #define CUCO_HAS_CG_REDUCE_UPDATE_ASYNC #endif \ No newline at end of file diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index d6f362c5f..c25d68c8e 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -148,11 +148,6 @@ class hyperloglog_ref { warp, block_sum, thread_sum, cooperative_groups::plus()); cooperative_groups::reduce_update_async( warp, block_zeroes, thread_zeroes, cooperative_groups::plus()); -#elif defined(CUCO_HAS_CG_EXPERIMENTAL_REDUCE_UPDATE_ASYNC) - cooperative_groups::experimental::reduce_update_async( - warp, block_sum, thread_sum, cooperative_groups::plus()); - cooperative_groups::experimental::reduce_update_async( - warp, block_zeroes, thread_zeroes, cooperative_groups::plus()); #else auto const warp_sum = cooperative_groups::reduce(warp, thread_sum, cooperative_groups::plus()); From 3506ecbf2673d3d012e8c0872efd8110faa37486 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 2 Feb 2024 00:15:40 +0000 Subject: [PATCH 19/78] Fix bit-shifting bug that lead to high error rates --- include/cuco/detail/hyperloglog/hyperloglog.cuh | 3 ++- include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh 
b/include/cuco/detail/hyperloglog/hyperloglog.cuh index a08cfa942..a9288bb80 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -278,10 +278,11 @@ class hyperloglog { using fp_type = typename ref_type<>::fp_type; fp_type sum = 0; int zeroes = 0; + // geometric mean computation + count registers with 0s for (std::size_t i = 0; i < registers.size(); ++i) { auto const reg = registers[i]; - sum += fp_type{1} / static_cast(1 << reg); + sum += fp_type{1} / static_cast(1ull << reg); zeroes += reg == 0; } diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index c25d68c8e..5fe8d5c3e 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -89,7 +89,7 @@ class hyperloglog_ref { __device__ void add(T const& item) noexcept { // static_assert NumBuckets is not too big - auto constexpr register_mask = (1 << Precision) - 1; + auto constexpr register_mask = (1ull << Precision) - 1; auto const h = this->hash_(item); auto const reg = h & register_mask; auto const zeroes = cuda::std::countl_zero(h | register_mask) + 1; // __clz From 919d0abe8f5b48408c19d34a4d442899fd2aa9c0 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 2 Feb 2024 00:26:14 +0000 Subject: [PATCH 20/78] Storage cleanups --- .../cuco/detail/hyperloglog/hyperloglog.cuh | 2 +- include/cuco/detail/hyperloglog/storage.cuh | 65 ++----------------- 2 files changed, 8 insertions(+), 59 deletions(-) diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh index a9288bb80..f95f7859f 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -280,7 +280,7 @@ class hyperloglog { int zeroes = 0; // geometric mean computation + count registers with 0s - for 
(std::size_t i = 0; i < registers.size(); ++i) { + for (int i = 0; i < registers.size(); ++i) { auto const reg = registers[i]; sum += fp_type{1} / static_cast(1ull << reg); zeroes += reg == 0; diff --git a/include/cuco/detail/hyperloglog/storage.cuh b/include/cuco/detail/hyperloglog/storage.cuh index fe0a4ff7a..5ca525c69 100644 --- a/include/cuco/detail/hyperloglog/storage.cuh +++ b/include/cuco/detail/hyperloglog/storage.cuh @@ -35,6 +35,8 @@ namespace cuco::detail { template class hyperloglog_dense_registers { public: + // We use `int` here since this is the smallest type that supports native `atomicMax` on GPUs + using register_type = int; ///< Register array storage /** * @brief Clears the storage. * @@ -48,14 +50,6 @@ class hyperloglog_dense_registers { for (int i = group.thread_rank(); i < this->registers_.size(); i += group.size()) { this->registers_[i] = 0; } - - // TODO remove test code - // int4 constexpr empty{0, 0, 0, 0}; - // auto vec4 = reinterpret_cast(this->storage_.data()); - // // #pragma unroll 2 - // for (int i = group.thread_rank(); i < (this->storage_.size() / 4); i += group.size()) { - // vec4[i] = empty; - // } } /** @@ -66,10 +60,7 @@ class hyperloglog_dense_registers { * * @return Reference to the requested element */ - __host__ __device__ constexpr int& operator[](std::size_t i) noexcept - { - return this->registers_[i]; - } + __host__ __device__ constexpr int& operator[](int i) noexcept { return this->registers_[i]; } /** * @brief Returns the element at specified location `i`. No bounds checking is performed. @@ -78,20 +69,14 @@ class hyperloglog_dense_registers { * * @return Requested element */ - __host__ __device__ constexpr int operator[](std::size_t i) const noexcept - { - return this->registers_[i]; - } + __host__ __device__ constexpr int operator[](int i) const noexcept { return this->registers_[i]; } /** * @brief Returns the number of elements in the container. 
* * @return The number of elements in the container */ - __host__ __device__ constexpr std::size_t size() const noexcept - { - return this->registers_.size(); - } + __host__ __device__ constexpr int size() const noexcept { return this->registers_.size(); } /** * @brief Atomically updates the register at position `i` with `max(reg[i], value)`. @@ -102,7 +87,7 @@ class hyperloglog_dense_registers { * @param value New value */ template - __device__ constexpr void update_max(std::size_t i, int value) noexcept + __device__ constexpr void update_max(int i, register_type value) noexcept { if constexpr (Scope == cuda::thread_scope_thread) { this->registers_[i] = max(this->registers_[i], value); @@ -133,45 +118,9 @@ class hyperloglog_dense_registers { for (int i = group.thread_rank(); i < this->registers_.size(); i += group.size()) { this->update_max(i, other.registers_[i]); } - - // TODO remove test code - /* - auto vec4 = reinterpret_cast(other.storage_.data()); - // #pragma unroll 2 - for (int i = group.thread_rank(); i < (this->storage_.size() / 4); i += group.size()) { - auto const items = vec4[i]; - if constexpr (Scope == cuda::thread_scope_thread) { - auto max_vec4 = reinterpret_cast(this->storage_.data()); - auto max_items = max_vec4[i]; - max_items.x = max(max_items.x, items.x); - max_items.y = max(max_items.y, items.y); - max_items.z = max(max_items.z, items.z); - max_items.w = max(max_items.w, items.w); - max_vec4[i] = max_items; - } else if constexpr (Scope == cuda::thread_scope_block) { - atomicMax_block(this->storage_.data() + (i * 4 + 0), items.x); - atomicMax_block(this->storage_.data() + (i * 4 + 1), items.y); - atomicMax_block(this->storage_.data() + (i * 4 + 2), items.z); - atomicMax_block(this->storage_.data() + (i * 4 + 3), items.w); - } else if constexpr (Scope == cuda::thread_scope_device) { - atomicMax(this->storage_.data() + (i * 4 + 0), items.x); - atomicMax(this->storage_.data() + (i * 4 + 1), items.y); - atomicMax(this->storage_.data() + (i * 4 + 
2), items.z); - atomicMax(this->storage_.data() + (i * 4 + 3), items.w); - } else if constexpr (Scope == cuda::thread_scope_system) { - atomicMax_system(this->storage_.data() + (i * 4 + 0), items.x); - atomicMax_system(this->storage_.data() + (i * 4 + 1), items.y); - atomicMax_system(this->storage_.data() + (i * 4 + 2), items.z); - atomicMax_system(this->storage_.data() + (i * 4 + 3), items.w); - } else { - static_assert(cuco::dependent_false, "Unsupported thread scope"); - } - } - */ } private: - alignas(sizeof(int) * - 4) cuda::std::array registers_; ///< Register array storage + cuda::std::array registers_; ///< Register array storage }; } // namespace cuco::detail From 52f6e09e10d4e92c9fd028ab5c741da37ad30d6b Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 2 Feb 2024 00:44:46 +0000 Subject: [PATCH 21/78] Update readme --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 30937c0b6..069a70897 100644 --- a/README.md +++ b/README.md @@ -232,4 +232,12 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection #### Examples: - [Host-bulk APIs (TODO)]() +### `distinct_count_estimator` + +`cuco::distinct_count_estimator` implements the famous [HyperLogLog++ algorithm](https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf) for approximating the count of distinct items in a multiset/stream. 
+ +#### Examples: +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/EG7cMssxo)) +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/va8eE9dqb)) + From 0a0119d9cc6e42ee0885b8683285ad3d4621c11a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 2 Feb 2024 01:45:52 +0100 Subject: [PATCH 22/78] Fix typo Co-authored-by: Yunsong Wang --- tests/distinct_count_estimator/unique_sequence_test.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/distinct_count_estimator/unique_sequence_test.cu b/tests/distinct_count_estimator/unique_sequence_test.cu index a4d07ba4d..23c86321d 100644 --- a/tests/distinct_count_estimator/unique_sequence_test.cu +++ b/tests/distinct_count_estimator/unique_sequence_test.cu @@ -93,7 +93,7 @@ TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence", estimator.add(items.begin(), items.begin() + num_items / 2); REQUIRE(estimator.estimate() == estimate); - // Clearing the estimator shoult reset the estimate + // Clearing the estimator should reset the estimate estimator.clear(); REQUIRE(estimator.estimate() == 0); From ab50bed27ac1331caccc913bba897ca7294b768f Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 2 Feb 2024 00:48:02 +0000 Subject: [PATCH 23/78] Fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 069a70897..5283fcf3e 100644 --- a/README.md +++ b/README.md @@ -238,6 +238,6 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection #### Examples: - [Host-bulk 
APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/EG7cMssxo)) -- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/va8eE9dqb)) +- [Device-ref APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/va8eE9dqb)) From 68d2df07658c41a872e8a5ad928966c46c73532d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Sat, 3 Feb 2024 02:24:58 +0100 Subject: [PATCH 24/78] Apply suggestions from code review Co-authored-by: Yunsong Wang --- README.md | 2 +- benchmarks/distinct_count_estimator_bench.cu | 3 ++- examples/distinct_count_estimator/device_ref_example.cu | 2 +- include/cuco/detail/hyperloglog/finalizer.cuh | 3 ++- include/cuco/detail/hyperloglog/hyperloglog.cuh | 3 +-- include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 3 ++- 6 files changed, 9 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 5283fcf3e..48b598da9 100644 --- a/README.md +++ b/README.md @@ -234,7 +234,7 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection ### `distinct_count_estimator` -`cuco::distinct_count_estimator` implements the famous [HyperLogLog++ algorithm](https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf) for approximating the count of distinct items in a multiset/stream. +`cuco::distinct_count_estimator` implements the well-established [HyperLogLog++ algorithm](https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf) for approximating the count of distinct items in a multiset/stream. 
#### Examples: - [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/EG7cMssxo)) diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu index 7ceb305b4..12504f120 100644 --- a/benchmarks/distinct_count_estimator_bench.cu +++ b/benchmarks/distinct_count_estimator_bench.cu @@ -26,9 +26,10 @@ #include #include -#include #include +#include + using namespace cuco::benchmark; using namespace cuco::utility; diff --git a/examples/distinct_count_estimator/device_ref_example.cu b/examples/distinct_count_estimator/device_ref_example.cu index 8634e6b12..2701e34b7 100644 --- a/examples/distinct_count_estimator/device_ref_example.cu +++ b/examples/distinct_count_estimator/device_ref_example.cu @@ -22,7 +22,7 @@ #include /** - * @file device_reference_example.cu + * @file device_ref_example.cu * @brief Demonstrates usage of `cuco::distinct_count_estimator` device-side APIs. * * This example demonstrates how the non-owning reference type `cuco::distinct_count_estimator_ref` diff --git a/include/cuco/detail/hyperloglog/finalizer.cuh b/include/cuco/detail/hyperloglog/finalizer.cuh index 18af4ca1b..2129783fd 100644 --- a/include/cuco/detail/hyperloglog/finalizer.cuh +++ b/include/cuco/detail/hyperloglog/finalizer.cuh @@ -17,10 +17,11 @@ #include -#include #include #include +#include + namespace cuco::hyperloglog_ns::detail { /** diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh index f95f7859f..6d4f9ef58 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -75,7 +75,7 @@ class hyperloglog { storage_deleter_{storage_allocator_}, storage_{storage_allocator_.allocate(1ull), storage_deleter_} { - this->clear_async(stream); // TODO async or sync? 
+ this->clear_async(stream); } ~hyperloglog() = default; @@ -84,7 +84,6 @@ class hyperloglog { hyperloglog& operator=(hyperloglog const&) = delete; hyperloglog(hyperloglog&&) = default; ///< Move constructor - // TODO this is somehow required to pass the Doxygen check. /** * @brief Copy-assignment operator. * diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index 5fe8d5c3e..a045ca20c 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -22,12 +22,13 @@ #include #include -#include #include #include #include +#include + namespace cuco::detail { /** * @brief A GPU-accelerated utility for approximating the number of distinct items in a multiset. From 93f68a2515214d9b59e58cb773fa3ae3af2c8bf7 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 6 Feb 2024 15:26:14 +0000 Subject: [PATCH 25/78] Use CUDART_VERSION instead of (__CUDACC_VER_MAJOR__ --- include/cuco/detail/__config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/detail/__config b/include/cuco/detail/__config index 812a4e631..ba300d4dc 100644 --- a/include/cuco/detail/__config +++ b/include/cuco/detail/__config @@ -51,6 +51,6 @@ #define CUCO_HAS_INT128 #endif -#if (__CUDACC_VER_MAJOR__ >= 12) +#if defined(CUDART_VERSION) && (CUDART_VERSION >= 12000) #define CUCO_HAS_CG_REDUCE_UPDATE_ASYNC #endif \ No newline at end of file From 03a85728c49d0df7f76366fa6a556264cd6f16b3 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 6 Feb 2024 16:38:44 +0000 Subject: [PATCH 26/78] Apply suggestions from code review --- .../cuco/detail/hyperloglog/hyperloglog.cuh | 49 ++++++++----------- .../detail/hyperloglog/hyperloglog_ref.cuh | 4 +- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh 
b/include/cuco/detail/hyperloglog/hyperloglog.cuh index 6d4f9ef58..94850c0c7 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -16,10 +16,13 @@ #pragma once #include +#include #include #include #include #include +#include +#include #include #include #include @@ -54,11 +57,12 @@ class hyperloglog { using ref_type = hyperloglog_ref; ///< Non-owning reference ///< type - using allocator_type = Allocator; ///< Allocator type - using value_type = typename ref_type<>::value_type; ///< Type of items to count - using storage_type = typename ref_type<>::storage_type; ///< Storage type - using storage_allocator_type = typename std::allocator_traits::template rebind_alloc< - storage_type>; ///< Storage allocator type + using value_type = typename ref_type<>::value_type; ///< Type of items to count + using storage_type = typename ref_type<>::storage_type; ///< Storage type + using hash_type = typename ref_type<>::hash_type; ///< Hash function type + using allocator_type = + typename std::allocator_traits::template rebind_alloc; ///< Allocator + ///< type /** * @brief Constructs a `hyperloglog` host object. 
@@ -71,9 +75,9 @@ class hyperloglog { */ constexpr hyperloglog(Hash const& hash, Allocator const& alloc, cuco::cuda_stream_ref stream) : hash_{hash}, - storage_allocator_{alloc}, - storage_deleter_{storage_allocator_}, - storage_{storage_allocator_.allocate(1ull), storage_deleter_} + allocator_{alloc}, + deleter_{1ull, allocator_}, + storage_{allocator_.allocate(1ull), deleter_} { this->clear_async(stream); } @@ -128,9 +132,9 @@ class hyperloglog { * @param stream CUDA stream this operation is executed in */ template - void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream) noexcept + void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream) { - auto const num_items = cuco::detail::distance(first, last); // TODO include + auto const num_items = cuco::detail::distance(first, last); if (num_items == 0) { return; } // TODO fallback to local memory registers in case they don't fit in shmem @@ -141,9 +145,8 @@ class hyperloglog { // We make use of the occupancy calculator here to get the minimum number of blocks which still // saturate the GPU. This reduces the atomic contention on the final register array during the // merge phase. - // TODO check cuda error or will it sync the stream?? 
- cudaOccupancyMaxPotentialBlockSize( - &grid_size, &block_size, &cuco::hyperloglog_ns::detail::add_shmem>); + CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize( + &grid_size, &block_size, &cuco::hyperloglog_ns::detail::add_shmem>)); cuco::hyperloglog_ns::detail::add_shmem<<>>( first, num_items, this->ref()); @@ -314,22 +317,10 @@ class hyperloglog { [[nodiscard]] auto hash() const noexcept { return this->hash_; } private: - struct storage_deleter { - using pointer = typename storage_allocator_type::value_type*; - - storage_deleter(storage_allocator_type& a) : allocator{a} {} - - storage_deleter(storage_deleter const&) = default; - - void operator()(pointer ptr) { allocator.deallocate(ptr, 1); } - - storage_allocator_type& allocator; - }; - - Hash hash_; ///< Hash function used to hash items - storage_allocator_type storage_allocator_; ///< Storage allocator - storage_deleter storage_deleter_; ///< Storage deleter - std::unique_ptr storage_; ///< Storage + hash_type hash_; ///< Hash function used to hash items + allocator_type allocator_; ///< Storage allocator + custom_deleter deleter_; ///< Storage deleter + std::unique_ptr> storage_; ///< Storage // Needs to be friends with other instantiations of this class template to have access to their // storage diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index a045ca20c..c9ace51c3 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -53,6 +53,7 @@ class hyperloglog_ref { using value_type = T; ///< Type of items to count using storage_type = hyperloglog_dense_registers; ///< Storage type + using hash_type = Hash; ///< Hash function type template using with_scope = hyperloglog_ref; ///< Ref type with different @@ -173,8 +174,7 @@ class hyperloglog_ref { } private: - Hash hash_; ///< Hash function used to hash items - // TODO is a reference the right choice here?? 
+ hash_type hash_; ///< Hash function used to hash items storage_type& storage_; ///< Reference to storage object template From 33f7bafbdfc130b8d459b85325f818ab417b64e8 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 16 Feb 2024 01:02:54 +0000 Subject: [PATCH 27/78] Enable Precision>18; fix some bugs, extend tests. --- include/cuco/detail/hyperloglog/finalizer.cuh | 17 ++++----- .../cuco/detail/hyperloglog/hyperloglog.cuh | 28 ++++++++++----- .../detail/hyperloglog/hyperloglog_ref.cuh | 11 +++--- include/cuco/detail/hyperloglog/kernels.cuh | 22 +++++++++--- .../unique_sequence_test.cu | 36 +++++++------------ 5 files changed, 63 insertions(+), 51 deletions(-) diff --git a/include/cuco/detail/hyperloglog/finalizer.cuh b/include/cuco/detail/hyperloglog/finalizer.cuh index 2129783fd..8b221f6ba 100644 --- a/include/cuco/detail/hyperloglog/finalizer.cuh +++ b/include/cuco/detail/hyperloglog/finalizer.cuh @@ -56,15 +56,15 @@ class finalizer { if (v > 0) { // Use linear counting for small cardinality estimates. double const h = m * log(static_cast(m) / v); - // HLL++ is defined only when p < 19, otherwise we need to fallback to HLL. // The threshold `2.5 * m` is from the original HLL algorithm. - if ((Precision < 19 and h <= threshold()) or e <= 2.5 * m) { - e = h; - } else { - e = bias_corrected_estimate(e); + if (e <= 2.5 * m) { return cuda::std::round(h); } + + if constexpr (Precision < 19) { + e = (h <= threshold()) ? h : bias_corrected_estimate(e); } } else { - e = bias_corrected_estimate(e); + // HLL++ is defined only when p < 19, otherwise we need to fallback to HLL. + if constexpr (Precision < 19) { e = bias_corrected_estimate(e); } } return cuda::std::round(e); @@ -89,10 +89,7 @@ class finalizer { __host__ __device__ static double constexpr bias_corrected_estimate(double e) noexcept { - if constexpr (Precision < 19) { - if (e < 5.0 * m) { return e - bias(e); } - } - return e; + return (e < 5.0 * m) ? 
e - bias(e) : e; } __host__ __device__ static double constexpr bias(double e) noexcept diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh index 94850c0c7..986166836 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -137,19 +137,31 @@ class hyperloglog { auto const num_items = cuco::detail::distance(first, last); if (num_items == 0) { return; } - // TODO fallback to local memory registers in case they don't fit in shmem - - int grid_size = 0; - int block_size = 0; + int grid_size = 0; + int block_size = 0; + int const shmem_bytes = sizeof(storage_type); // We make use of the occupancy calculator here to get the minimum number of blocks which still // saturate the GPU. This reduces the atomic contention on the final register array during the // merge phase. CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize( - &grid_size, &block_size, &cuco::hyperloglog_ns::detail::add_shmem>)); - - cuco::hyperloglog_ns::detail::add_shmem<<>>( - first, num_items, this->ref()); + &grid_size, + &block_size, + &cuco::hyperloglog_ns::detail::add_shmem>, + shmem_bytes)); + + if (grid_size != 0) { // use shmem codepath + cuco::hyperloglog_ns::detail::add_shmem<<>>( + first, num_items, this->ref()); + } else { // use gmem codepath since there is not enough shmem available + block_size = 0; + CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize( + &grid_size, &block_size, &cuco::hyperloglog_ns::detail::add_gmem>)); + CUCO_EXPECTS(grid_size != 0, "Invalid kernel launch configuration"); + + cuco::hyperloglog_ns::detail::add_gmem<<>>( + first, num_items, this->ref()); + } } /** diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index c9ace51c3..e2d57c65d 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -23,6 +23,7 @@ #include 
#include +#include #include #include @@ -90,11 +91,11 @@ class hyperloglog_ref { */ __device__ void add(T const& item) noexcept { - // static_assert NumBuckets is not too big - auto constexpr register_mask = (1ull << Precision) - 1; - auto const h = this->hash_(item); - auto const reg = h & register_mask; - auto const zeroes = cuda::std::countl_zero(h | register_mask) + 1; // __clz + using hash_value_type = decltype(cuda::std::declval()(cuda::std::declval())); + hash_value_type constexpr register_mask = (1ull << Precision) - 1; + auto const h = this->hash_(item); + auto const reg = h & register_mask; + auto const zeroes = cuda::std::countl_zero(h | register_mask) + 1; // __clz this->storage_.update_max(reg, zeroes); } diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh index 07f16b097..653caac95 100644 --- a/include/cuco/detail/hyperloglog/kernels.cuh +++ b/include/cuco/detail/hyperloglog/kernels.cuh @@ -35,18 +35,20 @@ CUCO_KERNEL void clear(RefType ref) template CUCO_KERNEL void add_shmem(InputIt first, cuco::detail::index_type n, RefType ref) { - using local_ref_type = typename RefType::with_scope; + using local_ref_type = typename RefType::with_scope; + using local_storage_type = typename local_ref_type::storage_type; - __shared__ typename local_ref_type::storage_type local_storage; + alignas(16) extern __shared__ char shmem[]; + local_storage_type* local_storage = reinterpret_cast(shmem); auto const loop_stride = cuco::detail::grid_stride(); auto idx = cuco::detail::global_thread_id(); auto const block = cooperative_groups::this_thread_block(); - if (block.thread_rank() == 0) { new (&local_storage) typename local_ref_type::storage_type{}; } + if (block.thread_rank() == 0) { new (local_storage) local_storage_type{}; } block.sync(); - local_ref_type local_ref(local_storage, {}); + local_ref_type local_ref(*local_storage, {}); local_ref.clear(block); block.sync(); @@ -59,6 +61,18 @@ CUCO_KERNEL void 
add_shmem(InputIt first, cuco::detail::index_type n, RefType re ref.merge(block, local_ref); } +template +CUCO_KERNEL void add_gmem(InputIt first, cuco::detail::index_type n, RefType ref) +{ + auto const loop_stride = cuco::detail::grid_stride(); + auto idx = cuco::detail::global_thread_id(); + + while (idx < n) { + ref.add(*(first + idx)); + idx += loop_stride; + } +} + template CUCO_KERNEL void merge(OtherRefType other_ref, RefType ref) { diff --git a/tests/distinct_count_estimator/unique_sequence_test.cu b/tests/distinct_count_estimator/unique_sequence_test.cu index 23c86321d..9ebbc6291 100644 --- a/tests/distinct_count_estimator/unique_sequence_test.cu +++ b/tests/distinct_count_estimator/unique_sequence_test.cu @@ -32,40 +32,28 @@ TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence", "", ((typename T, int32_t Precision, typename Hash), T, Precision, Hash), - (int32_t, 9, cuco::xxhash_32), - (int32_t, 10, cuco::xxhash_32), - (int32_t, 11, cuco::xxhash_32), - (int32_t, 12, cuco::xxhash_32), - (int32_t, 13, cuco::xxhash_32), (int32_t, 9, cuco::xxhash_64), - (int32_t, 10, cuco::xxhash_64), (int32_t, 11, cuco::xxhash_64), - (int32_t, 12, cuco::xxhash_64), (int32_t, 13, cuco::xxhash_64), - (int64_t, 9, cuco::xxhash_32), - (int64_t, 10, cuco::xxhash_32), - (int64_t, 11, cuco::xxhash_32), - (int64_t, 12, cuco::xxhash_32), - (int64_t, 13, cuco::xxhash_32), + (int32_t, 16, cuco::xxhash_64), + (int32_t, 18, cuco::xxhash_64), + (int32_t, 20, cuco::xxhash_64), (int64_t, 9, cuco::xxhash_64), - (int64_t, 10, cuco::xxhash_64), (int64_t, 11, cuco::xxhash_64), - (int64_t, 12, cuco::xxhash_64), (int64_t, 13, cuco::xxhash_64), - (__int128_t, 9, cuco::xxhash_32<__int128_t>), - (__int128_t, 10, cuco::xxhash_32<__int128_t>), - (__int128_t, 11, cuco::xxhash_32<__int128_t>), - (__int128_t, 12, cuco::xxhash_32<__int128_t>), - (__int128_t, 13, cuco::xxhash_32<__int128_t>), + (int64_t, 16, cuco::xxhash_64), + (int64_t, 18, cuco::xxhash_64), + (int64_t, 20, cuco::xxhash_64), 
(__int128_t, 9, cuco::xxhash_64<__int128_t>), - (__int128_t, 10, cuco::xxhash_64<__int128_t>), (__int128_t, 11, cuco::xxhash_64<__int128_t>), - (__int128_t, 12, cuco::xxhash_64<__int128_t>), - (__int128_t, 13, cuco::xxhash_64<__int128_t>)) + (__int128_t, 13, cuco::xxhash_64<__int128_t>), + (__int128_t, 16, cuco::xxhash_64<__int128_t>), + (__int128_t, 18, cuco::xxhash_64<__int128_t>), + (__int128_t, 20, cuco::xxhash_64<__int128_t>)) { // This factor determines the error threshold for passing the test // TODO might be too high - double constexpr tolerance_factor = 3.0; + double constexpr tolerance_factor = 2.5; // RSD for a given precision is given by the following formula double const relative_standard_deviation = 1.04 / std::sqrt(static_cast(1ull << Precision)); @@ -80,7 +68,7 @@ TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence", thrust::sequence(items.begin(), items.end(), 0); // Initialize the estimator - cuco::distinct_count_estimator estimator; + cuco::distinct_count_estimator estimator; REQUIRE(estimator.estimate() == 0); From b1253bfa7b0860b85dddaac0e1d82d6b63696b94 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 13 Mar 2024 12:46:13 +0000 Subject: [PATCH 28/78] Remove storage class and move host implementations to ref class --- .../device_ref_example.cu | 9 +- .../distinct_count_estimator.inl | 29 +- .../distinct_count_estimator_ref.inl | 89 +++++- .../cuco/detail/hyperloglog/hyperloglog.cuh | 159 +++------- .../detail/hyperloglog/hyperloglog_ref.cuh | 276 +++++++++++++++++- include/cuco/detail/hyperloglog/kernels.cuh | 14 +- include/cuco/distinct_count_estimator.cuh | 29 +- include/cuco/distinct_count_estimator_ref.cuh | 126 +++++++- 8 files changed, 583 insertions(+), 148 deletions(-) diff --git a/examples/distinct_count_estimator/device_ref_example.cu b/examples/distinct_count_estimator/device_ref_example.cu index 2701e34b7..845634388 100644 --- 
a/examples/distinct_count_estimator/device_ref_example.cu +++ b/examples/distinct_count_estimator/device_ref_example.cu @@ -37,18 +37,15 @@ __global__ void piggyback_kernel(RefType ref, InputIt first, std::size_t n) using local_ref_type = typename RefType::with_scope; // Shared memory storage for the block-local estimator - __shared__ typename local_ref_type::storage_type local_storage; + alignas(local_ref_type::sketch_alignment()) + __shared__ std::byte local_sketch[local_ref_type::sketch_bytes()]; auto const loop_stride = gridDim.x * blockDim.x; auto idx = blockDim.x * blockIdx.x + threadIdx.x; auto const block = cooperative_groups::this_thread_block(); - // Initialize the local storage object - if (block.thread_rank() == 0) { new (&local_storage) typename local_ref_type::storage_type{}; } - block.sync(); - // Create the local estimator with the shared memory storage - local_ref_type local_ref(local_storage, {}); + local_ref_type local_ref(cuda::std::span{local_sketch, local_ref_type::sketch_bytes()}, {}); // Initialize the local estimator local_ref.clear(block); diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl index 79488e0e1..df68a0593 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl @@ -98,6 +98,33 @@ template ::ref_type<> distinct_count_estimator::ref() const noexcept { - return ref_type<>{this->impl_->storage_ref(), this->impl_->hash()}; + return {this->sketch(), this->hash()}; } + +template +auto distinct_count_estimator::hash() const noexcept +{ + return this->impl_->hash(); +} + +template +auto distinct_count_estimator::sketch() const noexcept +{ + return this->impl_->sketch(); +} + +template +constexpr size_t +distinct_count_estimator::sketch_bytes() noexcept +{ + return impl_type::sketch_bytes(); +} + +template +constexpr 
size_t +distinct_count_estimator::sketch_alignment() noexcept +{ + return impl_type::sketch(); +} + } // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl index 3b940edfd..50bea1675 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl @@ -17,9 +17,10 @@ namespace cuco { template +template __host__ __device__ constexpr distinct_count_estimator_ref:: - distinct_count_estimator_ref(storage_type& storage, Hash const& hash) noexcept - : impl_{storage, hash} + distinct_count_estimator_ref(cuda::std::span sketch_span, Hash const& hash) noexcept + : impl_{sketch_span, hash} { } @@ -31,12 +32,42 @@ __device__ void distinct_count_estimator_ref::clear( this->impl_.clear(group); } +template +__host__ void distinct_count_estimator_ref::clear_async( + cuco::cuda_stream_ref stream) noexcept +{ + this->impl_.clear_async(stream); +} + +template +__host__ void distinct_count_estimator_ref::clear( + cuco::cuda_stream_ref stream) +{ + this->impl_.clear(stream); +} + template __device__ void distinct_count_estimator_ref::add(T const& item) noexcept { this->impl_.add(item); } +template +template +__host__ void distinct_count_estimator_ref::add_async( + InputIt first, InputIt last, cuco::cuda_stream_ref stream) +{ + this->impl_.add_async(first, last, stream); +} + +template +template +__host__ void distinct_count_estimator_ref::add( + InputIt first, InputIt last, cuco::cuda_stream_ref stream) +{ + this->impl_.add(first, last, stream); +} + template template __device__ void distinct_count_estimator_ref::merge( @@ -46,10 +77,64 @@ __device__ void distinct_count_estimator_ref::merge( this->impl_.merge(group, other.impl_); } +template +template +__host__ void distinct_count_estimator_ref::merge_async( + 
distinct_count_estimator_ref const& other, + cuco::cuda_stream_ref stream) noexcept +{ + this->impl_.merge_async(other, stream); +} + +template +template +__host__ void distinct_count_estimator_ref::merge( + distinct_count_estimator_ref const& other, + cuco::cuda_stream_ref stream) +{ + this->impl_.merge(other, stream); +} + template __device__ std::size_t distinct_count_estimator_ref::estimate( cooperative_groups::thread_block const& group) const noexcept { return this->impl_.estimate(group); } + +template +__host__ std::size_t distinct_count_estimator_ref::estimate( + cuco::cuda_stream_ref stream) const +{ + return this->impl_.estimate(stream); +} + +template +__host__ __device__ auto distinct_count_estimator_ref::hash() + const noexcept +{ + return this->impl_.hash(); +} + +template +__host__ __device__ auto distinct_count_estimator_ref::sketch() + const noexcept +{ + return this->impl_.sketch(); +} + +template +__host__ __device__ constexpr std::size_t +distinct_count_estimator_ref::sketch_bytes() noexcept +{ + return impl_type::sketch_bytes(); +} + +template +__host__ __device__ constexpr std::size_t +distinct_count_estimator_ref::sketch_alignment() noexcept +{ + return impl_type::sketch_alignment(); +} + } // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh index 986166836..56e13da66 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -17,14 +17,9 @@ #include #include -#include #include -#include -#include #include -#include #include -#include #include #include @@ -57,12 +52,11 @@ class hyperloglog { using ref_type = hyperloglog_ref; ///< Non-owning reference ///< type - using value_type = typename ref_type<>::value_type; ///< Type of items to count - using storage_type = typename ref_type<>::storage_type; ///< Storage type - using hash_type = typename ref_type<>::hash_type; ///< Hash 
function type + using value_type = typename ref_type<>::value_type; ///< Type of items to count + using hash_type = typename ref_type<>::hash_type; ///< Hash function type using allocator_type = - typename std::allocator_traits::template rebind_alloc; ///< Allocator - ///< type + typename std::allocator_traits::template rebind_alloc; ///< Allocator + ///< type /** * @brief Constructs a `hyperloglog` host object. @@ -74,12 +68,12 @@ class hyperloglog { * @param stream CUDA stream used to initialize the object */ constexpr hyperloglog(Hash const& hash, Allocator const& alloc, cuco::cuda_stream_ref stream) - : hash_{hash}, - allocator_{alloc}, - deleter_{1ull, allocator_}, - storage_{allocator_.allocate(1ull), deleter_} + : allocator_{alloc}, + deleter_{this->sketch_bytes(), this->allocator_}, + sketch_{this->allocator_.allocate(this->sketch_bytes()), this->deleter_}, + ref_{cuda::std::span{this->sketch_.get(), this->sketch_bytes()}, hash} { - this->clear_async(stream); + this->ref_.clear_async(stream); } ~hyperloglog() = default; @@ -100,11 +94,7 @@ class hyperloglog { * * @param stream CUDA stream this operation is executed in */ - void clear_async(cuco::cuda_stream_ref stream) noexcept - { - auto constexpr block_size = 1024; - cuco::hyperloglog_ns::detail::clear<<<1, block_size, 0, stream>>>(this->ref()); - } + void clear_async(cuco::cuda_stream_ref stream) noexcept { this->ref_.clear_async(stream); } /** * @brief Resets the estimator, i.e., clears the current count estimate. @@ -114,11 +104,7 @@ class hyperloglog { * * @param stream CUDA stream this operation is executed in */ - void clear(cuco::cuda_stream_ref stream) - { - this->clear_async(stream); - stream.synchronize(); - } + void clear(cuco::cuda_stream_ref stream) { this->ref_.clear(stream); } /** * @brief Asynchronously adds to be counted items to the estimator. 
@@ -134,34 +120,7 @@ class hyperloglog { template void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream) { - auto const num_items = cuco::detail::distance(first, last); - if (num_items == 0) { return; } - - int grid_size = 0; - int block_size = 0; - int const shmem_bytes = sizeof(storage_type); - - // We make use of the occupancy calculator here to get the minimum number of blocks which still - // saturate the GPU. This reduces the atomic contention on the final register array during the - // merge phase. - CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize( - &grid_size, - &block_size, - &cuco::hyperloglog_ns::detail::add_shmem>, - shmem_bytes)); - - if (grid_size != 0) { // use shmem codepath - cuco::hyperloglog_ns::detail::add_shmem<<>>( - first, num_items, this->ref()); - } else { // use gmem codepath since there is not enough shmem available - block_size = 0; - CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize( - &grid_size, &block_size, &cuco::hyperloglog_ns::detail::add_gmem>)); - CUCO_EXPECTS(grid_size != 0, "Invalid kernel launch configuration"); - - cuco::hyperloglog_ns::detail::add_gmem<<>>( - first, num_items, this->ref()); - } + this->ref_.add_async(first, last, stream); } /** @@ -181,8 +140,7 @@ class hyperloglog { template void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream) { - this->add_async(first, last, stream); - stream.synchronize(); + this->ref_.add(first, last, stream); } /** @@ -198,7 +156,7 @@ class hyperloglog { void merge_async(hyperloglog const& other, cuco::cuda_stream_ref stream) noexcept { - this->merge_async(other.ref(), stream); + this->ref_.merge_async(other.ref(), stream); } /** @@ -217,8 +175,7 @@ class hyperloglog { void merge(hyperloglog const& other, cuco::cuda_stream_ref stream) { - this->merge_async(other, stream); - stream.synchronize(); + this->ref_.merge(other.ref(), stream); } /** @@ -232,8 +189,7 @@ class hyperloglog { template void merge_async(ref_type const& other, cuco::cuda_stream_ref 
stream) noexcept { - auto constexpr block_size = 1024; - cuco::hyperloglog_ns::detail::merge<<<1, block_size, 0, stream>>>(other, this->ref()); + this->ref_.merge_async(other, stream); } /** @@ -250,8 +206,7 @@ class hyperloglog { template void merge(ref_type const& other, cuco::cuda_stream_ref stream) { - this->merge_async(other, stream); - stream.synchronize(); + this->ref_.merge(other, stream); } /** @@ -265,43 +220,7 @@ class hyperloglog { */ [[nodiscard]] std::size_t estimate(cuco::cuda_stream_ref stream) const { - // TODO remove test code - // std::size_t* result; - // cudaMallocHost(&result, sizeof(std::size_t)); - - // int grid_size = 0; - // int block_size = 0; - // // TODO check cuda error? - // cudaOccupancyMaxPotentialBlockSize( - // &grid_size, &block_size, &cuco::hyperloglog_ns::detail::estimate>); - - // cuco::hyperloglog_ns::detail::estimate<<>>( - // result, this->ref()); - // stream.synchronize(); - - // return *result; - - // TODO this function currently copies the registers to the host and then finalizes the result; - // move computation to device? Edit: host computation is faster -.- - storage_type registers; - // TODO check if storage is host accessible - CUCO_CUDA_TRY(cudaMemcpyAsync( - &registers, this->storage_.get(), sizeof(storage_type), cudaMemcpyDeviceToHost, stream)); - stream.synchronize(); - - using fp_type = typename ref_type<>::fp_type; - fp_type sum = 0; - int zeroes = 0; - - // geometric mean computation + count registers with 0s - for (int i = 0; i < registers.size(); ++i) { - auto const reg = registers[i]; - sum += fp_type{1} / static_cast(1ull << reg); - zeroes += reg == 0; - } - - // pass intermediate result to finalizer for bias correction, etc.
- return cuco::hyperloglog_ns::detail::finalizer::finalize(sum, zeroes); + return this->ref_.estimate(stream); } /** @@ -309,30 +228,48 @@ class hyperloglog { * * @return Device ref object of the current `distinct_count_estimator` host object */ - [[nodiscard]] ref_type<> ref() const noexcept - { - return ref_type<>{*(this->storage_.get()), this->hash_}; - } + [[nodiscard]] ref_type<> ref() const noexcept { return this->ref_; } /** - * @brief Get storage ref. + * @brief Get hash function. * - * @return Reference to storage + * @return The hash function */ - [[nodiscard]] storage_type& storage_ref() const noexcept { return *(this->storage_.get()); } + [[nodiscard]] auto hash() const noexcept { return this->ref_.hash(); } /** - * @brief Get hash function. + * @brief Gets the span of the sketch. * - * @return The hash function + * @return The cuda::std::span of the sketch + */ + [[nodiscard]] auto sketch() const noexcept { return this->ref_.sketch(); } + + /** + * @brief Gets the number of bytes required for the sketch storage. + * + * @return The number of bytes required for the sketch */ - [[nodiscard]] auto hash() const noexcept { return this->hash_; } + [[nodiscard]] constexpr std::size_t sketch_bytes() const noexcept + { + return ref_type<>::sketch_bytes(); + } + + /** + * @brief Gets the alignment required for the sketch storage. 
+ * + * @return The required alignment + */ + [[nodiscard]] static constexpr std::size_t sketch_alignment() noexcept + { + return ref_type<>::sketch_alignment(); + } private: - hash_type hash_; ///< Hash function used to hash items allocator_type allocator_; ///< Storage allocator custom_deleter deleter_; ///< Storage deleter - std::unique_ptr> storage_; ///< Storage + std::unique_ptr> + sketch_; ///< Sketch storage + ref_type<> ref_; //< Ref type // Needs to be friends with other instantiations of this class template to have access to their // storage diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index e2d57c65d..46e61966a 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -15,14 +15,20 @@ */ #pragma once +#include #include +#include #include -#include +#include +#include #include #include #include +#include + #include +#include #include #include @@ -31,6 +37,7 @@ #include namespace cuco::detail { + /** * @brief A GPU-accelerated utility for approximating the number of distinct items in a multiset. 
* @@ -47,14 +54,15 @@ namespace cuco::detail { */ template class hyperloglog_ref { + using register_type = int; ///< Register array storage + // We use `int` here since this is the smallest type that supports native `atomicMax` on GPUs + using fp_type = float; ///< Floating point type used for reduction public: - using fp_type = float; ///< Floating point type used for reduction static constexpr auto thread_scope = Scope; ///< CUDA thread scope static constexpr auto precision = Precision; ///< Precision - using value_type = T; ///< Type of items to count - using storage_type = hyperloglog_dense_registers; ///< Storage type - using hash_type = Hash; ///< Hash function type + using value_type = T; ///< Type of items to count + using hash_type = Hash; ///< Hash function type template using with_scope = hyperloglog_ref; ///< Ref type with different @@ -63,12 +71,17 @@ class hyperloglog_ref { /** * @brief Constructs a non-owning `hyperloglog_ref` object. * - * @param storage Reference to storage object of type `storage_type` + * @param sketch_span Reference to sketch storage * @param hash The hash function used to hash items */ - __host__ __device__ constexpr hyperloglog_ref(storage_type& storage, Hash const& hash) noexcept - : hash_{hash}, storage_{storage} + template + __host__ __device__ constexpr hyperloglog_ref(cuda::std::span sketch_span, + Hash const& hash) noexcept + : hash_{hash}, + sketch_{reinterpret_cast(sketch_span.data()), + this->sketch_bytes() / sizeof(register_type)} { + // TODO check size and alignment } /** @@ -81,7 +94,34 @@ class hyperloglog_ref { template __device__ void clear(CG const& group) noexcept { - this->storage_.clear(group); + for (int i = group.thread_rank(); i < this->sketch_.size(); i += group.size()) { + this->sketch_[i] = 0; + } + } + + /** + * @brief Resets the estimator, i.e., clears the current count estimate. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `clear_async`. 
+ * + * @param stream CUDA stream this operation is executed in + */ + __host__ void clear(cuco::cuda_stream_ref stream) + { + this->clear_async(stream); + stream.synchronize(); + } + + /** + * @brief Asynchronously resets the estimator, i.e., clears the current count estimate. + * + * @param stream CUDA stream this operation is executed in + */ + __host__ void clear_async(cuco::cuda_stream_ref stream) noexcept + { + auto constexpr block_size = 1024; + cuco::hyperloglog_ns::detail::clear<<<1, block_size, 0, stream>>>(*this); } /** @@ -97,7 +137,83 @@ class hyperloglog_ref { auto const reg = h & register_mask; auto const zeroes = cuda::std::countl_zero(h | register_mask) + 1; // __clz - this->storage_.update_max(reg, zeroes); + this->update_max(reg, zeroes); + } + + /** + * @brief Asynchronously adds to be counted items to the estimator. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * T> is `true` + * + * @param first Beginning of the sequence of items + * @param last End of the sequence of items + * @param stream CUDA stream this operation is executed in + */ + template + __host__ void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream) + { + auto const num_items = cuco::detail::distance(first, last); + if (num_items == 0) { return; } + + int grid_size = 0; + int block_size = 0; + int const shmem_bytes = sketch_bytes(); + + // TODO specialize for is_continuous_iterator -> use memcpy_async + + // try expanding shmem partition beyond 48KB if necessary + bool const fits_shmem = + cudaSuccess == + cudaFuncSetAttribute(cuco::hyperloglog_ns::detail::add_shmem, + cudaFuncAttributeMaxDynamicSharedMemorySize, + shmem_bytes); + + // We make use of the occupancy calculator to get the minimum number of blocks which still + // saturates the GPU. This reduces the shmem initialization overhead and atomic contention on + // the final register array during the merge phase. 
+ if (fits_shmem) { // use shmem codepath + CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize( + &grid_size, + &block_size, + &cuco::hyperloglog_ns::detail::add_shmem, + shmem_bytes)); + + cuco::hyperloglog_ns::detail::add_shmem<<>>( + first, num_items, *this); + } else { // use gmem codepath since there is not enough shmem available + block_size = 0; + CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize( + &grid_size, + &block_size, + &cuco::hyperloglog_ns::detail::add_gmem)); + CUCO_EXPECTS(grid_size != 0, "Invalid kernel launch configuration"); + + cuco::hyperloglog_ns::detail::add_gmem<<>>( + first, num_items, *this); + } + } + + /** + * @brief Adds to be counted items to the estimator. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `add_async`. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * T> is `true` + * + * @param first Beginning of the sequence of items + * @param last End of the sequence of items + * @param stream CUDA stream this operation is executed in + */ + template + __host__ void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream) + { + this->add_async(first, last, stream); + stream.synchronize(); } /** @@ -113,7 +229,44 @@ class hyperloglog_ref { __device__ void merge(CG const& group, hyperloglog_ref const& other) noexcept { - this->storage_.merge(group, other.storage_); + for (int i = group.thread_rank(); i < this->sketch_.size(); i += group.size()) { + this->update_max(i, other.sketch_[i]); + } + } + + /** + * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator. 
+ * + * @tparam OtherScope Thread scope of `other` estimator + * + * @param other Other estimator reference to be merged into `*this` + * @param stream CUDA stream this operation is executed in + */ + template + __host__ void merge_async(hyperloglog_ref const& other, + cuco::cuda_stream_ref stream) noexcept + { + auto constexpr block_size = 1024; + cuco::hyperloglog_ns::detail::merge<<<1, block_size, 0, stream>>>(other, *this); + } + + /** + * @brief Merges the result of `other` estimator reference into `*this` estimator. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `merge_async`. + * + * @tparam OtherScope Thread scope of `other` estimator + * + * @param other Other estimator reference to be merged into `*this` + * @param stream CUDA stream this operation is executed in + */ + template + __host__ void merge(hyperloglog_ref const& other, + cuco::cuda_stream_ref stream) + { + this->merge_async(other, stream); + stream.synchronize(); } /** @@ -138,8 +291,8 @@ class hyperloglog_ref { fp_type thread_sum = 0; int thread_zeroes = 0; - for (int i = group.thread_rank(); i < this->storage_.size(); i += group.size()) { - auto const reg = this->storage_[i]; + for (int i = group.thread_rank(); i < this->sketch_.size(); i += group.size()) { + auto const reg = this->sketch_[i]; thread_sum += fp_type{1} / static_cast(1 << reg); thread_zeroes += reg == 0; } @@ -174,9 +327,102 @@ class hyperloglog_ref { return estimate; } + /** + * @brief Compute the estimated distinct items count. + * + * @note This function synchronizes the given stream. 
+ * + * @param stream CUDA stream this operation is executed in + * + * @return Approximate distinct items count + */ + [[nodiscard]] __host__ std::size_t estimate(cuco::cuda_stream_ref stream) const + { + auto const num_regs = 1ull << Precision; + thrust::host_vector host_sketch(num_regs); + + // TODO check if storage is host accessible + CUCO_CUDA_TRY(cudaMemcpyAsync(host_sketch.data(), + this->sketch_.data(), + sizeof(register_type) * num_regs, + cudaMemcpyDeviceToHost, + stream)); + stream.synchronize(); + + fp_type sum = 0; + int zeroes = 0; + + // geometric mean computation + count registers with 0s + for (auto const reg : host_sketch) { + sum += fp_type{1} / static_cast(1ull << reg); + zeroes += reg == 0; + } + + // pass intermediate result to finalizer for bias correction, etc. + return cuco::hyperloglog_ns::detail::finalizer::finalize(sum, zeroes); + } + + /** + * @brief Gets the hash function. + * + * @return The hash function + */ + [[nodiscard]] __host__ __device__ auto hash() const noexcept { return this->hash_; } + + /** + * @brief Gets the span of the sketch. + * + * @return The cuda::std::span of the sketch + */ + [[nodiscard]] __host__ __device__ auto sketch() const noexcept { return this->sketch_; } + + /** + * @brief Gets the number of bytes required for the sketch storage. + * + * @return The number of bytes required for the sketch + */ + [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes() noexcept + { + return (1ull << Precision) * sizeof(register_type); + } + + /** + * @brief Gets the alignment required for the sketch storage. + * + * @return The required alignment + */ + [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_alignment() noexcept + { + return alignof(register_type); + } + private: - hash_type hash_; ///< Hash function used to hash items - storage_type& storage_; ///< Reference to storage object + /** + * @brief Atomically updates the register at position `i` with `max(reg[i], value)`. 
+ * + * @tparam Scope CUDA thread scope + * + * @param i Register index + * @param value New value + */ + __device__ constexpr void update_max(int i, register_type value) noexcept + { + if constexpr (Scope == cuda::thread_scope_thread) { + this->sketch_[i] = max(this->sketch_[i], value); + } else if constexpr (Scope == cuda::thread_scope_block) { + atomicMax_block(&(this->sketch_[i]), value); + } else if constexpr (Scope == cuda::thread_scope_device) { + atomicMax(&(this->sketch_[i]), value); + } else if constexpr (Scope == cuda::thread_scope_system) { + atomicMax_system(&(this->sketch_[i]), value); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } + + hash_type hash_; ///< Hash function used to hash items + cuda::std::span + sketch_; ///< HLL sketch storage template friend class hyperloglog_ref; diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh index 653caac95..8b2ab73e9 100644 --- a/include/cuco/detail/hyperloglog/kernels.cuh +++ b/include/cuco/detail/hyperloglog/kernels.cuh @@ -18,6 +18,8 @@ #include #include +#include + #include #include @@ -35,20 +37,16 @@ CUCO_KERNEL void clear(RefType ref) template CUCO_KERNEL void add_shmem(InputIt first, cuco::detail::index_type n, RefType ref) { - using local_ref_type = typename RefType::with_scope; - using local_storage_type = typename local_ref_type::storage_type; + using local_ref_type = typename RefType::with_scope; - alignas(16) extern __shared__ char shmem[]; - local_storage_type* local_storage = reinterpret_cast(shmem); + // TODO assert alignment + extern __shared__ std::byte local_sketch[]; auto const loop_stride = cuco::detail::grid_stride(); auto idx = cuco::detail::global_thread_id(); auto const block = cooperative_groups::this_thread_block(); - if (block.thread_rank() == 0) { new (local_storage) local_storage_type{}; } - block.sync(); - - local_ref_type local_ref(*local_storage, {}); + local_ref_type 
local_ref(cuda::std::span{local_sketch, ref.sketch_bytes()}, {}); local_ref.clear(block); block.sync(); diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh index 5a9a16c85..38f9cbd16 100644 --- a/include/cuco/distinct_count_estimator.cuh +++ b/include/cuco/distinct_count_estimator.cuh @@ -61,7 +61,6 @@ class distinct_count_estimator { using value_type = typename impl_type::value_type; ///< Type of items to count using allocator_type = typename impl_type::allocator_type; ///< Allocator type - using storage_type = typename impl_type::storage_type; ///< Storage type // TODO enable CTAD /** @@ -212,6 +211,34 @@ class distinct_count_estimator { */ [[nodiscard]] ref_type<> ref() const noexcept; + /** + * @brief Get hash function. + * + * @return The hash function + */ + [[nodiscard]] auto hash() const noexcept; + + /** + * @brief Gets the span of the sketch. + * + * @return The cuda::std::span of the sketch + */ + [[nodiscard]] auto sketch() const noexcept; + + /** + * @brief Gets the number of bytes required for the sketch storage. + * + * @return The number of bytes required for the sketch + */ + [[nodiscard]] static constexpr std::size_t sketch_bytes() noexcept; + + /** + * @brief Gets the alignment required for the sketch storage. + * + * @return The required alignment + */ + [[nodiscard]] static constexpr std::size_t sketch_alignment() noexcept; + private: std::unique_ptr impl_; ///< Implementation object }; diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh index d656d6e17..905a6d379 100644 --- a/include/cuco/distinct_count_estimator_ref.cuh +++ b/include/cuco/distinct_count_estimator_ref.cuh @@ -15,12 +15,15 @@ */ #pragma once +#include #include #include #include #include +#include + namespace cuco { /** * @brief A GPU-accelerated utility for approximating the number of distinct items in a multiset. 
@@ -44,8 +47,7 @@ class distinct_count_estimator_ref { static constexpr auto thread_scope = impl_type::thread_scope; ///< CUDA thread scope static constexpr auto precision = impl_type::precision; ///< Precision - using value_type = typename impl_type::value_type; ///< Type of items to count - using storage_type = typename impl_type::storage_type; ///< Storage type + using value_type = typename impl_type::value_type; ///< Type of items to count template using with_scope = @@ -56,10 +58,11 @@ class distinct_count_estimator_ref { /** * @brief Constructs a non-owning `distinct_count_estimator_ref` object. * - * @param storage Reference to storage object of type `storage_type` + * @param sketch_span Reference to sketch storage * @param hash The hash function used to hash items */ - __host__ __device__ constexpr distinct_count_estimator_ref(storage_type& storage, + template + __host__ __device__ constexpr distinct_count_estimator_ref(cuda::std::span sketch_span, Hash const& hash = {}) noexcept; /** @@ -72,6 +75,23 @@ class distinct_count_estimator_ref { template __device__ void clear(CG const& group) noexcept; + /** + * @brief Asynchronously resets the estimator, i.e., clears the current count estimate. + * + * @param stream CUDA stream this operation is executed in + */ + __host__ void clear_async(cuco::cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Resets the estimator, i.e., clears the current count estimate. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `clear_async`. + * + * @param stream CUDA stream this operation is executed in + */ + __host__ void clear(cuco::cuda_stream_ref stream = {}); + /** * @brief Adds an item to the estimator. * @@ -79,6 +99,37 @@ class distinct_count_estimator_ref { */ __device__ void add(T const& item) noexcept; + /** + * @brief Asynchronously adds to be counted items to the estimator. 
+ * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * T> is `true` + * + * @param first Beginning of the sequence of items + * @param last End of the sequence of items + * @param stream CUDA stream this operation is executed in + */ + template + __host__ void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {}); + + /** + * @brief Adds to be counted items to the estimator. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `add_async`. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * T> is `true` + * + * @param first Beginning of the sequence of items + * @param last End of the sequence of items + * @param stream CUDA stream this operation is executed in + */ + template + __host__ void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {}); + /** * @brief Merges the result of `other` estimator reference into `*this` estimator reference. * @@ -93,6 +144,34 @@ class distinct_count_estimator_ref { CG const& group, distinct_count_estimator_ref const& other) noexcept; + /** + * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator. + * + * @tparam OtherScope Thread scope of `other` estimator + * + * @param other Other estimator reference to be merged into `*this` + * @param stream CUDA stream this operation is executed in + */ + template + __host__ void merge_async( + distinct_count_estimator_ref const& other, + cuco::cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Merges the result of `other` estimator reference into `*this` estimator. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `merge_async`. 
+ * + * @tparam OtherScope Thread scope of `other` estimator + * + * @param other Other estimator reference to be merged into `*this` + * @param stream CUDA stream this operation is executed in + */ + template + __host__ void merge(distinct_count_estimator_ref const& other, + cuco::cuda_stream_ref stream = {}); + /** * @brief Compute the estimated distinct items count. * @@ -103,6 +182,45 @@ class distinct_count_estimator_ref { [[nodiscard]] __device__ std::size_t estimate( cooperative_groups::thread_block const& group) const noexcept; + /** + * @brief Compute the estimated distinct items count. + * + * @note This function synchronizes the given stream. + * + * @param stream CUDA stream this operation is executed in + * + * @return Approximate distinct items count + */ + [[nodiscard]] __host__ std::size_t estimate(cuco::cuda_stream_ref stream = {}) const; + + /** + * @brief Gets the hash function. + * + * @return The hash function + */ + [[nodiscard]] __host__ __device__ auto hash() const noexcept; + + /** + * @brief Gets the span of the sketch. + * + * @return The cuda::std::span of the sketch + */ + [[nodiscard]] __host__ __device__ auto sketch() const noexcept; + + /** + * @brief Gets the number of bytes required for the sketch storage. + * + * @return The number of bytes required for the sketch + */ + [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes() noexcept; + + /** + * @brief Gets the alignment required for the sketch storage. 
+ * + * @return The required alignment + */ + [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_alignment() noexcept; + private: impl_type impl_; ///< Implementation object From 22c083d3d851694dc70035b8cc27e4ce7dd31c8f Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 14 Mar 2024 22:12:37 +0000 Subject: [PATCH 29/78] Remove storage class --- include/cuco/detail/hyperloglog/storage.cuh | 126 -------------------- 1 file changed, 126 deletions(-) delete mode 100644 include/cuco/detail/hyperloglog/storage.cuh diff --git a/include/cuco/detail/hyperloglog/storage.cuh b/include/cuco/detail/hyperloglog/storage.cuh deleted file mode 100644 index 5ca525c69..000000000 --- a/include/cuco/detail/hyperloglog/storage.cuh +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include -#include - -namespace cuco::detail { - -/** - * @brief Storage class for `hyperloglog` and `hyperloglog_ref`. - * - * @note This class implements the dense storage layout from the HyperLogLog++ paper, but uses - * 4bytes per register instead of only 6bits. This is required since we need to update registers - * atomically. 
- * - * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy - */ -template -class hyperloglog_dense_registers { - public: - // We use `int` here since this is the smallest type that supports native `atomicMax` on GPUs - using register_type = int; ///< Register array storage - /** - * @brief Clears the storage. - * - * @tparam CG CUDA Cooperative Group type - * - * @param group CUDA Cooperative group this operation is executed in - */ - template - __device__ void constexpr clear(CG const& group) noexcept - { - for (int i = group.thread_rank(); i < this->registers_.size(); i += group.size()) { - this->registers_[i] = 0; - } - } - - /** - * @brief Returns a reference to the element at specified location `i`. No bounds checking is - * performed. - * - * @param i Position of the element to return - * - * @return Reference to the requested element - */ - __host__ __device__ constexpr int& operator[](int i) noexcept { return this->registers_[i]; } - - /** - * @brief Returns the element at specified location `i`. No bounds checking is performed. - * - * @param i Position of the element to return - * - * @return Requested element - */ - __host__ __device__ constexpr int operator[](int i) const noexcept { return this->registers_[i]; } - - /** - * @brief Returns the number of elements in the container. - * - * @return The number of elements in the container - */ - __host__ __device__ constexpr int size() const noexcept { return this->registers_.size(); } - - /** - * @brief Atomically updates the register at position `i` with `max(reg[i], value)`. 
- * - * @tparam Scope CUDA thread scope - * - * @param i Register index - * @param value New value - */ - template - __device__ constexpr void update_max(int i, register_type value) noexcept - { - if constexpr (Scope == cuda::thread_scope_thread) { - this->registers_[i] = max(this->registers_[i], value); - } else if constexpr (Scope == cuda::thread_scope_block) { - atomicMax_block(&(this->registers_[i]), value); - } else if constexpr (Scope == cuda::thread_scope_device) { - atomicMax(&(this->registers_[i]), value); - } else if constexpr (Scope == cuda::thread_scope_system) { - atomicMax_system(&(this->registers_[i]), value); - } else { - static_assert(cuco::dependent_false, "Unsupported thread scope"); - } - } - - /** - * @brief Combines the contents of `other` storage into `*this` storage. - * - * @tparam Scope CUDA thread scope - * @tparam CG CUDA Cooperative Group type - * - * @param group CUDA Cooperative group this operation is executed in - * @param other Other storage - */ - template - __device__ void constexpr merge(CG const& group, - hyperloglog_dense_registers const& other) noexcept - { - for (int i = group.thread_rank(); i < this->registers_.size(); i += group.size()) { - this->update_max(i, other.registers_[i]); - } - } - - private: - cuda::std::array registers_; ///< Register array storage -}; -} // namespace cuco::detail From 56cdc6bce550b7a8d14acd017f300689b2e4c96d Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 15 Mar 2024 00:09:38 +0000 Subject: [PATCH 30/78] Add vectorized add kernel --- .../detail/hyperloglog/hyperloglog_ref.cuh | 110 +++++++++++++----- include/cuco/detail/hyperloglog/kernels.cuh | 37 ++++++ 2 files changed, 115 insertions(+), 32 deletions(-) diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index 46e61966a..d658e748e 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ 
b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -160,38 +161,63 @@ class hyperloglog_ref { int grid_size = 0; int block_size = 0; int const shmem_bytes = sketch_bytes(); + void const* kernel = nullptr; + + // In case the input iterator represents a contiguous memory segment we can employ efficient + // vectorized loads + if constexpr (thrust::is_contiguous_iterator_v) { + auto const ptr = thrust::raw_pointer_cast(&first[0]); + auto const alignment = + 1 << cuda::std::countr_zero(reinterpret_cast(ptr) | 16); + auto const vector_size = alignment / sizeof(value_type); + + switch (vector_size) { + case 2: + kernel = reinterpret_cast( + cuco::hyperloglog_ns::detail::add_shmem_vectorized<2, hyperloglog_ref>); + break; + case 4: + kernel = reinterpret_cast( + cuco::hyperloglog_ns::detail::add_shmem_vectorized<4, hyperloglog_ref>); + break; + case 8: + kernel = reinterpret_cast( + cuco::hyperloglog_ns::detail::add_shmem_vectorized<8, hyperloglog_ref>); + break; + }; + } - // TODO specialize for is_continuous_iterator -> use memcpy_async - - // try expanding shmem partition beyond 48KB if necessary - bool const fits_shmem = - cudaSuccess == - cudaFuncSetAttribute(cuco::hyperloglog_ns::detail::add_shmem, - cudaFuncAttributeMaxDynamicSharedMemorySize, - shmem_bytes); - - // We make use of the occupancy calculator to get the minimum number of blocks which still - // saturates the GPU. This reduces the shmem initialization overhead and atomic contention on - // the final register array during the merge phase. 
- if (fits_shmem) { // use shmem codepath - CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize( - &grid_size, - &block_size, - &cuco::hyperloglog_ns::detail::add_shmem, - shmem_bytes)); - - cuco::hyperloglog_ns::detail::add_shmem<<>>( - first, num_items, *this); - } else { // use gmem codepath since there is not enough shmem available - block_size = 0; - CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize( - &grid_size, - &block_size, - &cuco::hyperloglog_ns::detail::add_gmem)); - CUCO_EXPECTS(grid_size != 0, "Invalid kernel launch configuration"); - - cuco::hyperloglog_ns::detail::add_gmem<<>>( - first, num_items, *this); + if (kernel != nullptr and this->try_reserve_shmem(kernel, shmem_bytes)) { + // We make use of the occupancy calculator to get the minimum number of blocks which still + // saturates the GPU. This reduces the shmem initialization overhead and atomic contention on + // the final register array during the merge phase. + CUCO_CUDA_TRY( + cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, shmem_bytes)); + + auto const ptr = thrust::raw_pointer_cast(&first[0]); + void* kernel_args[] = { + (void*)(&ptr), // TODO can't use reinterpret_cast since it can't cast away const + (void*)(&num_items), + reinterpret_cast(this)}; + CUCO_CUDA_TRY( + cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream)); + } else { + kernel = (void const*)cuco::hyperloglog_ns::detail::add_shmem; + void* kernel_args[] = {(void*)(&first), (void*)(&num_items), reinterpret_cast(this)}; + if (this->try_reserve_shmem(kernel, shmem_bytes)) { + CUCO_CUDA_TRY( + cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, shmem_bytes)); + + CUCO_CUDA_TRY( + cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream)); + } else { + // Computes sketch directly in global memory. 
(Fallback path in case there is not enough + // shared memory avalable) + kernel = (void const*)cuco::hyperloglog_ns::detail::add_gmem; + + CUCO_CUDA_TRY( + cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream)); + } } } @@ -235,7 +261,8 @@ class hyperloglog_ref { } /** - * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator. + * @brief Asynchronously merges the result of `other` estimator reference into `*this` + * estimator. * * @tparam OtherScope Thread scope of `other` estimator * @@ -420,6 +447,25 @@ class hyperloglog_ref { } } + /** + * @brief Try expanding the shmem partition for a given kernel beyond 48KB is necessary. + * + * @tparam Kernel Type of kernel function + * + * @param kernel The kernel function + * @param shmem_bytes Number of requested dynamic shared memory bytes + * + * @returns True iff kernel configuration is succesful + */ + template + [[nodiscard]] __host__ constexpr bool try_reserve_shmem(Kernel kernel, + int shmem_bytes) const noexcept + { + return cudaSuccess == cudaFuncSetAttribute(reinterpret_cast(kernel), + cudaFuncAttributeMaxDynamicSharedMemorySize, + shmem_bytes); + } + hash_type hash_; ///< Hash function used to hash items cuda::std::span sketch_; ///< HLL sketch storage diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh index 8b2ab73e9..4da78d020 100644 --- a/include/cuco/detail/hyperloglog/kernels.cuh +++ b/include/cuco/detail/hyperloglog/kernels.cuh @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -34,6 +35,42 @@ CUCO_KERNEL void clear(RefType ref) if (block.group_index().x == 0) { ref.clear(block); } } +template +CUCO_KERNEL void add_shmem_vectorized(typename RefType::value_type const* first, + cuco::detail::index_type n, + RefType ref) +{ + using value_type = typename RefType::value_type; + using vector_type = cuda::std::array; + using local_ref_type = typename RefType::with_scope; + + 
// TODO assert alignment + extern __shared__ std::byte local_sketch[]; + + auto const loop_stride = cuco::detail::grid_stride(); + auto idx = cuco::detail::global_thread_id(); + auto const block = cooperative_groups::this_thread_block(); + + local_ref_type local_ref(cuda::std::span{local_sketch, ref.sketch_bytes()}, {}); + local_ref.clear(block); + block.sync(); + + vector_type vec; + while (idx < n / VectorSize) { + vec = *reinterpret_cast( + __builtin_assume_aligned(first + idx * VectorSize, sizeof(vector_type))); + for (auto const& i : vec) { + local_ref.add(i); + } + idx += loop_stride; + } + auto const remainder = n % VectorSize; + if (idx >= n / VectorSize and idx < n / VectorSize + remainder) { local_ref.add(*(first + idx)); } + block.sync(); + + ref.merge(block, local_ref); +} + template CUCO_KERNEL void add_shmem(InputIt first, cuco::detail::index_type n, RefType ref) { From 9b4b61294a3ecc904854f739bb8b05c07724e719 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 15 Mar 2024 12:55:15 +0000 Subject: [PATCH 31/78] Add missing kernel config --- include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index d658e748e..35ffce286 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -202,7 +202,8 @@ class hyperloglog_ref { CUCO_CUDA_TRY( cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream)); } else { - kernel = (void const*)cuco::hyperloglog_ns::detail::add_shmem; + kernel = reinterpret_cast( + cuco::hyperloglog_ns::detail::add_shmem); void* kernel_args[] = {(void*)(&first), (void*)(&num_items), reinterpret_cast(this)}; if (this->try_reserve_shmem(kernel, shmem_bytes)) { CUCO_CUDA_TRY( @@ -213,10 +214,12 @@ class hyperloglog_ref { 
} else { // Computes sketch directly in global memory. (Fallback path in case there is not enough // shared memory avalable) - kernel = (void const*)cuco::hyperloglog_ns::detail::add_gmem; + kernel = reinterpret_cast( + cuco::hyperloglog_ns::detail::add_gmem); - CUCO_CUDA_TRY( - cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream)); + CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, 0)); + + CUCO_CUDA_TRY(cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, 0, stream)); } } } @@ -448,7 +451,7 @@ class hyperloglog_ref { } /** - * @brief Try expanding the shmem partition for a given kernel beyond 48KB is necessary. + * @brief Try expanding the shmem partition for a given kernel beyond 48KB if necessary. * * @tparam Kernel Type of kernel function * From 30bd79ddbd919855261fddb1c2b9765c0e585156 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 15 Mar 2024 13:35:30 +0000 Subject: [PATCH 32/78] Make tuning arrs accessible in non-constexpr context --- include/cuco/detail/hyperloglog/finalizer.cuh | 22 +- include/cuco/detail/hyperloglog/tuning.cuh | 226 +++++++----------- 2 files changed, 97 insertions(+), 151 deletions(-) diff --git a/include/cuco/detail/hyperloglog/finalizer.cuh b/include/cuco/detail/hyperloglog/finalizer.cuh index 8b221f6ba..60a7ffcab 100644 --- a/include/cuco/detail/hyperloglog/finalizer.cuh +++ b/include/cuco/detail/hyperloglog/finalizer.cuh @@ -60,7 +60,7 @@ class finalizer { if (e <= 2.5 * m) { return cuda::std::round(h); } if constexpr (Precision < 19) { - e = (h <= threshold()) ? h : bias_corrected_estimate(e); + e = (h <= threshold(Precision)) ? h : bias_corrected_estimate(e); } } else { // HLL++ is defined only when p < 19, otherwise we need to fallback to HLL. 
@@ -95,7 +95,7 @@ class finalizer { __host__ __device__ static double constexpr bias(double e) noexcept { auto const anchor_index = interpolation_anchor_index(e); - int const n = raw_estimate_data().size(); + int const n = raw_estimate_data_size(Precision); auto low = cuda::std::max(anchor_index - k + 1, 0); auto high = cuda::std::min(low + k, n); @@ -106,8 +106,8 @@ class finalizer { high += 1; } - auto const& biases = bias_data(); - double bias_sum = 0.0; + auto biases = bias_data(Precision); + double bias_sum = 0.0; for (int i = low; i < high; ++i) { bias_sum += biases[i]; } @@ -117,15 +117,16 @@ class finalizer { __host__ __device__ static double distance(double e, int i) noexcept { - auto const diff = e - raw_estimate_data()[i]; + auto const diff = e - raw_estimate_data(Precision)[i]; return diff * diff; } __host__ __device__ static int interpolation_anchor_index(double e) noexcept { - auto const& estimates = raw_estimate_data(); - int left = 0; - int right = static_cast(estimates.size()) - 1; + auto estimates = raw_estimate_data(Precision); + int const n = raw_estimate_data_size(Precision); + int left = 0; + int right = static_cast(n) - 1; int mid; int candidate_index = 0; // Index of the closest element found @@ -146,9 +147,8 @@ class finalizer { // 'left - 1' to find the closest one, taking care of boundary conditions. // Distance from 'e' to the element at 'left', if within bounds - double const dist_lhs = left < static_cast(estimates.size()) - ? cuda::std::abs(estimates[left] - e) - : cuda::std::numeric_limits::max(); + double const dist_lhs = left < static_cast(n) ? cuda::std::abs(estimates[left] - e) + : cuda::std::numeric_limits::max(); // Distance from 'e' to the element at 'left - 1', if within bounds double const dist_rhs = left - 1 >= 0 ? 
cuda::std::abs(estimates[left - 1] - e) : cuda::std::numeric_limits::max(); diff --git a/include/cuco/detail/hyperloglog/tuning.cuh b/include/cuco/detail/hyperloglog/tuning.cuh index 05cacb067..a816ffa89 100644 --- a/include/cuco/detail/hyperloglog/tuning.cuh +++ b/include/cuco/detail/hyperloglog/tuning.cuh @@ -26,53 +26,11 @@ namespace cuco::hyperloglog_ns::detail { #endif // clang-format off -template -__host__ __device__ constexpr auto threshold() noexcept; +CUCO_HLL_TUNING_ARR_DECL threshold_data{10.0, 20.0, 40.0, 80.0, 220.0, 400.0, 900.0, 1800.0, 3100.0, 6500.0, 15500.0, 20000.0, 50000.0, 120000.0, 350000.0}; -template <> -__host__ __device__ constexpr auto threshold<4>() noexcept { return 10.0; }; - -template <> -__host__ __device__ constexpr auto threshold<5>() noexcept { return 20.0; }; - -template <> -__host__ __device__ constexpr auto threshold<6>() noexcept { return 40.0; }; - -template <> -__host__ __device__ constexpr auto threshold<7>() noexcept { return 80.0; }; - -template <> -__host__ __device__ constexpr auto threshold<8>() noexcept { return 220.0; }; - -template <> -__host__ __device__ constexpr auto threshold<9>() noexcept { return 400.0; }; - -template <> -__host__ __device__ constexpr auto threshold<10>() noexcept { return 900.0; }; - -template <> -__host__ __device__ constexpr auto threshold<11>() noexcept { return 1800.0; }; - -template <> -__host__ __device__ constexpr auto threshold<12>() noexcept { return 3100.0; }; - -template <> -__host__ __device__ constexpr auto threshold<13>() noexcept { return 6500.0; }; - -template <> -__host__ __device__ constexpr auto threshold<14>() noexcept { return 15500.0; }; - -template <> -__host__ __device__ constexpr auto threshold<15>() noexcept { return 20000.0; }; - -template <> -__host__ __device__ constexpr auto threshold<16>() noexcept { return 50000.0; }; - -template <> -__host__ __device__ constexpr auto threshold<17>() noexcept { return 120000.0; }; - -template <> -__host__ __device__ constexpr 
auto threshold<18>() noexcept { return 350000.0; }; +__host__ __device__ constexpr auto threshold(int32_t precision) noexcept { + return threshold_data[precision - 4]; +} // HLL++ uses an interpolation method over the raw estimated cardinality to select the optimal bias. // Parameters/interpolation points taken from @@ -94,53 +52,47 @@ CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p17{94542.0, 96125.811, 97728.019, 99 CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p18{189084.0, 192250.913, 195456.774, 198696.946, 201977.762, 205294.444, 208651.754, 212042.099, 215472.269, 218941.91, 222443.912, 225996.845, 229568.199, 233193.568, 236844.457, 240543.233, 244279.475, 248044.27, 251854.588, 255693.2, 259583.619, 263494.621, 267445.385, 271454.061, 275468.769, 279549.456, 283646.446, 287788.198, 291966.099, 296181.164, 300431.469, 304718.618, 309024.004, 313393.508, 317760.803, 322209.731, 326675.061, 331160.627, 335654.47, 340241.442, 344841.833, 349467.132, 354130.629, 358819.432, 363574.626, 368296.587, 373118.482, 377914.93, 382782.301, 387680.669, 392601.981, 397544.323, 402529.115, 407546.018, 412593.658, 417638.657, 422762.865, 427886.169, 433017.167, 438213.273, 443441.254, 448692.421, 453937.533, 459239.049, 464529.569, 469910.083, 475274.03, 480684.473, 486070.26, 491515.237, 496995.651, 502476.617, 507973.609, 513497.19, 519083.233, 524726.509, 530305.505, 535945.728, 541584.404, 547274.055, 552967.236, 558667.862, 564360.216, 570128.148, 575965.08, 581701.952, 587532.523, 593361.144, 599246.128, 605033.418, 610958.779, 616837.117, 622772.818, 628672.04, 634675.369, 640574.831, 646585.739, 652574.547, 658611.217, 664642.684, 670713.914, 676737.681, 682797.313, 688837.897, 694917.874, 701009.882, 707173.648, 713257.254, 719415.392, 725636.761, 731710.697, 737906.209, 744103.074, 750313.39, 756504.185, 762712.579, 768876.985, 775167.859, 781359.0, 787615.959, 793863.597, 800245.477, 806464.582, 812785.294, 819005.925, 825403.057, 831676.197, 837936.284, 844266.968, 
850642.711, 856959.756, 863322.774, 869699.931, 876102.478, 882355.787, 888694.463, 895159.952, 901536.143, 907872.631, 914293.672, 920615.14, 927130.974, 933409.404, 939922.178, 946331.47, 952745.93, 959209.264, 965590.224, 972077.284, 978501.961, 984953.19, 991413.271, 997817.479, 1004222.658, 1010725.676, 1017177.138, 1023612.529, 1030098.236, 1036493.719, 1043112.207, 1049537.036, 1056008.096, 1062476.184, 1068942.337, 1075524.95, 1081932.864, 1088426.025, 1094776.005, 1101327.448, 1107901.673, 1114423.639, 1120884.602, 1127324.923, 1133794.24, 1140328.886, 1146849.376, 1153346.682, 1159836.502, 1166478.703, 1172953.304, 1179391.502, 1185950.982, 1192544.052, 1198913.41, 1205430.994, 1212015.525, 1218674.042, 1225121.683, 1231551.101, 1238126.379, 1244673.795, 1251260.649, 1257697.86, 1264320.983, 1270736.319, 1277274.694, 1283804.95, 1290211.514, 1296858.568, 1303455.691}; // helpers for selecting the corresponding arrays for a given precision -template -__host__ __device__ auto const& raw_estimate_data() noexcept; - -template <> -__host__ __device__ auto const& raw_estimate_data<4>() noexcept { return raw_estimate_data_p4; }; - -template <> -__host__ __device__ auto const& raw_estimate_data<5>() noexcept { return raw_estimate_data_p5; }; - -template <> -__host__ __device__ auto const& raw_estimate_data<6>() noexcept { return raw_estimate_data_p6; }; - -template <> -__host__ __device__ auto const& raw_estimate_data<7>() noexcept { return raw_estimate_data_p7; }; - -template <> -__host__ __device__ auto const& raw_estimate_data<8>() noexcept { return raw_estimate_data_p8; }; - -template <> -__host__ __device__ auto const& raw_estimate_data<9>() noexcept { return raw_estimate_data_p9; }; - -template <> -__host__ __device__ auto const& raw_estimate_data<10>() noexcept { return raw_estimate_data_p10; }; - -template <> -__host__ __device__ auto const& raw_estimate_data<11>() noexcept { return raw_estimate_data_p11; }; - -template <> -__host__ __device__ auto const& 
raw_estimate_data<12>() noexcept { return raw_estimate_data_p12; }; - -template <> -__host__ __device__ auto const& raw_estimate_data<13>() noexcept { return raw_estimate_data_p13; }; - -template <> -__host__ __device__ auto const& raw_estimate_data<14>() noexcept { return raw_estimate_data_p14; }; - -template <> -__host__ __device__ auto const& raw_estimate_data<15>() noexcept { return raw_estimate_data_p15; }; - -template <> -__host__ __device__ auto const& raw_estimate_data<16>() noexcept { return raw_estimate_data_p16; }; - -template <> -__host__ __device__ auto const& raw_estimate_data<17>() noexcept { return raw_estimate_data_p17; }; - -template <> -__host__ __device__ auto const& raw_estimate_data<18>() noexcept { return raw_estimate_data_p18; }; +__host__ __device__ constexpr double const* raw_estimate_data(int32_t precision) noexcept { + switch (precision) { + case 4: return raw_estimate_data_p4.data(); + case 5: return raw_estimate_data_p5.data(); + case 6: return raw_estimate_data_p6.data(); + case 7: return raw_estimate_data_p7.data(); + case 8: return raw_estimate_data_p8.data(); + case 9: return raw_estimate_data_p9.data(); + case 10: return raw_estimate_data_p10.data(); + case 11: return raw_estimate_data_p11.data(); + case 12: return raw_estimate_data_p12.data(); + case 13: return raw_estimate_data_p13.data(); + case 14: return raw_estimate_data_p14.data(); + case 15: return raw_estimate_data_p15.data(); + case 16: return raw_estimate_data_p16.data(); + case 17: return raw_estimate_data_p17.data(); + case 18: return raw_estimate_data_p18.data(); + default: return nullptr; + } +} + +__host__ __device__ constexpr size_t raw_estimate_data_size(int32_t precision) noexcept { + switch (precision) { + case 4: return raw_estimate_data_p4.size(); + case 5: return raw_estimate_data_p5.size(); + case 6: return raw_estimate_data_p6.size(); + case 7: return raw_estimate_data_p7.size(); + case 8: return raw_estimate_data_p8.size(); + case 9: return 
raw_estimate_data_p9.size(); + case 10: return raw_estimate_data_p10.size(); + case 11: return raw_estimate_data_p11.size(); + case 12: return raw_estimate_data_p12.size(); + case 13: return raw_estimate_data_p13.size(); + case 14: return raw_estimate_data_p14.size(); + case 15: return raw_estimate_data_p15.size(); + case 16: return raw_estimate_data_p16.size(); + case 17: return raw_estimate_data_p17.size(); + case 18: return raw_estimate_data_p18.size(); + default: return 0; + } +} CUCO_HLL_TUNING_ARR_DECL bias_data_p4{10.0, 9.717, 9.207, 8.7896, 8.2882, 7.8204, 7.3772, 6.9342, 6.5202, 6.161, 5.7722, 5.4636, 5.0396, 4.6766, 4.3566, 4.0454, 3.7936, 3.4856, 3.2666, 2.9946, 2.766, 2.4692, 2.3638, 2.0764, 1.7864, 1.7602, 1.4814, 1.433, 1.2926, 1.0664, 0.999600000000001, 0.7956, 0.5366, 0.589399999999998, 0.573799999999999, 0.269799999999996, 0.368200000000002, 0.0544000000000011, 0.234200000000001, 0.0108000000000033, -0.203400000000002, -0.0701999999999998, -0.129600000000003, -0.364199999999997, -0.480600000000003, -0.226999999999997, -0.322800000000001, -0.382599999999996, -0.511200000000002, -0.669600000000003, -0.749400000000001, -0.500399999999999, -0.617600000000003, -0.6922, -0.601599999999998, -0.416200000000003, -0.338200000000001, -0.782600000000002, -0.648600000000002, -0.919800000000002, -0.851799999999997, -0.962400000000002, -0.6402, -1.1922, -1.0256, -1.086, -1.21899999999999, -0.819400000000002, -0.940600000000003, -1.1554, -1.2072, -1.1752, -1.16759999999999, -1.14019999999999, -1.3754, -1.29859999999999, -1.607, -1.3292, -1.7606}; CUCO_HLL_TUNING_ARR_DECL bias_data_p5{22.0, 21.1194, 20.8208, 20.2318, 19.77, 19.2436, 18.7774, 18.2848, 17.8224, 17.3742, 16.9336, 16.503, 16.0494, 15.6292, 15.2124, 14.798, 14.367, 13.9728, 13.5944, 13.217, 12.8438, 12.3696, 12.0956, 11.7044, 11.324, 11.0668, 10.6698, 10.3644, 10.049, 9.6918, 9.4146, 9.082, 8.687, 8.5398, 8.2462, 7.857, 7.6606, 7.4168, 7.1248, 6.9222, 6.6804, 6.447, 6.3454, 5.9594, 5.7636, 5.5776, 
5.331, 5.19, 4.9676, 4.7564, 4.5314, 4.4442, 4.3708, 3.9774, 3.9624, 3.8796, 3.755, 3.472, 3.2076, 3.1024, 2.8908, 2.7338, 2.7728, 2.629, 2.413, 2.3266, 2.1524, 2.2642, 2.1806, 2.0566, 1.9192, 1.7598, 1.3516, 1.5802, 1.43859999999999, 1.49160000000001, 1.1524, 1.1892, 0.841399999999993, 0.879800000000003, 0.837599999999995, 0.469800000000006, 0.765600000000006, 0.331000000000003, 0.591399999999993, 0.601200000000006, 0.701599999999999, 0.558199999999999, 0.339399999999998, 0.354399999999998, 0.491200000000006, 0.308000000000007, 0.355199999999996, -0.0254000000000048, 0.205200000000005, -0.272999999999996, 0.132199999999997, 0.394400000000005, -0.241200000000006, 0.242000000000004, 0.191400000000002, 0.253799999999998, -0.122399999999999, -0.370800000000003, 0.193200000000004, -0.0848000000000013, 0.0867999999999967, -0.327200000000005, -0.285600000000002, 0.311400000000006, -0.128399999999999, -0.754999999999995, -0.209199999999996, -0.293599999999998, -0.364000000000004, -0.253600000000006, -0.821200000000005, -0.253600000000006, -0.510400000000004, -0.383399999999995, -0.491799999999998, -0.220200000000006, -0.0972000000000008, -0.557400000000001, -0.114599999999996, -0.295000000000002, -0.534800000000004, 0.346399999999988, -0.65379999999999, 0.0398000000000138, 0.0341999999999985, -0.995800000000003, -0.523400000000009, -0.489000000000004, -0.274799999999999, -0.574999999999989, -0.482799999999997, 0.0571999999999946, -0.330600000000004, -0.628800000000012, -0.140199999999993, -0.540600000000012, -0.445999999999998, -0.599400000000003, -0.262599999999992, 0.163399999999996, -0.100599999999986, -0.39500000000001, -1.06960000000001, -0.836399999999998, -0.753199999999993, -0.412399999999991, -0.790400000000005, -0.29679999999999, -0.28540000000001, -0.193000000000012, -0.0772000000000048, -0.962799999999987, -0.414800000000014}; @@ -158,53 +110,47 @@ CUCO_HLL_TUNING_ARR_DECL bias_data_p16{47270.0, 46423.3584, 45585.7074, 44757.15 CUCO_HLL_TUNING_ARR_DECL 
bias_data_p17{94541.0, 92848.811, 91174.019, 89517.558, 87879.9705, 86262.7565, 84663.5125, 83083.7435, 81521.7865, 79977.272, 78455.9465, 76950.219, 75465.432, 73994.152, 72546.71, 71115.2345, 69705.6765, 68314.937, 66944.2705, 65591.255, 64252.9485, 62938.016, 61636.8225, 60355.592, 59092.789, 57850.568, 56624.518, 55417.343, 54231.1415, 53067.387, 51903.526, 50774.649, 49657.6415, 48561.05, 47475.7575, 46410.159, 45364.852, 44327.053, 43318.4005, 42325.6165, 41348.4595, 40383.6265, 39436.77, 38509.502, 37594.035, 36695.939, 35818.6895, 34955.691, 34115.8095, 33293.949, 32465.0775, 31657.6715, 30877.2585, 30093.78, 29351.3695, 28594.1365, 27872.115, 27168.7465, 26477.076, 25774.541, 25106.5375, 24452.5135, 23815.5125, 23174.0655, 22555.2685, 21960.2065, 21376.3555, 20785.1925, 20211.517, 19657.0725, 19141.6865, 18579.737, 18081.3955, 17578.995, 17073.44, 16608.335, 16119.911, 15651.266, 15194.583, 14749.0495, 14343.4835, 13925.639, 13504.509, 13099.3885, 12691.2855, 12328.018, 11969.0345, 11596.5145, 11245.6355, 10917.6575, 10580.9785, 10277.8605, 9926.58100000001, 9605.538, 9300.42950000003, 8989.97850000003, 8728.73249999998, 8448.3235, 8175.31050000002, 7898.98700000002, 7629.79100000003, 7413.76199999999, 7149.92300000001, 6921.12650000001, 6677.1545, 6443.28000000003, 6278.23450000002, 6014.20049999998, 5791.20299999998, 5605.78450000001, 5438.48800000001, 5234.2255, 5059.6825, 4887.43349999998, 4682.935, 4496.31099999999, 4322.52250000002, 4191.42499999999, 4021.24200000003, 3900.64799999999, 3762.84250000003, 3609.98050000001, 3502.29599999997, 3363.84250000003, 3206.54849999998, 3079.70000000001, 2971.42300000001, 2867.80349999998, 2727.08100000001, 2630.74900000001, 2496.6165, 2440.902, 2356.19150000002, 2235.58199999999, 2120.54149999999, 2012.25449999998, 1933.35600000003, 1820.93099999998, 1761.54800000001, 1663.09350000002, 1578.84600000002, 1509.48149999999, 1427.3345, 1379.56150000001, 1306.68099999998, 1212.63449999999, 1084.17300000001, 
1124.16450000001, 1060.69949999999, 1007.48849999998, 941.194499999983, 879.880500000028, 836.007500000007, 782.802000000025, 748.385499999975, 647.991500000004, 626.730500000005, 570.776000000013, 484.000500000024, 513.98550000001, 418.985499999952, 386.996999999974, 370.026500000036, 355.496999999974, 356.731499999994, 255.92200000002, 259.094000000041, 205.434499999974, 165.374500000034, 197.347500000033, 95.718499999959, 67.6165000000037, 54.6970000000438, 31.7395000000251, -15.8784999999916, 8.42500000004657, -26.3754999999655, -118.425500000012, -66.6629999999423, -42.9745000000112, -107.364999999991, -189.839000000036, -162.611499999999, -164.964999999967, -189.079999999958, -223.931499999948, -235.329999999958, -269.639500000048, -249.087999999989, -206.475499999942, -283.04449999996, -290.667000000016, -304.561499999953, -336.784499999951, -380.386500000022, -283.280499999993, -364.533000000054, -389.059499999974, -364.454000000027, -415.748000000021, -417.155000000028}; CUCO_HLL_TUNING_ARR_DECL bias_data_p18{189083.0, 185696.913, 182348.774, 179035.946, 175762.762, 172526.444, 169329.754, 166166.099, 163043.269, 159958.91, 156907.912, 153906.845, 150924.199, 147996.568, 145093.457, 142239.233, 139421.475, 136632.27, 133889.588, 131174.2, 128511.619, 125868.621, 123265.385, 120721.061, 118181.769, 115709.456, 113252.446, 110840.198, 108465.099, 106126.164, 103823.469, 101556.618, 99308.004, 97124.508, 94937.803, 92833.731, 90745.061, 88677.627, 86617.47, 84650.442, 82697.833, 80769.132, 78879.629, 77014.432, 75215.626, 73384.587, 71652.482, 69895.93, 68209.301, 66553.669, 64921.981, 63310.323, 61742.115, 60205.018, 58698.658, 57190.657, 55760.865, 54331.169, 52908.167, 51550.273, 50225.254, 48922.421, 47614.533, 46362.049, 45098.569, 43926.083, 42736.03, 41593.473, 40425.26, 39316.237, 38243.651, 37170.617, 36114.609, 35084.19, 34117.233, 33206.509, 32231.505, 31318.728, 30403.404, 29540.0550000001, 28679.236, 27825.862, 26965.216, 26179.148, 25462.08, 
24645.952, 23922.523, 23198.144, 22529.128, 21762.4179999999, 21134.779, 20459.117, 19840.818, 19187.04, 18636.3689999999, 17982.831, 17439.7389999999, 16874.547, 16358.2169999999, 15835.684, 15352.914, 14823.681, 14329.313, 13816.897, 13342.874, 12880.882, 12491.648, 12021.254, 11625.392, 11293.7610000001, 10813.697, 10456.209, 10099.074, 9755.39000000001, 9393.18500000006, 9047.57900000003, 8657.98499999999, 8395.85900000005, 8033.0, 7736.95900000003, 7430.59699999995, 7258.47699999996, 6924.58200000005, 6691.29399999999, 6357.92500000005, 6202.05700000003, 5921.19700000004, 5628.28399999999, 5404.96799999999, 5226.71100000001, 4990.75600000005, 4799.77399999998, 4622.93099999998, 4472.478, 4171.78700000001, 3957.46299999999, 3868.95200000005, 3691.14300000004, 3474.63100000005, 3341.67200000002, 3109.14000000001, 3071.97400000005, 2796.40399999998, 2756.17799999996, 2611.46999999997, 2471.93000000005, 2382.26399999997, 2209.22400000005, 2142.28399999999, 2013.96100000001, 1911.18999999994, 1818.27099999995, 1668.47900000005, 1519.65800000005, 1469.67599999998, 1367.13800000004, 1248.52899999998, 1181.23600000003, 1022.71900000004, 1088.20700000005, 959.03600000008, 876.095999999903, 791.183999999892, 703.337000000058, 731.949999999953, 586.86400000006, 526.024999999907, 323.004999999888, 320.448000000091, 340.672999999952, 309.638999999966, 216.601999999955, 102.922999999952, 19.2399999999907, -0.114000000059605, -32.6240000000689, -89.3179999999702, -153.497999999905, -64.2970000000205, -143.695999999996, -259.497999999905, -253.017999999924, -213.948000000091, -397.590000000084, -434.006000000052, -403.475000000093, -297.958000000101, -404.317000000039, -528.898999999976, -506.621000000043, -513.205000000075, -479.351000000024, -596.139999999898, -527.016999999993, -664.681000000099, -680.306000000099, -704.050000000047, -850.486000000034, -757.43200000003, -713.308999999892}; -template -__host__ __device__ auto const& bias_data() noexcept; - -template <> 
-__host__ __device__ auto const& bias_data<4>() noexcept { return bias_data_p4; }; - -template <> -__host__ __device__ auto const& bias_data<5>() noexcept { return bias_data_p5; }; - -template <> -__host__ __device__ auto const& bias_data<6>() noexcept { return bias_data_p6; }; - -template <> -__host__ __device__ auto const& bias_data<7>() noexcept { return bias_data_p7; }; - -template <> -__host__ __device__ auto const& bias_data<8>() noexcept { return bias_data_p8; }; - -template <> -__host__ __device__ auto const& bias_data<9>() noexcept { return bias_data_p9; }; - -template <> -__host__ __device__ auto const& bias_data<10>() noexcept { return bias_data_p10; }; - -template <> -__host__ __device__ auto const& bias_data<11>() noexcept { return bias_data_p11; }; - -template <> -__host__ __device__ auto const& bias_data<12>() noexcept { return bias_data_p12; }; - -template <> -__host__ __device__ auto const& bias_data<13>() noexcept { return bias_data_p13; }; - -template <> -__host__ __device__ auto const& bias_data<14>() noexcept { return bias_data_p14; }; - -template <> -__host__ __device__ auto const& bias_data<15>() noexcept { return bias_data_p15; }; - -template <> -__host__ __device__ auto const& bias_data<16>() noexcept { return bias_data_p16; }; - -template <> -__host__ __device__ auto const& bias_data<17>() noexcept { return bias_data_p17; }; - -template <> -__host__ __device__ auto const& bias_data<18>() noexcept { return bias_data_p18; }; +__host__ __device__ constexpr double const* bias_data(int32_t precision) noexcept { + switch (precision) { + case 4: return bias_data_p4.data(); + case 5: return bias_data_p5.data(); + case 6: return bias_data_p6.data(); + case 7: return bias_data_p7.data(); + case 8: return bias_data_p8.data(); + case 9: return bias_data_p9.data(); + case 10: return bias_data_p10.data(); + case 11: return bias_data_p11.data(); + case 12: return bias_data_p12.data(); + case 13: return bias_data_p13.data(); + case 14: return 
bias_data_p14.data(); + case 15: return bias_data_p15.data(); + case 16: return bias_data_p16.data(); + case 17: return bias_data_p17.data(); + case 18: return bias_data_p18.data(); + default: return nullptr; + } +} + +__host__ __device__ constexpr size_t bias_data_size(int32_t precision) noexcept { + switch (precision) { + case 4: return bias_data_p4.size(); + case 5: return bias_data_p5.size(); + case 6: return bias_data_p6.size(); + case 7: return bias_data_p7.size(); + case 8: return bias_data_p8.size(); + case 9: return bias_data_p9.size(); + case 10: return bias_data_p10.size(); + case 11: return bias_data_p11.size(); + case 12: return bias_data_p12.size(); + case 13: return bias_data_p13.size(); + case 14: return bias_data_p14.size(); + case 15: return bias_data_p15.size(); + case 16: return bias_data_p16.size(); + case 17: return bias_data_p17.size(); + case 18: return bias_data_p18.size(); + default: return 0; + } +} // clang-format on } // namespace cuco::hyperloglog_ns::detail \ No newline at end of file From e93c248439be182e7756280efd77694c3c5e6210 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 15 Mar 2024 13:50:37 +0000 Subject: [PATCH 33/78] Allow wider vector sizes --- include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index 35ffce286..60552bf00 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -166,9 +166,10 @@ class hyperloglog_ref { // In case the input iterator represents a contiguous memory segment we can employ efficient // vectorized loads if constexpr (thrust::is_contiguous_iterator_v) { - auto const ptr = thrust::raw_pointer_cast(&first[0]); + auto const ptr = thrust::raw_pointer_cast(&first[0]); + auto constexpr max_vector_bytes = 
32; auto const alignment = - 1 << cuda::std::countr_zero(reinterpret_cast(ptr) | 16); + 1 << cuda::std::countr_zero(reinterpret_cast(ptr) | max_vector_bytes); auto const vector_size = alignment / sizeof(value_type); switch (vector_size) { @@ -184,6 +185,10 @@ class hyperloglog_ref { kernel = reinterpret_cast( cuco::hyperloglog_ns::detail::add_shmem_vectorized<8, hyperloglog_ref>); break; + case 16: + kernel = reinterpret_cast( + cuco::hyperloglog_ns::detail::add_shmem_vectorized<16, hyperloglog_ref>); + break; }; } From 204b8e25aa6cf09ee12e5ea357f7b799096b6287 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 15 Mar 2024 14:56:28 +0000 Subject: [PATCH 34/78] Fix processing of remaining items --- include/cuco/detail/hyperloglog/kernels.cuh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh index 4da78d020..c7e03491e 100644 --- a/include/cuco/detail/hyperloglog/kernels.cuh +++ b/include/cuco/detail/hyperloglog/kernels.cuh @@ -49,12 +49,14 @@ CUCO_KERNEL void add_shmem_vectorized(typename RefType::value_type const* first, auto const loop_stride = cuco::detail::grid_stride(); auto idx = cuco::detail::global_thread_id(); + auto const grid = cooperative_groups::this_grid(); auto const block = cooperative_groups::this_thread_block(); local_ref_type local_ref(cuda::std::span{local_sketch, ref.sketch_bytes()}, {}); local_ref.clear(block); block.sync(); + // each thread processes VectorSize-many items per iteration vector_type vec; while (idx < n / VectorSize) { vec = *reinterpret_cast( @@ -64,8 +66,13 @@ CUCO_KERNEL void add_shmem_vectorized(typename RefType::value_type const* first, } idx += loop_stride; } - auto const remainder = n % VectorSize; - if (idx >= n / VectorSize and idx < n / VectorSize + remainder) { local_ref.add(*(first + idx)); } + // a single thread processes the remaining items + 
cooperative_groups::invoke_one(grid, [&]() { + auto const remainder = n % VectorSize; + for (int i = 0; i < remainder; ++i) { + local_ref.add(*(first + n - i - 1)); + } + }); block.sync(); ref.merge(block, local_ref); From fe1cf5a9a4e04205a66681efbe328570bd54bbcb Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 15 Mar 2024 15:02:49 +0000 Subject: [PATCH 35/78] Guard invoke_one with macro --- include/cuco/detail/__config | 4 ++++ include/cuco/detail/hyperloglog/kernels.cuh | 9 +++++++++ 2 files changed, 13 insertions(+) diff --git a/include/cuco/detail/__config b/include/cuco/detail/__config index e0ac92a23..6d4bf7339 100644 --- a/include/cuco/detail/__config +++ b/include/cuco/detail/__config @@ -39,6 +39,10 @@ #define CUCO_HAS_CUDA_BARRIER #endif +#if defined(CUDART_VERSION) && (CUDART_VERSION >= 12010) +#define CUCO_HAS_CG_INVOKE_ONE +#endif + #if (CUCO_CUDA_MINIMUM_ARCH >= 700) #define CUCO_HAS_INDEPENDENT_THREADS #endif diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh index c7e03491e..ba4ceb506 100644 --- a/include/cuco/detail/hyperloglog/kernels.cuh +++ b/include/cuco/detail/hyperloglog/kernels.cuh @@ -67,12 +67,21 @@ CUCO_KERNEL void add_shmem_vectorized(typename RefType::value_type const* first, idx += loop_stride; } // a single thread processes the remaining items +#if defined(CUCO_HAS_CG_INVOKE_ONE) cooperative_groups::invoke_one(grid, [&]() { auto const remainder = n % VectorSize; for (int i = 0; i < remainder; ++i) { local_ref.add(*(first + n - i - 1)); } }); +#else + if (grid.thread_rank() == 0) { + auto const remainder = n % VectorSize; + for (int i = 0; i < remainder; ++i) { + local_ref.add(*(first + n - i - 1)); + } + } +#endif block.sync(); ref.merge(block, local_ref); From ae9e77c826923aafe68caa67ff658a026ac01507 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 18 Mar 2024 23:05:34 +0000 
Subject: [PATCH 36/78] Specify sketch size/precision at runtime --- benchmarks/distinct_count_estimator_bench.cu | 24 +++-- .../device_ref_example.cu | 10 +- .../distinct_count_estimator.inl | 93 ++++++++++-------- .../distinct_count_estimator_ref.inl | 96 ++++++++++--------- include/cuco/detail/hyperloglog/finalizer.cuh | 78 ++++++++------- .../cuco/detail/hyperloglog/hyperloglog.cuh | 47 +++++---- .../detail/hyperloglog/hyperloglog_ref.cuh | 83 +++++++++------- include/cuco/distinct_count_estimator.cuh | 40 ++++---- include/cuco/distinct_count_estimator_ref.cuh | 47 ++++----- .../unique_sequence_test.cu | 25 ++--- 10 files changed, 293 insertions(+), 250 deletions(-) diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu index 12504f120..9b3ba02c0 100644 --- a/benchmarks/distinct_count_estimator_bench.cu +++ b/benchmarks/distinct_count_estimator_bench.cu @@ -56,7 +56,8 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list items(num_items); @@ -66,7 +67,7 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list(num_items, "InputSize"); - Estimator estimator; + Estimator estimator(sketch_size_kb); estimator.add(items.begin(), items.end()); double estimated_cardinality = estimator.estimate(); @@ -99,7 +100,8 @@ void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list items(num_items); @@ -109,7 +111,7 @@ void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list(num_items, "InputSize"); - Estimator estimator; + Estimator estimator(sketch_size_kb); state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { estimator.clear_async({launch.get_stream()}); @@ -119,21 +121,16 @@ void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list, - cuco::distinct_count_estimator, - cuco::distinct_count_estimator, - cuco::distinct_count_estimator, - cuco::distinct_count_estimator, - cuco::distinct_count_estimator, 
- cuco::distinct_count_estimator<__int128_t, 10>, - cuco::distinct_count_estimator<__int128_t, 11>, - cuco::distinct_count_estimator<__int128_t, 12>>; +using ESTIMATOR_RANGE = nvbench::type_list, + cuco::distinct_count_estimator, + cuco::distinct_count_estimator<__int128_t>>; NVBENCH_BENCH_TYPES(distinct_count_estimator_e2e, NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list)) .set_name("distinct_count_estimator_e2e") .set_type_axes_names({"Estimator", "Distribution"}) .add_int64_power_of_two_axis("NumInputs", {28, 29, 30}) + .add_int64_axis("SketchSizeKB", {8, 16, 32}) .set_max_noise(defaults::MAX_NOISE); NVBENCH_BENCH_TYPES(distinct_count_estimator_add, @@ -141,4 +138,5 @@ NVBENCH_BENCH_TYPES(distinct_count_estimator_add, .set_name("distinct_count_estimator::add_async") .set_type_axes_names({"Estimator", "Distribution"}) .add_int64_power_of_two_axis("NumInputs", {28, 29, 30}) + .add_int64_axis("SketchSizeKB", {8, 16, 32}) .set_max_noise(defaults::MAX_NOISE); \ No newline at end of file diff --git a/examples/distinct_count_estimator/device_ref_example.cu b/examples/distinct_count_estimator/device_ref_example.cu index 845634388..c8716e421 100644 --- a/examples/distinct_count_estimator/device_ref_example.cu +++ b/examples/distinct_count_estimator/device_ref_example.cu @@ -37,15 +37,14 @@ __global__ void piggyback_kernel(RefType ref, InputIt first, std::size_t n) using local_ref_type = typename RefType::with_scope; // Shared memory storage for the block-local estimator - alignas(local_ref_type::sketch_alignment()) - __shared__ std::byte local_sketch[local_ref_type::sketch_bytes()]; + extern __shared__ std::byte local_sketch[]; auto const loop_stride = gridDim.x * blockDim.x; auto idx = blockDim.x * blockIdx.x + threadIdx.x; auto const block = cooperative_groups::this_thread_block(); // Create the local estimator with the shared memory storage - local_ref_type local_ref(cuda::std::span{local_sketch, local_ref_type::sketch_bytes()}, {}); + local_ref_type 
local_ref(cuda::std::span{local_sketch, ref.sketch_bytes()}); // Initialize the local estimator local_ref.clear(block); @@ -103,8 +102,11 @@ int main(void) // Clear the estimator so it can be reused estimator.clear(); + // Number of dynamic shared memory bytes required to store a CTA-local sketch + auto const sketch_bytes = estimator.sketch_bytes(); + // Call the custom kernel and pass a non-owning reference to the estimator to the GPU - piggyback_kernel<<<10, 512>>>(estimator.ref(), items.begin(), num_items); + piggyback_kernel<<<10, 512, sketch_bytes>>>(estimator.ref(), items.begin(), num_items); // Calculate the cardinality estimate from the custom kernel std::size_t const estimated_cardinality_custom = estimator.estimate(); diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl index df68a0593..54806aba6 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl @@ -16,115 +16,124 @@ namespace cuco { -template -constexpr distinct_count_estimator::distinct_count_estimator( - Hash const& hash, Allocator const& alloc, cuco::cuda_stream_ref stream) - : impl_{std::make_unique(hash, alloc, stream)} +template +constexpr distinct_count_estimator::distinct_count_estimator( + std::size_t max_sketch_size_kb, + Hash const& hash, + Allocator const& alloc, + cuco::cuda_stream_ref stream) + : impl_{std::make_unique(max_sketch_size_kb, hash, alloc, stream)} { } -template -void distinct_count_estimator::clear_async( +template +void distinct_count_estimator::clear_async( cuco::cuda_stream_ref stream) noexcept { this->impl_->clear_async(stream); } -template -void distinct_count_estimator::clear( - cuco::cuda_stream_ref stream) +template +void distinct_count_estimator::clear(cuco::cuda_stream_ref stream) { this->impl_->clear(stream); } -template +template template 
-void distinct_count_estimator::add_async( +void distinct_count_estimator::add_async( InputIt first, InputIt last, cuco::cuda_stream_ref stream) noexcept { this->impl_->add_async(first, last, stream); } -template +template template -void distinct_count_estimator::add( - InputIt first, InputIt last, cuco::cuda_stream_ref stream) +void distinct_count_estimator::add(InputIt first, + InputIt last, + cuco::cuda_stream_ref stream) { this->impl_->add(first, last, stream); } -template +template template -void distinct_count_estimator::merge_async( - distinct_count_estimator const& other, +void distinct_count_estimator::merge_async( + distinct_count_estimator const& other, cuco::cuda_stream_ref stream) noexcept { this->impl_->merge_async(other, stream); } -template +template template -void distinct_count_estimator::merge( - distinct_count_estimator const& other, +void distinct_count_estimator::merge( + distinct_count_estimator const& other, cuco::cuda_stream_ref stream) { this->impl_->merge(other, stream); } -template +template template -void distinct_count_estimator::merge_async( +void distinct_count_estimator::merge_async( ref_type const& other, cuco::cuda_stream_ref stream) noexcept { this->impl_->merge_async(other, stream); } -template +template template -void distinct_count_estimator::merge( - ref_type const& other, cuco::cuda_stream_ref stream) +void distinct_count_estimator::merge(ref_type const& other, + cuco::cuda_stream_ref stream) { this->impl_->merge(other, stream); } -template -std::size_t distinct_count_estimator::estimate( +template +std::size_t distinct_count_estimator::estimate( cuco::cuda_stream_ref stream) const { return this->impl_->estimate(stream); } -template -typename distinct_count_estimator::ref_type<> -distinct_count_estimator::ref() const noexcept +template +typename distinct_count_estimator::ref_type<> +distinct_count_estimator::ref() const noexcept { return {this->sketch(), this->hash()}; } -template -auto distinct_count_estimator::hash() const 
noexcept +template +auto distinct_count_estimator::hash() const noexcept { return this->impl_->hash(); } -template -auto distinct_count_estimator::sketch() const noexcept +template +cuda::std::span distinct_count_estimator::sketch() + const noexcept { return this->impl_->sketch(); } -template -constexpr size_t -distinct_count_estimator::sketch_bytes() noexcept +template +constexpr size_t distinct_count_estimator::sketch_bytes() const noexcept +{ + return this->impl_->sketch_bytes(); +} + +template +constexpr size_t distinct_count_estimator::sketch_bytes( + size_t max_sketch_size_kb) noexcept { - return impl_type::sketch_bytes(); + return impl_type::sketch_bytes(max_sketch_size_kb); } -template -constexpr size_t -distinct_count_estimator::sketch_alignment() noexcept +template +constexpr size_t distinct_count_estimator::sketch_alignment() noexcept { - return impl_type::sketch(); + return impl_type::sketch_alignment(); } } // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl index 50bea1675..d0cf85475 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl @@ -16,123 +16,127 @@ namespace cuco { -template -template -__host__ __device__ constexpr distinct_count_estimator_ref:: - distinct_count_estimator_ref(cuda::std::span sketch_span, Hash const& hash) noexcept +template +__host__ + __device__ constexpr distinct_count_estimator_ref::distinct_count_estimator_ref( + cuda::std::span sketch_span, Hash const& hash) : impl_{sketch_span, hash} { } -template +template template -__device__ void distinct_count_estimator_ref::clear( - CG const& group) noexcept +__device__ void distinct_count_estimator_ref::clear(CG const& group) noexcept { this->impl_.clear(group); } -template -__host__ void 
distinct_count_estimator_ref::clear_async( +template +__host__ void distinct_count_estimator_ref::clear_async( cuco::cuda_stream_ref stream) noexcept { this->impl_.clear_async(stream); } -template -__host__ void distinct_count_estimator_ref::clear( - cuco::cuda_stream_ref stream) +template +__host__ void distinct_count_estimator_ref::clear(cuco::cuda_stream_ref stream) { this->impl_.clear(stream); } -template -__device__ void distinct_count_estimator_ref::add(T const& item) noexcept +template +__device__ void distinct_count_estimator_ref::add(T const& item) noexcept { this->impl_.add(item); } -template +template template -__host__ void distinct_count_estimator_ref::add_async( - InputIt first, InputIt last, cuco::cuda_stream_ref stream) +__host__ void distinct_count_estimator_ref::add_async(InputIt first, + InputIt last, + cuco::cuda_stream_ref stream) { this->impl_.add_async(first, last, stream); } -template +template template -__host__ void distinct_count_estimator_ref::add( - InputIt first, InputIt last, cuco::cuda_stream_ref stream) +__host__ void distinct_count_estimator_ref::add(InputIt first, + InputIt last, + cuco::cuda_stream_ref stream) { this->impl_.add(first, last, stream); } -template +template template -__device__ void distinct_count_estimator_ref::merge( - CG const& group, - distinct_count_estimator_ref const& other) noexcept +__device__ void distinct_count_estimator_ref::merge( + CG const& group, distinct_count_estimator_ref const& other) noexcept { this->impl_.merge(group, other.impl_); } -template +template template -__host__ void distinct_count_estimator_ref::merge_async( - distinct_count_estimator_ref const& other, +__host__ void distinct_count_estimator_ref::merge_async( + distinct_count_estimator_ref const& other, cuco::cuda_stream_ref stream) noexcept { this->impl_.merge_async(other, stream); } -template +template template -__host__ void distinct_count_estimator_ref::merge( - distinct_count_estimator_ref const& other, - cuco::cuda_stream_ref 
stream) +__host__ void distinct_count_estimator_ref::merge( + distinct_count_estimator_ref const& other, cuco::cuda_stream_ref stream) { this->impl_.merge(other, stream); } -template -__device__ std::size_t distinct_count_estimator_ref::estimate( +template +__device__ std::size_t distinct_count_estimator_ref::estimate( cooperative_groups::thread_block const& group) const noexcept { return this->impl_.estimate(group); } -template -__host__ std::size_t distinct_count_estimator_ref::estimate( +template +__host__ std::size_t distinct_count_estimator_ref::estimate( cuco::cuda_stream_ref stream) const { return this->impl_.estimate(stream); } -template -__host__ __device__ auto distinct_count_estimator_ref::hash() - const noexcept +template +__host__ __device__ auto distinct_count_estimator_ref::hash() const noexcept { return this->impl_.hash(); } -template -__host__ __device__ auto distinct_count_estimator_ref::sketch() - const noexcept +template +__host__ __device__ cuda::std::span +distinct_count_estimator_ref::sketch() const noexcept { return this->impl_.sketch(); } -template +template __host__ __device__ constexpr std::size_t -distinct_count_estimator_ref::sketch_bytes() noexcept +distinct_count_estimator_ref::sketch_bytes() const noexcept { - return impl_type::sketch_bytes(); + return this->impl_.sketch_bytes(); } -template +template __host__ __device__ constexpr std::size_t -distinct_count_estimator_ref::sketch_alignment() noexcept +distinct_count_estimator_ref::sketch_bytes(std::size_t max_sketch_size_kb) noexcept +{ + return impl_type::sketch_bytes(max_sketch_size_kb); +} + +template +__host__ __device__ constexpr std::size_t +distinct_count_estimator_ref::sketch_alignment() noexcept { return impl_type::sketch_alignment(); } diff --git a/include/cuco/detail/hyperloglog/finalizer.cuh b/include/cuco/detail/hyperloglog/finalizer.cuh index 60a7ffcab..f40a0e751 100644 --- a/include/cuco/detail/hyperloglog/finalizer.cuh +++ 
b/include/cuco/detail/hyperloglog/finalizer.cuh @@ -30,17 +30,24 @@ namespace cuco::hyperloglog_ns::detail { * @note Variable names correspond to the definitions given in the HLL++ paper: * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf * - * @tparam Precision Tuning parameter to trade accuracy for runtime/memory footprint */ -template class finalizer { // Note: Most of the types in this implementation are explicit instead of relying on `auto` to // avoid confusion with the reference implementation. - // this minimum number of registers is required by HLL++ - static_assert(Precision >= 4, "Precision must be greater or equal to 4"); - public: + /** + * @brief Constructs an HLL finalizer object. + * + * @throws Iff precision value is not supported + * + * @param precision HLL precision parameter + */ + __host__ __device__ constexpr finalizer(int precision) : precision_{precision}, m_{1 << precision} + { + // TODO check if precision >= 4 + } + /** * @brief Compute the bias-corrected cardinality estimate. * @@ -49,53 +56,50 @@ class finalizer { * * @return Bias-corrected cardinality estimate */ - __host__ __device__ static std::size_t constexpr finalize(double z, int v) noexcept + __host__ __device__ constexpr std::size_t operator()(double z, int v) const noexcept { auto e = alpha_mm() / z; if (v > 0) { // Use linear counting for small cardinality estimates. - double const h = m * log(static_cast(m) / v); + double const h = this->m_ * log(static_cast(this->m_) / v); // The threshold `2.5 * m` is from the original HLL algorithm. - if (e <= 2.5 * m) { return cuda::std::round(h); } + if (e <= 2.5 * this->m_) { return cuda::std::round(h); } - if constexpr (Precision < 19) { - e = (h <= threshold(Precision)) ? h : bias_corrected_estimate(e); + if (this->precision_ < 19) { + e = (h <= threshold(this->precision_)) ? h : bias_corrected_estimate(e); } } else { // HLL++ is defined only when p < 19, otherwise we need to fallback to HLL. 
- if constexpr (Precision < 19) { e = bias_corrected_estimate(e); } + if (this->precision_ < 19) { e = bias_corrected_estimate(e); } } return cuda::std::round(e); } private: - static auto constexpr m = (1 << Precision); ///< Number of registers - static auto constexpr k = 6; ///< Number of interpolation points to consider - - __host__ __device__ static double constexpr alpha_mm() noexcept + __host__ __device__ constexpr double alpha_mm() const noexcept { - if constexpr (m == 16) { - return 0.673 * m * m; - } else if constexpr (m == 32) { - return 0.697 * m * m; - } else if constexpr (m == 64) { - return 0.709 * m * m; + if (this->m_ == 16) { + return 0.673 * this->m_ * this->m_; + } else if (this->m_ == 32) { + return 0.697 * this->m_ * this->m_; + } else if (this->m_ == 64) { + return 0.709 * this->m_ * this->m_; } else { - return (0.7213 / (1.0 + 1.079 / m)) * m * m; + return (0.7213 / (1.0 + 1.079 / this->m_)) * this->m_ * this->m_; } } - __host__ __device__ static double constexpr bias_corrected_estimate(double e) noexcept + __host__ __device__ constexpr double bias_corrected_estimate(double e) const noexcept { - return (e < 5.0 * m) ? e - bias(e) : e; + return (e < 5.0 * this->m_) ? 
e - bias(e) : e; } - __host__ __device__ static double constexpr bias(double e) noexcept + __host__ __device__ constexpr double bias(double e) const noexcept { auto const anchor_index = interpolation_anchor_index(e); - int const n = raw_estimate_data_size(Precision); + int const n = raw_estimate_data_size(this->precision_); auto low = cuda::std::max(anchor_index - k + 1, 0); auto high = cuda::std::min(low + k, n); @@ -106,7 +110,7 @@ class finalizer { high += 1; } - auto biases = bias_data(Precision); + auto biases = bias_data(this->precision_); double bias_sum = 0.0; for (int i = low; i < high; ++i) { bias_sum += biases[i]; @@ -115,19 +119,19 @@ class finalizer { return bias_sum / (high - low); } - __host__ __device__ static double distance(double e, int i) noexcept + __host__ __device__ constexpr double distance(double e, int i) const noexcept { - auto const diff = e - raw_estimate_data(Precision)[i]; + auto const diff = e - raw_estimate_data(this->precision_)[i]; return diff * diff; } - __host__ __device__ static int interpolation_anchor_index(double e) noexcept + __host__ __device__ constexpr int interpolation_anchor_index(double e) const noexcept { - auto estimates = raw_estimate_data(Precision); - int const n = raw_estimate_data_size(Precision); - int left = 0; - int right = static_cast(n) - 1; - int mid; + auto estimates = raw_estimate_data(this->precision_); + int const n = raw_estimate_data_size(this->precision_); + int left = 0; + int right = static_cast(n) - 1; + int mid = -1; int candidate_index = 0; // Index of the closest element found while (left <= right) { @@ -157,5 +161,9 @@ class finalizer { return candidate_index; } + + static constexpr auto k = 6; ///< Number of interpolation points to consider + int precision_; + int m_; }; } // namespace cuco::hyperloglog_ns::detail \ No newline at end of file diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh index 56e13da66..159afeb99 100644 --- 
a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -32,25 +32,20 @@ namespace cuco::detail { * * @note This class implements the HyperLogLog/HyperLogLog++ algorithm: * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf. - * @note The `Precision` parameter can be used to trade runtime/memory footprint for better - * accuracy. A higher value corresponds to a more accurate result, however, setting the precision - * too high will result in deminishing results. * * @tparam T Type of items to count - * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy * @tparam Scope The scope in which operations will be performed by individual threads * @tparam Hash Hash function used to hash items * @tparam Allocator Type of allocator used for device storage */ -template +template class hyperloglog { public: - static constexpr auto thread_scope = Scope; ///< CUDA thread scope - static constexpr auto precision = Precision; ///< Precision + static constexpr auto thread_scope = Scope; ///< CUDA thread scope template - using ref_type = hyperloglog_ref; ///< Non-owning reference - ///< type + using ref_type = hyperloglog_ref; ///< Non-owning reference + ///< type using value_type = typename ref_type<>::value_type; ///< Type of items to count using hash_type = typename ref_type<>::hash_type; ///< Hash function type @@ -63,15 +58,19 @@ class hyperloglog { * * @note This function synchronizes the given stream. 
* + * @param max_sketch_size_kb Maximum sketch size in KB * @param hash The hash function used to hash items * @param alloc Allocator used for allocating device storage * @param stream CUDA stream used to initialize the object */ - constexpr hyperloglog(Hash const& hash, Allocator const& alloc, cuco::cuda_stream_ref stream) + constexpr hyperloglog(std::size_t max_sketch_size_kb, + Hash const& hash, + Allocator const& alloc, + cuco::cuda_stream_ref stream) : allocator_{alloc}, - deleter_{this->sketch_bytes(), this->allocator_}, - sketch_{this->allocator_.allocate(this->sketch_bytes()), this->deleter_}, - ref_{cuda::std::span{this->sketch_.get(), this->sketch_bytes()}, hash} + deleter_{this->sketch_bytes(max_sketch_size_kb), this->allocator_}, + sketch_{this->allocator_.allocate(this->sketch_bytes(max_sketch_size_kb)), this->deleter_}, + ref_{cuda::std::span{this->sketch_.get(), this->sketch_bytes(max_sketch_size_kb)}, hash} { this->ref_.clear_async(stream); } @@ -153,7 +152,7 @@ class hyperloglog { * @param stream CUDA stream this operation is executed in */ template - void merge_async(hyperloglog const& other, + void merge_async(hyperloglog const& other, cuco::cuda_stream_ref stream) noexcept { this->ref_.merge_async(other.ref(), stream); @@ -172,7 +171,7 @@ class hyperloglog { * @param stream CUDA stream this operation is executed in */ template - void merge(hyperloglog const& other, + void merge(hyperloglog const& other, cuco::cuda_stream_ref stream) { this->ref_.merge(other.ref(), stream); @@ -242,7 +241,7 @@ class hyperloglog { * * @return The cuda::std::span of the sketch */ - [[nodiscard]] auto sketch() const noexcept { return this->ref_.sketch(); } + [[nodiscard]] cuda::std::span sketch() const noexcept { return this->ref_.sketch(); } /** * @brief Gets the number of bytes required for the sketch storage. 
@@ -251,7 +250,19 @@ class hyperloglog { */ [[nodiscard]] constexpr std::size_t sketch_bytes() const noexcept { - return ref_type<>::sketch_bytes(); + return this->ref_.sketch_bytes(); + } + + /** + * @brief Gets the number of bytes required for the sketch storage. + * + * @param max_sketch_size_kb Upper bound sketch size in KB + * + * @return The number of bytes required for the sketch + */ + [[nodiscard]] static constexpr std::size_t sketch_bytes(std::size_t max_sketch_size_kb) noexcept + { + return ref_type<>::sketch_bytes(max_sketch_size_kb); } /** @@ -273,7 +284,7 @@ class hyperloglog { // Needs to be friends with other instantiations of this class template to have access to their // storage - template + template friend class hyperloglog; }; } // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index 60552bf00..5016fb21f 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -44,30 +44,25 @@ namespace cuco::detail { * * @note This class implements the HyperLogLog/HyperLogLog++ algorithm: * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf. - * @note The `Precision` parameter can be used to trade runtime/memory footprint for better - * accuracy. A higher value corresponds to a more accurate result, however, setting the precision - * too high will result in deminishing results. 
* * @tparam T Type of items to count - * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy * @tparam Scope The scope in which operations will be performed by individual threads * @tparam Hash Hash function used to hash items */ -template +template class hyperloglog_ref { using register_type = int; ///< Register array storage // We use `int` here since this is the smallest type that supports native `atomicMax` on GPUs using fp_type = float; ///< Floating point type used for reduction public: - static constexpr auto thread_scope = Scope; ///< CUDA thread scope - static constexpr auto precision = Precision; ///< Precision + static constexpr auto thread_scope = Scope; ///< CUDA thread scope using value_type = T; ///< Type of items to count using hash_type = Hash; ///< Hash function type template - using with_scope = hyperloglog_ref; ///< Ref type with different - ///< thread scope + using with_scope = hyperloglog_ref; ///< Ref type with different + ///< thread scope /** * @brief Constructs a non-owning `hyperloglog_ref` object. 
@@ -75,10 +70,11 @@ class hyperloglog_ref { * @param sketch_span Reference to sketch storage * @param hash The hash function used to hash items */ - template - __host__ __device__ constexpr hyperloglog_ref(cuda::std::span sketch_span, - Hash const& hash) noexcept + __host__ __device__ constexpr hyperloglog_ref(cuda::std::span sketch_span, + Hash const& hash) : hash_{hash}, + precision_{cuda::std::countr_zero(this->sketch_bytes(sketch_span.size() / 1024) / + sizeof(register_type))}, sketch_{reinterpret_cast(sketch_span.data()), this->sketch_bytes() / sizeof(register_type)} { @@ -133,10 +129,10 @@ class hyperloglog_ref { __device__ void add(T const& item) noexcept { using hash_value_type = decltype(cuda::std::declval()(cuda::std::declval())); - hash_value_type constexpr register_mask = (1ull << Precision) - 1; - auto const h = this->hash_(item); - auto const reg = h & register_mask; - auto const zeroes = cuda::std::countl_zero(h | register_mask) + 1; // __clz + hash_value_type const register_mask = (1ull << this->precision_) - 1; + auto const h = this->hash_(item); + auto const reg = h & register_mask; + auto const zeroes = cuda::std::countl_zero(h | register_mask) + 1; // __clz this->update_max(reg, zeroes); } @@ -260,9 +256,12 @@ class hyperloglog_ref { * @param other Other estimator reference to be merged into `*this` */ template - __device__ void merge(CG const& group, - hyperloglog_ref const& other) noexcept + __device__ void merge(CG const& group, hyperloglog_ref const& other) noexcept { + if (other.precision_ != this->precision_) { + __trap(); // TODO check if this hurts performance + } + for (int i = group.thread_rank(); i < this->sketch_.size(); i += group.size()) { this->update_max(i, other.sketch_[i]); } @@ -278,7 +277,7 @@ class hyperloglog_ref { * @param stream CUDA stream this operation is executed in */ template - __host__ void merge_async(hyperloglog_ref const& other, + __host__ void merge_async(hyperloglog_ref const& other, cuco::cuda_stream_ref 
stream) noexcept { auto constexpr block_size = 1024; @@ -297,7 +296,7 @@ class hyperloglog_ref { * @param stream CUDA stream this operation is executed in */ template - __host__ void merge(hyperloglog_ref const& other, + __host__ void merge(hyperloglog_ref const& other, cuco::cuda_stream_ref stream) { this->merge_async(other, stream); @@ -353,9 +352,10 @@ class hyperloglog_ref { group.sync(); if (group.thread_rank() == 0) { - auto const z = block_sum.load(cuda::std::memory_order_relaxed); - auto const v = block_zeroes.load(cuda::std::memory_order_relaxed); - estimate = cuco::hyperloglog_ns::detail::finalizer::finalize(z, v); + auto const z = block_sum.load(cuda::std::memory_order_relaxed); + auto const v = block_zeroes.load(cuda::std::memory_order_relaxed); + auto const finalize = cuco::hyperloglog_ns::detail::finalizer(this->precision_); + estimate = finalize(z, v); } group.sync(); @@ -373,7 +373,7 @@ class hyperloglog_ref { */ [[nodiscard]] __host__ std::size_t estimate(cuco::cuda_stream_ref stream) const { - auto const num_regs = 1ull << Precision; + auto const num_regs = 1ull << this->precision_; thrust::host_vector host_sketch(num_regs); // TODO check if storage is host accessible @@ -393,8 +393,10 @@ class hyperloglog_ref { zeroes += reg == 0; } + auto const finalize = cuco::hyperloglog_ns::detail::finalizer(this->precision_); + // pass intermediate result to finalizer for bias correction, etc. - return cuco::hyperloglog_ns::detail::finalizer::finalize(sum, zeroes); + return finalize(sum, zeroes); } /** @@ -409,16 +411,33 @@ class hyperloglog_ref { * * @return The cuda::std::span of the sketch */ - [[nodiscard]] __host__ __device__ auto sketch() const noexcept { return this->sketch_; } + [[nodiscard]] __host__ __device__ cuda::std::span sketch() const noexcept + { + return cuda::std::span(reinterpret_cast(this->sketch_.data()), + this->sketch_bytes()); + } + + /** + * @brief Gets the number of bytes required for the sketch storage. 
+ * + * @return The number of bytes required for the sketch + */ + [[nodiscard]] __host__ __device__ std::size_t sketch_bytes() const noexcept + { + return (1ull << this->precision_) * sizeof(register_type); + } /** * @brief Gets the number of bytes required for the sketch storage. * + * @param max_sketch_size_kb Upper bound sketch size in KB + * * @return The number of bytes required for the sketch */ - [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes() noexcept + [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes( + std::size_t max_sketch_size_kb) noexcept { - return (1ull << Precision) * sizeof(register_type); + return cuda::std::bit_floor(max_sketch_size_kb * 1024); } /** @@ -474,11 +493,11 @@ class hyperloglog_ref { shmem_bytes); } - hash_type hash_; ///< Hash function used to hash items - cuda::std::span - sketch_; ///< HLL sketch storage + hash_type hash_; ///< Hash function used to hash items + int32_t precision_; ///< HLL precision parameter + cuda::std::span sketch_; ///< HLL sketch storage - template + template friend class hyperloglog_ref; }; } // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh index 38f9cbd16..0a2490ad7 100644 --- a/include/cuco/distinct_count_estimator.cuh +++ b/include/cuco/distinct_count_estimator.cuh @@ -32,9 +32,6 @@ namespace cuco { * * @note This implementation is based on the HyperLogLog++ algorithm: * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf. - * @note The `Precision` parameter can be used to trade runtime/memory footprint for better - * accuracy. A higher value corresponds to a more accurate result, however, setting the precision - * too high will result in deminishing returns. 
* * @tparam T Type of items to count * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy @@ -43,21 +40,18 @@ namespace cuco { * @tparam Allocator Type of allocator used for device storage */ template , class Allocator = cuco::cuda_allocator> class distinct_count_estimator { - using impl_type = detail::hyperloglog; + using impl_type = detail::hyperloglog; public: static constexpr auto thread_scope = impl_type::thread_scope; ///< CUDA thread scope - static constexpr auto precision = impl_type::precision; ///< Precision template - using ref_type = - cuco::distinct_count_estimator_ref; ///< Non-owning reference - ///< type + using ref_type = cuco::distinct_count_estimator_ref; ///< Non-owning reference + ///< type using value_type = typename impl_type::value_type; ///< Type of items to count using allocator_type = typename impl_type::allocator_type; ///< Allocator type @@ -68,13 +62,15 @@ class distinct_count_estimator { * * @note This function synchronizes the given stream. 
* + * @param max_sketch_size_kb Maximum sketch size in KB * @param hash The hash function used to hash items * @param alloc Allocator used for allocating device storage * @param stream CUDA stream used to initialize the object */ - constexpr distinct_count_estimator(Hash const& hash = {}, - Allocator const& alloc = {}, - cuco::cuda_stream_ref stream = {}); + constexpr distinct_count_estimator(std::size_t max_sketch_size_kb = 32, + Hash const& hash = {}, + Allocator const& alloc = {}, + cuco::cuda_stream_ref stream = {}); ~distinct_count_estimator() = default; @@ -148,9 +144,8 @@ class distinct_count_estimator { * @param stream CUDA stream this operation is executed in */ template - void merge_async( - distinct_count_estimator const& other, - cuco::cuda_stream_ref stream = {}) noexcept; + void merge_async(distinct_count_estimator const& other, + cuco::cuda_stream_ref stream = {}) noexcept; /** * @brief Merges the result of `other` estimator into `*this` estimator. @@ -165,7 +160,7 @@ class distinct_count_estimator { * @param stream CUDA stream this operation is executed in */ template - void merge(distinct_count_estimator const& other, + void merge(distinct_count_estimator const& other, cuco::cuda_stream_ref stream = {}); /** @@ -223,14 +218,23 @@ class distinct_count_estimator { * * @return The cuda::std::span of the sketch */ - [[nodiscard]] auto sketch() const noexcept; + [[nodiscard]] cuda::std::span sketch() const noexcept; /** * @brief Gets the number of bytes required for the sketch storage. * * @return The number of bytes required for the sketch */ - [[nodiscard]] static constexpr std::size_t sketch_bytes() noexcept; + [[nodiscard]] constexpr std::size_t sketch_bytes() const noexcept; + + /** + * @brief Gets the number of bytes required for the sketch storage. 
+ * + * @param max_sketch_size_kb Upper bound sketch size in KB + * + * @return The number of bytes required for the sketch + */ + [[nodiscard]] static constexpr std::size_t sketch_bytes(std::size_t max_sketch_size_kb) noexcept; /** * @brief Gets the alignment required for the sketch storage. diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh index 905a6d379..65f899723 100644 --- a/include/cuco/distinct_count_estimator_ref.cuh +++ b/include/cuco/distinct_count_estimator_ref.cuh @@ -30,29 +30,23 @@ namespace cuco { * * @note This implementation is based on the HyperLogLog++ algorithm: * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf. - * @note The `Precision` parameter can be used to trade runtime/memory footprint for better - * accuracy. A higher value corresponds to a more accurate result, however, setting the precision - * too high will result in deminishing results. * * @tparam T Type of items to count - * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy * @tparam Scope The scope in which operations will be performed by individual threads * @tparam Hash Hash function used to hash items */ -template +template class distinct_count_estimator_ref { - using impl_type = detail::hyperloglog_ref; + using impl_type = detail::hyperloglog_ref; public: static constexpr auto thread_scope = impl_type::thread_scope; ///< CUDA thread scope - static constexpr auto precision = impl_type::precision; ///< Precision using value_type = typename impl_type::value_type; ///< Type of items to count template - using with_scope = - distinct_count_estimator_ref; ///< Ref type with different thread - ///< scope + using with_scope = distinct_count_estimator_ref; ///< Ref type with different + ///< thread scope // TODO let storage_type be inferred? 
/** @@ -61,9 +55,8 @@ class distinct_count_estimator_ref { * @param sketch_span Reference to sketch storage * @param hash The hash function used to hash items */ - template - __host__ __device__ constexpr distinct_count_estimator_ref(cuda::std::span sketch_span, - Hash const& hash = {}) noexcept; + __host__ __device__ constexpr distinct_count_estimator_ref(cuda::std::span sketch_span, + Hash const& hash = {}); /** * @brief Resets the estimator, i.e., clears the current count estimate. @@ -140,9 +133,8 @@ class distinct_count_estimator_ref { * @param other Other estimator reference to be merged into `*this` */ template - __device__ void merge( - CG const& group, - distinct_count_estimator_ref const& other) noexcept; + __device__ void merge(CG const& group, + distinct_count_estimator_ref const& other) noexcept; /** * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator. @@ -153,9 +145,8 @@ class distinct_count_estimator_ref { * @param stream CUDA stream this operation is executed in */ template - __host__ void merge_async( - distinct_count_estimator_ref const& other, - cuco::cuda_stream_ref stream = {}) noexcept; + __host__ void merge_async(distinct_count_estimator_ref const& other, + cuco::cuda_stream_ref stream = {}) noexcept; /** * @brief Merges the result of `other` estimator reference into `*this` estimator. @@ -169,7 +160,7 @@ class distinct_count_estimator_ref { * @param stream CUDA stream this operation is executed in */ template - __host__ void merge(distinct_count_estimator_ref const& other, + __host__ void merge(distinct_count_estimator_ref const& other, cuco::cuda_stream_ref stream = {}); /** @@ -205,14 +196,24 @@ class distinct_count_estimator_ref { * * @return The cuda::std::span of the sketch */ - [[nodiscard]] __host__ __device__ auto sketch() const noexcept; + [[nodiscard]] __host__ __device__ cuda::std::span sketch() const noexcept; /** * @brief Gets the number of bytes required for the sketch storage. 
* * @return The number of bytes required for the sketch */ - [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes() noexcept; + [[nodiscard]] __host__ __device__ constexpr std::size_t sketch_bytes() const noexcept; + + /** + * @brief Gets the number of bytes required for the sketch storage. + * + * @param max_sketch_size_kb Upper bound sketch size in KB + * + * @return The number of bytes required for the sketch + */ + [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes( + std::size_t max_sketch_size_kb) noexcept; /** * @brief Gets the alignment required for the sketch storage. @@ -224,7 +225,7 @@ class distinct_count_estimator_ref { private: impl_type impl_; ///< Implementation object - template + template friend class distinct_count_estimator_ref; }; } // namespace cuco diff --git a/tests/distinct_count_estimator/unique_sequence_test.cu b/tests/distinct_count_estimator/unique_sequence_test.cu index 9ebbc6291..fffdd751b 100644 --- a/tests/distinct_count_estimator/unique_sequence_test.cu +++ b/tests/distinct_count_estimator/unique_sequence_test.cu @@ -32,24 +32,9 @@ TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence", "", ((typename T, int32_t Precision, typename Hash), T, Precision, Hash), - (int32_t, 9, cuco::xxhash_64), - (int32_t, 11, cuco::xxhash_64), - (int32_t, 13, cuco::xxhash_64), - (int32_t, 16, cuco::xxhash_64), - (int32_t, 18, cuco::xxhash_64), - (int32_t, 20, cuco::xxhash_64), - (int64_t, 9, cuco::xxhash_64), - (int64_t, 11, cuco::xxhash_64), - (int64_t, 13, cuco::xxhash_64), - (int64_t, 16, cuco::xxhash_64), - (int64_t, 18, cuco::xxhash_64), - (int64_t, 20, cuco::xxhash_64), - (__int128_t, 9, cuco::xxhash_64<__int128_t>), - (__int128_t, 11, cuco::xxhash_64<__int128_t>), - (__int128_t, 13, cuco::xxhash_64<__int128_t>), - (__int128_t, 16, cuco::xxhash_64<__int128_t>), - (__int128_t, 18, cuco::xxhash_64<__int128_t>), - (__int128_t, 20, cuco::xxhash_64<__int128_t>)) + (int32_t, 
cuco::xxhash_64), + (int64_t, cuco::xxhash_64), + (__int128_t, cuco::xxhash_64<__int128_t>)) { // This factor determines the error threshold for passing the test // TODO might be too high @@ -59,6 +44,8 @@ TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence", 1.04 / std::sqrt(static_cast(1ull << Precision)); auto num_items_pow2 = GENERATE(25, 26, 28); + auto sketch_size_kb = GENERATE(2, 8, 32, 256, 1024, 4096); + INFO("sketch_size_kb=" << sketch_size_kb); INFO("num_items=2^" << num_items_pow2); auto num_items = 1ull << num_items_pow2; @@ -68,7 +55,7 @@ TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence", thrust::sequence(items.begin(), items.end(), 0); // Initialize the estimator - cuco::distinct_count_estimator estimator; + cuco::distinct_count_estimator estimator(sketch_size_kb); REQUIRE(estimator.estimate() == 0); From 65ff70a5a6d128958e1af2237bf4422d2d4bed42 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 18 Mar 2024 23:27:50 +0000 Subject: [PATCH 37/78] Pre-compute register mask --- include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index 5016fb21f..af450d080 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -54,6 +54,8 @@ class hyperloglog_ref { using register_type = int; ///< Register array storage // We use `int` here since this is the smallest type that supports native `atomicMax` on GPUs using fp_type = float; ///< Floating point type used for reduction + using hash_value_type = + decltype(cuda::std::declval()(cuda::std::declval())); ///< Hash value type public: static constexpr auto thread_scope = Scope; ///< CUDA thread scope @@ -75,6 +77,7 @@ class hyperloglog_ref { : hash_{hash}, 
precision_{cuda::std::countr_zero(this->sketch_bytes(sketch_span.size() / 1024) / sizeof(register_type))}, + register_mask_{(1ull << this->precision_) - 1}, sketch_{reinterpret_cast(sketch_span.data()), this->sketch_bytes() / sizeof(register_type)} { @@ -128,11 +131,9 @@ class hyperloglog_ref { */ __device__ void add(T const& item) noexcept { - using hash_value_type = decltype(cuda::std::declval()(cuda::std::declval())); - hash_value_type const register_mask = (1ull << this->precision_) - 1; - auto const h = this->hash_(item); - auto const reg = h & register_mask; - auto const zeroes = cuda::std::countl_zero(h | register_mask) + 1; // __clz + auto const h = this->hash_(item); + auto const reg = h & this->register_mask_; + auto const zeroes = cuda::std::countl_zero(h | this->register_mask_) + 1; // __clz this->update_max(reg, zeroes); } @@ -495,6 +496,7 @@ class hyperloglog_ref { hash_type hash_; ///< Hash function used to hash items int32_t precision_; ///< HLL precision parameter + hash_value_type register_mask_; ///< Mask used to separate register index from count cuda::std::span sketch_; ///< HLL sketch storage template From 806879922d320f636c233b00b5e6c762d85100fd Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 19 Mar 2024 16:12:01 +0000 Subject: [PATCH 38/78] Fix unit test --- .../unique_sequence_test.cu | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/tests/distinct_count_estimator/unique_sequence_test.cu b/tests/distinct_count_estimator/unique_sequence_test.cu index fffdd751b..2addf3bee 100644 --- a/tests/distinct_count_estimator/unique_sequence_test.cu +++ b/tests/distinct_count_estimator/unique_sequence_test.cu @@ -31,23 +31,25 @@ TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence", "", - ((typename T, int32_t Precision, typename Hash), T, Precision, Hash), + ((typename T, typename Hash), T, Hash), (int32_t, cuco::xxhash_64), (int64_t, 
cuco::xxhash_64), (__int128_t, cuco::xxhash_64<__int128_t>)) { + auto num_items_pow2 = GENERATE(25, 26, 28); + auto hll_precision = GENERATE(8, 10, 12, 13, 18, 20); + auto sketch_size_kb = 4 * (1ull << hll_precision) / 1024; + INFO("hll_precision=" << hll_precision); + INFO("sketch_size_kb=" << sketch_size_kb); + INFO("num_items=2^" << num_items_pow2); + auto num_items = 1ull << num_items_pow2; + // This factor determines the error threshold for passing the test // TODO might be too high double constexpr tolerance_factor = 2.5; // RSD for a given precision is given by the following formula double const relative_standard_deviation = - 1.04 / std::sqrt(static_cast(1ull << Precision)); - - auto num_items_pow2 = GENERATE(25, 26, 28); - auto sketch_size_kb = GENERATE(2, 8, 32, 256, 1024, 4096); - INFO("sketch_size_kb=" << sketch_size_kb); - INFO("num_items=2^" << num_items_pow2); - auto num_items = 1ull << num_items_pow2; + 1.04 / std::sqrt(static_cast(1ull << hll_precision)); thrust::device_vector items(num_items); From 04c303dcbe435f82237a4d382c8c74d2880735ba Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 20 Mar 2024 13:21:48 +0000 Subject: [PATCH 39/78] Add sketch_size_kb strong type and fix stupid bug where I called a static member function with this-> --- benchmarks/distinct_count_estimator_bench.cu | 2 +- .../host_bulk_example.cu | 1 - .../distinct_count_estimator.inl | 4 +- .../distinct_count_estimator_ref.inl | 3 +- .../cuco/detail/hyperloglog/hyperloglog.cuh | 12 ++-- .../detail/hyperloglog/hyperloglog_ref.cuh | 10 ++-- include/cuco/distinct_count_estimator.cuh | 13 +++-- include/cuco/distinct_count_estimator_ref.cuh | 7 ++- include/cuco/sketch_size.hpp | 55 +++++++++++++++++++ .../unique_sequence_test.cu | 3 +- 10 files changed, 87 insertions(+), 23 deletions(-) create mode 100644 include/cuco/sketch_size.hpp diff --git a/benchmarks/distinct_count_estimator_bench.cu 
b/benchmarks/distinct_count_estimator_bench.cu index 9b3ba02c0..07135a999 100644 --- a/benchmarks/distinct_count_estimator_bench.cu +++ b/benchmarks/distinct_count_estimator_bench.cu @@ -67,7 +67,7 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list(num_items, "InputSize"); - Estimator estimator(sketch_size_kb); + Estimator estimator(cuco::sketch_size_kb(sketch_size_kb)); estimator.add(items.begin(), items.end()); double estimated_cardinality = estimator.estimate(); diff --git a/examples/distinct_count_estimator/host_bulk_example.cu b/examples/distinct_count_estimator/host_bulk_example.cu index 9e60ae47b..add3cb626 100644 --- a/examples/distinct_count_estimator/host_bulk_example.cu +++ b/examples/distinct_count_estimator/host_bulk_example.cu @@ -25,7 +25,6 @@ * @file host_bulk_example.cu * @brief Demonstrates usage of `cuco::distinct_count_estimator` "bulk" host APIs. */ - int main(void) { using T = int; diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl index 54806aba6..be3fcfb9e 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl @@ -18,7 +18,7 @@ namespace cuco { template constexpr distinct_count_estimator::distinct_count_estimator( - std::size_t max_sketch_size_kb, + cuco::sketch_size_kb max_sketch_size_kb, Hash const& hash, Allocator const& alloc, cuco::cuda_stream_ref stream) @@ -125,7 +125,7 @@ constexpr size_t distinct_count_estimator::sketch_byt template constexpr size_t distinct_count_estimator::sketch_bytes( - size_t max_sketch_size_kb) noexcept + cuco::sketch_size_kb max_sketch_size_kb) noexcept { return impl_type::sketch_bytes(max_sketch_size_kb); } diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl 
b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl index d0cf85475..90f609b5d 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl @@ -129,7 +129,8 @@ distinct_count_estimator_ref::sketch_bytes() const noexcept template __host__ __device__ constexpr std::size_t -distinct_count_estimator_ref::sketch_bytes(std::size_t max_sketch_size_kb) noexcept +distinct_count_estimator_ref::sketch_bytes( + cuco::sketch_size_kb max_sketch_size_kb) noexcept { return impl_type::sketch_bytes(max_sketch_size_kb); } diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh index 159afeb99..b8e5c5db2 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -63,14 +64,14 @@ class hyperloglog { * @param alloc Allocator used for allocating device storage * @param stream CUDA stream used to initialize the object */ - constexpr hyperloglog(std::size_t max_sketch_size_kb, + constexpr hyperloglog(cuco::sketch_size_kb max_sketch_size_kb, Hash const& hash, Allocator const& alloc, cuco::cuda_stream_ref stream) : allocator_{alloc}, - deleter_{this->sketch_bytes(max_sketch_size_kb), this->allocator_}, - sketch_{this->allocator_.allocate(this->sketch_bytes(max_sketch_size_kb)), this->deleter_}, - ref_{cuda::std::span{this->sketch_.get(), this->sketch_bytes(max_sketch_size_kb)}, hash} + deleter_{sketch_bytes(max_sketch_size_kb), this->allocator_}, + sketch_{this->allocator_.allocate(sketch_bytes(max_sketch_size_kb)), this->deleter_}, + ref_{cuda::std::span{this->sketch_.get(), sketch_bytes(max_sketch_size_kb)}, hash} { this->ref_.clear_async(stream); } @@ -260,7 +261,8 @@ class hyperloglog { * * @return The number of bytes required for the sketch */ - [[nodiscard]] 
static constexpr std::size_t sketch_bytes(std::size_t max_sketch_size_kb) noexcept + [[nodiscard]] static constexpr std::size_t sketch_bytes( + cuco::sketch_size_kb max_sketch_size_kb) noexcept { return ref_type<>::sketch_bytes(max_sketch_size_kb); } diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index af450d080..75b95bf4a 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -75,8 +76,9 @@ class hyperloglog_ref { __host__ __device__ constexpr hyperloglog_ref(cuda::std::span sketch_span, Hash const& hash) : hash_{hash}, - precision_{cuda::std::countr_zero(this->sketch_bytes(sketch_span.size() / 1024) / - sizeof(register_type))}, + precision_{cuda::std::countr_zero( + sketch_bytes(cuco::sketch_size_kb(static_cast(sketch_span.size() / 1024))) / + sizeof(register_type))}, register_mask_{(1ull << this->precision_) - 1}, sketch_{reinterpret_cast(sketch_span.data()), this->sketch_bytes() / sizeof(register_type)} @@ -436,9 +438,9 @@ class hyperloglog_ref { * @return The number of bytes required for the sketch */ [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes( - std::size_t max_sketch_size_kb) noexcept + cuco::sketch_size_kb max_sketch_size_kb) noexcept { - return cuda::std::bit_floor(max_sketch_size_kb * 1024); + return cuda::std::bit_floor(static_cast(max_sketch_size_kb * 1024)); } /** diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh index 0a2490ad7..9860d7246 100644 --- a/include/cuco/distinct_count_estimator.cuh +++ b/include/cuco/distinct_count_estimator.cuh @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -34,7 +35,6 @@ namespace cuco { * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf. 
* * @tparam T Type of items to count - * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy * @tparam Scope The scope in which operations will be performed by individual threads * @tparam Hash Hash function used to hash items * @tparam Allocator Type of allocator used for device storage @@ -67,10 +67,10 @@ class distinct_count_estimator { * @param alloc Allocator used for allocating device storage * @param stream CUDA stream used to initialize the object */ - constexpr distinct_count_estimator(std::size_t max_sketch_size_kb = 32, - Hash const& hash = {}, - Allocator const& alloc = {}, - cuco::cuda_stream_ref stream = {}); + constexpr distinct_count_estimator(cuco::sketch_size_kb max_sketch_size_kb = 32_KB, + Hash const& hash = {}, + Allocator const& alloc = {}, + cuco::cuda_stream_ref stream = {}); ~distinct_count_estimator() = default; @@ -234,7 +234,8 @@ class distinct_count_estimator { * * @return The number of bytes required for the sketch */ - [[nodiscard]] static constexpr std::size_t sketch_bytes(std::size_t max_sketch_size_kb) noexcept; + [[nodiscard]] static constexpr std::size_t sketch_bytes( + cuco::sketch_size_kb max_sketch_size_kb) noexcept; /** * @brief Gets the alignment required for the sketch storage. 
diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh index 65f899723..feb2ac6d4 100644 --- a/include/cuco/distinct_count_estimator_ref.cuh +++ b/include/cuco/distinct_count_estimator_ref.cuh @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -35,7 +36,9 @@ namespace cuco { * @tparam Scope The scope in which operations will be performed by individual threads * @tparam Hash Hash function used to hash items */ -template +template > class distinct_count_estimator_ref { using impl_type = detail::hyperloglog_ref; @@ -213,7 +216,7 @@ class distinct_count_estimator_ref { * @return The number of bytes required for the sketch */ [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes( - std::size_t max_sketch_size_kb) noexcept; + cuco::sketch_size_kb max_sketch_size_kb) noexcept; /** * @brief Gets the alignment required for the sketch storage. diff --git a/include/cuco/sketch_size.hpp b/include/cuco/sketch_size.hpp new file mode 100644 index 000000000..f9dce1aed --- /dev/null +++ b/include/cuco/sketch_size.hpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +namespace cuco { + +/** + * @brief Strng type for specifying the sketch size of cuco::distinct_count_estimator(_ref) in KB. 
+ * + * Values can also be given as literals, e.g., 64.3_KB + */ +class sketch_size_kb { + public: + /** + * @brief Constructs a sketch_size_kb object. + * + * @param value The size of a sketch given in KB + */ + __host__ __device__ explicit constexpr sketch_size_kb(double value) noexcept : value_{value} {} + + /** + * @brief Conversion to value type. + * + * @return Sketch size in KB + */ + __host__ __device__ constexpr operator double() const noexcept { return this->value_; } + + private: + double value_; ///< Sketch size in KB +}; +} // namespace cuco + +// User-defined literal operators for sketch_size_KB +__host__ __device__ constexpr cuco::sketch_size_kb operator""_KB(long double value) +{ + return cuco::sketch_size_kb{static_cast(value)}; +} + +__host__ __device__ constexpr cuco::sketch_size_kb operator""_KB(unsigned long long int value) +{ + return cuco::sketch_size_kb{static_cast(value)}; +} \ No newline at end of file diff --git a/tests/distinct_count_estimator/unique_sequence_test.cu b/tests/distinct_count_estimator/unique_sequence_test.cu index 2addf3bee..7d6321de6 100644 --- a/tests/distinct_count_estimator/unique_sequence_test.cu +++ b/tests/distinct_count_estimator/unique_sequence_test.cu @@ -57,7 +57,8 @@ TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence", thrust::sequence(items.begin(), items.end(), 0); // Initialize the estimator - cuco::distinct_count_estimator estimator(sketch_size_kb); + cuco::distinct_count_estimator estimator{ + cuco::sketch_size_kb(sketch_size_kb)}; REQUIRE(estimator.estimate() == 0); From a7036ae06f0b7cafe4b8cbb253df6f89673f49a1 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 20 Mar 2024 13:29:29 +0000 Subject: [PATCH 40/78] Fix benchmark --- benchmarks/distinct_count_estimator_bench.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu index 
07135a999..9bc3fc514 100644 --- a/benchmarks/distinct_count_estimator_bench.cu +++ b/benchmarks/distinct_count_estimator_bench.cu @@ -67,7 +67,7 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list(num_items, "InputSize"); - Estimator estimator(cuco::sketch_size_kb(sketch_size_kb)); + Estimator estimator{cuco::sketch_size_kb(sketch_size_kb)}; estimator.add(items.begin(), items.end()); double estimated_cardinality = estimator.estimate(); @@ -111,7 +111,7 @@ void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list(num_items, "InputSize"); - Estimator estimator(sketch_size_kb); + Estimator estimator{cuco::sketch_size_kb(sketch_size_kb)}; state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { estimator.clear_async({launch.get_stream()}); From 3e25da738f762231aca8fcbf737b99f8aad6d96a Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 20 Mar 2024 14:17:41 +0000 Subject: [PATCH 41/78] More robust error estimation in benchmark --- benchmarks/distinct_count_estimator_bench.cu | 60 +++++++++++++------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu index 9bc3fc514..a75411002 100644 --- a/benchmarks/distinct_count_estimator_bench.cu +++ b/benchmarks/distinct_count_estimator_bench.cu @@ -48,6 +48,31 @@ template return set.size(); } +template +[[nodiscard]] double relative_error(nvbench::state& state, std::size_t num_samples = 5) +{ + using T = typename Estimator::value_type; + + auto const num_items = state.get_int64("NumInputs"); + auto const sketch_size_kb = state.get_int64("SketchSizeKB"); + + thrust::device_vector items(num_items); + + key_generator gen; + Estimator estimator{cuco::sketch_size_kb(sketch_size_kb)}; + double error_sum = 0; + for (std::size_t i = 0; i < num_samples; ++i) { + gen.generate(dist_from_state(state), items.begin(), 
items.end()); + estimator.add(items.begin(), items.end()); + double estimated_cardinality = estimator.estimate(); + double true_cardinality = exact_distinct_count(items.begin(), num_items); + error_sum += abs(true_cardinality - estimated_cardinality) / true_cardinality; + estimator.clear(); + } + + return error_sum / num_samples; +} + /** * @brief A benchmark evaluating `cuco::distinct_count_estimator` end-to-end performance */ @@ -59,36 +84,31 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list items(num_items); - - key_generator gen; - gen.generate(dist_from_state(state), items.begin(), items.end()); - state.add_element_count(num_items); state.add_global_memory_reads(num_items, "InputSize"); - Estimator estimator{cuco::sketch_size_kb(sketch_size_kb)}; - estimator.add(items.begin(), items.end()); + auto const err = relative_error(state); + auto& summ = state.add_summary("MeanRelativeError"); + summ.set_string("hint", "MRelErr"); + summ.set_string("short_name", "MeanRelativeError"); + summ.set_string("description", "Mean relatve approximation error."); + summ.set_float64("value", err); - double estimated_cardinality = estimator.estimate(); - double const true_cardinality = exact_distinct_count(items.begin(), num_items); - auto const relative_error = abs(true_cardinality - estimated_cardinality) / true_cardinality; + thrust::device_vector items(num_items); - auto& summ = state.add_summary("RelativeError"); - summ.set_string("hint", "RelErr"); - summ.set_string("short_name", "RelativeError"); - summ.set_string("description", "Relatve approximation error."); - summ.set_float64("value", relative_error); + key_generator gen; + gen.generate(dist_from_state(state), items.begin(), items.end()); - estimator.clear(); + Estimator estimator{cuco::sketch_size_kb(sketch_size_kb)}; + std::size_t estimated_cardinality = 0; state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { - 
estimator.clear_async({launch.get_stream()}); - timer.start(); estimator.add_async(items.begin(), items.end(), {launch.get_stream()}); estimated_cardinality = estimator.estimate({launch.get_stream()}); timer.stop(); + + estimator.clear_async({launch.get_stream()}); }); } @@ -113,11 +133,11 @@ void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list Date: Wed, 20 Mar 2024 15:01:11 +0000 Subject: [PATCH 42/78] Benchmark gmem fallback kernel --- benchmarks/distinct_count_estimator_bench.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu index a75411002..c5ae8a6b3 100644 --- a/benchmarks/distinct_count_estimator_bench.cu +++ b/benchmarks/distinct_count_estimator_bench.cu @@ -158,5 +158,5 @@ NVBENCH_BENCH_TYPES(distinct_count_estimator_add, .set_name("distinct_count_estimator::add_async") .set_type_axes_names({"Estimator", "Distribution"}) .add_int64_power_of_two_axis("NumInputs", {28, 29, 30}) - .add_int64_axis("SketchSizeKB", {8, 16, 32}) + .add_int64_axis("SketchSizeKB", {8, 16, 32, 256}) // 256KB uses gmem fallback kernel .set_max_noise(defaults::MAX_NOISE); \ No newline at end of file From 99c0dee0347a8c71cf16206fbcdc8626e0141a2f Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 20 Mar 2024 15:04:58 +0000 Subject: [PATCH 43/78] Rename max_sketch_size_kb -> sketch_size_kb --- .../distinct_count_estimator.inl | 8 ++++---- .../distinct_count_estimator_ref.inl | 4 ++-- include/cuco/detail/hyperloglog/hyperloglog.cuh | 16 ++++++++-------- .../cuco/detail/hyperloglog/hyperloglog_ref.cuh | 6 +++--- include/cuco/distinct_count_estimator.cuh | 14 +++++++------- include/cuco/distinct_count_estimator_ref.cuh | 4 ++-- 6 files changed, 26 insertions(+), 26 deletions(-) diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl 
b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl index be3fcfb9e..21128de9e 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl @@ -18,11 +18,11 @@ namespace cuco { template constexpr distinct_count_estimator::distinct_count_estimator( - cuco::sketch_size_kb max_sketch_size_kb, + cuco::sketch_size_kb sketch_size_kb, Hash const& hash, Allocator const& alloc, cuco::cuda_stream_ref stream) - : impl_{std::make_unique(max_sketch_size_kb, hash, alloc, stream)} + : impl_{std::make_unique(sketch_size_kb, hash, alloc, stream)} { } @@ -125,9 +125,9 @@ constexpr size_t distinct_count_estimator::sketch_byt template constexpr size_t distinct_count_estimator::sketch_bytes( - cuco::sketch_size_kb max_sketch_size_kb) noexcept + cuco::sketch_size_kb sketch_size_kb) noexcept { - return impl_type::sketch_bytes(max_sketch_size_kb); + return impl_type::sketch_bytes(sketch_size_kb); } template diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl index 90f609b5d..53beb5016 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl @@ -130,9 +130,9 @@ distinct_count_estimator_ref::sketch_bytes() const noexcept template __host__ __device__ constexpr std::size_t distinct_count_estimator_ref::sketch_bytes( - cuco::sketch_size_kb max_sketch_size_kb) noexcept + cuco::sketch_size_kb sketch_size_kb) noexcept { - return impl_type::sketch_bytes(max_sketch_size_kb); + return impl_type::sketch_bytes(sketch_size_kb); } template diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh index b8e5c5db2..5c0d07833 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ 
b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -59,19 +59,19 @@ class hyperloglog { * * @note This function synchronizes the given stream. * - * @param max_sketch_size_kb Maximum sketch size in KB + * @param sketch_size_kb Maximum sketch size in KB * @param hash The hash function used to hash items * @param alloc Allocator used for allocating device storage * @param stream CUDA stream used to initialize the object */ - constexpr hyperloglog(cuco::sketch_size_kb max_sketch_size_kb, + constexpr hyperloglog(cuco::sketch_size_kb sketch_size_kb, Hash const& hash, Allocator const& alloc, cuco::cuda_stream_ref stream) : allocator_{alloc}, - deleter_{sketch_bytes(max_sketch_size_kb), this->allocator_}, - sketch_{this->allocator_.allocate(sketch_bytes(max_sketch_size_kb)), this->deleter_}, - ref_{cuda::std::span{this->sketch_.get(), sketch_bytes(max_sketch_size_kb)}, hash} + deleter_{sketch_bytes(sketch_size_kb), this->allocator_}, + sketch_{this->allocator_.allocate(sketch_bytes(sketch_size_kb)), this->deleter_}, + ref_{cuda::std::span{this->sketch_.get(), sketch_bytes(sketch_size_kb)}, hash} { this->ref_.clear_async(stream); } @@ -257,14 +257,14 @@ class hyperloglog { /** * @brief Gets the number of bytes required for the sketch storage. 
* - * @param max_sketch_size_kb Upper bound sketch size in KB + * @param sketch_size_kb Upper bound sketch size in KB * * @return The number of bytes required for the sketch */ [[nodiscard]] static constexpr std::size_t sketch_bytes( - cuco::sketch_size_kb max_sketch_size_kb) noexcept + cuco::sketch_size_kb sketch_size_kb) noexcept { - return ref_type<>::sketch_bytes(max_sketch_size_kb); + return ref_type<>::sketch_bytes(sketch_size_kb); } /** diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index 75b95bf4a..06adaf78b 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -433,14 +433,14 @@ class hyperloglog_ref { /** * @brief Gets the number of bytes required for the sketch storage. * - * @param max_sketch_size_kb Upper bound sketch size in KB + * @param sketch_size_kb Upper bound sketch size in KB * * @return The number of bytes required for the sketch */ [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes( - cuco::sketch_size_kb max_sketch_size_kb) noexcept + cuco::sketch_size_kb sketch_size_kb) noexcept { - return cuda::std::bit_floor(static_cast(max_sketch_size_kb * 1024)); + return cuda::std::bit_floor(static_cast(sketch_size_kb * 1024)); } /** diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh index 9860d7246..863a23a83 100644 --- a/include/cuco/distinct_count_estimator.cuh +++ b/include/cuco/distinct_count_estimator.cuh @@ -62,15 +62,15 @@ class distinct_count_estimator { * * @note This function synchronizes the given stream. 
* - * @param max_sketch_size_kb Maximum sketch size in KB + * @param sketch_size_kb Maximum sketch size in KB * @param hash The hash function used to hash items * @param alloc Allocator used for allocating device storage * @param stream CUDA stream used to initialize the object */ - constexpr distinct_count_estimator(cuco::sketch_size_kb max_sketch_size_kb = 32_KB, - Hash const& hash = {}, - Allocator const& alloc = {}, - cuco::cuda_stream_ref stream = {}); + constexpr distinct_count_estimator(cuco::sketch_size_kb sketch_size_kb = 32_KB, + Hash const& hash = {}, + Allocator const& alloc = {}, + cuco::cuda_stream_ref stream = {}); ~distinct_count_estimator() = default; @@ -230,12 +230,12 @@ class distinct_count_estimator { /** * @brief Gets the number of bytes required for the sketch storage. * - * @param max_sketch_size_kb Upper bound sketch size in KB + * @param sketch_size_kb Upper bound sketch size in KB * * @return The number of bytes required for the sketch */ [[nodiscard]] static constexpr std::size_t sketch_bytes( - cuco::sketch_size_kb max_sketch_size_kb) noexcept; + cuco::sketch_size_kb sketch_size_kb) noexcept; /** * @brief Gets the alignment required for the sketch storage. diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh index feb2ac6d4..25f1834e3 100644 --- a/include/cuco/distinct_count_estimator_ref.cuh +++ b/include/cuco/distinct_count_estimator_ref.cuh @@ -211,12 +211,12 @@ class distinct_count_estimator_ref { /** * @brief Gets the number of bytes required for the sketch storage. * - * @param max_sketch_size_kb Upper bound sketch size in KB + * @param sketch_size_kb Upper bound sketch size in KB * * @return The number of bytes required for the sketch */ [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes( - cuco::sketch_size_kb max_sketch_size_kb) noexcept; + cuco::sketch_size_kb sketch_size_kb) noexcept; /** * @brief Gets the alignment required for the sketch storage. 
From aeaecf405c1bc475b13639b72968c586bddcbde5 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 20 Mar 2024 15:32:49 +0000 Subject: [PATCH 44/78] Improve error handling and docs --- .../distinct_count_estimator.inl | 4 +- .../distinct_count_estimator_ref.inl | 5 +-- .../cuco/detail/hyperloglog/hyperloglog.cuh | 15 ++++++- .../detail/hyperloglog/hyperloglog_ref.cuh | 39 ++++++++++++++++--- include/cuco/distinct_count_estimator.cuh | 15 ++++++- include/cuco/distinct_count_estimator_ref.cuh | 14 +++++-- 6 files changed, 74 insertions(+), 18 deletions(-) diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl index 21128de9e..9ea4816d5 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl @@ -60,7 +60,7 @@ template template void distinct_count_estimator::merge_async( distinct_count_estimator const& other, - cuco::cuda_stream_ref stream) noexcept + cuco::cuda_stream_ref stream) { this->impl_->merge_async(other, stream); } @@ -77,7 +77,7 @@ void distinct_count_estimator::merge( template template void distinct_count_estimator::merge_async( - ref_type const& other, cuco::cuda_stream_ref stream) noexcept + ref_type const& other, cuco::cuda_stream_ref stream) { this->impl_->merge_async(other, stream); } diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl index 53beb5016..3be39ac44 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl @@ -71,7 +71,7 @@ __host__ void distinct_count_estimator_ref::add(InputIt first, template template __device__ void 
distinct_count_estimator_ref::merge( - CG const& group, distinct_count_estimator_ref const& other) noexcept + CG const& group, distinct_count_estimator_ref const& other) { this->impl_.merge(group, other.impl_); } @@ -79,8 +79,7 @@ __device__ void distinct_count_estimator_ref::merge( template template __host__ void distinct_count_estimator_ref::merge_async( - distinct_count_estimator_ref const& other, - cuco::cuda_stream_ref stream) noexcept + distinct_count_estimator_ref const& other, cuco::cuda_stream_ref stream) { this->impl_.merge_async(other, stream); } diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh index 5c0d07833..23079e4db 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -59,6 +59,9 @@ class hyperloglog { * * @note This function synchronizes the given stream. * + * @throw If sketch size < 0.0625KB or 64B + * @throw If sketch storage has insufficient alignment + * * @param sketch_size_kb Maximum sketch size in KB * @param hash The hash function used to hash items * @param alloc Allocator used for allocating device storage @@ -146,6 +149,8 @@ class hyperloglog { /** * @brief Asynchronously merges the result of `other` estimator into `*this` estimator. * + * @throw If this->sketch_bytes() != other.sketch_bytes() + * * @tparam OtherScope Thread scope of `other` estimator * @tparam OtherAllocator Allocator type of `other` estimator * @@ -154,7 +159,7 @@ class hyperloglog { */ template void merge_async(hyperloglog const& other, - cuco::cuda_stream_ref stream) noexcept + cuco::cuda_stream_ref stream) { this->ref_.merge_async(other.ref(), stream); } @@ -165,6 +170,8 @@ class hyperloglog { * @note This function synchronizes the given stream. For asynchronous execution use * `merge_async`. 
* + * @throw If this->sketch_bytes() != other.sketch_bytes() + * * @tparam OtherScope Thread scope of `other` estimator * @tparam OtherAllocator Allocator type of `other` estimator * @@ -181,13 +188,15 @@ class hyperloglog { /** * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator. * + * @throw If this->sketch_bytes() != other.sketch_bytes() + * * @tparam OtherScope Thread scope of `other` estimator * * @param other Other estimator reference to be merged into `*this` * @param stream CUDA stream this operation is executed in */ template - void merge_async(ref_type const& other, cuco::cuda_stream_ref stream) noexcept + void merge_async(ref_type const& other, cuco::cuda_stream_ref stream) { this->ref_.merge_async(other, stream); } @@ -198,6 +207,8 @@ class hyperloglog { * @note This function synchronizes the given stream. For asynchronous execution use * `merge_async`. * + * @throw If this->sketch_bytes() != other.sketch_bytes() + * * @tparam OtherScope Thread scope of `other` estimator * * @param other Other estimator reference to be merged into `*this` diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index 06adaf78b..87d49d245 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -70,6 +70,9 @@ class hyperloglog_ref { /** * @brief Constructs a non-owning `hyperloglog_ref` object. 
* + * @throw If sketch size < 0.0625KB or 64B + * @throw If sketch storage has insufficient alignment + * * @param sketch_span Reference to sketch storage * @param hash The hash function used to hash items */ @@ -83,7 +86,24 @@ class hyperloglog_ref { sketch_{reinterpret_cast(sketch_span.data()), this->sketch_bytes() / sizeof(register_type)} { - // TODO check size and alignment + auto const alignment = + 1ull << cuda::std::countr_zero(reinterpret_cast(sketch_span.data())); + + if (alignment < sketch_alignment()) { +#ifdef __CUDA_ARCH__ + __trap(); +#else + CUCO_FAIL("Insufficient sketch alignment", std::runtime_error); +#endif + } + + if (this->precision_ < 4) { +#ifdef __CUDA_ARCH__ + __trap(); +#else + CUCO_FAIL("Minimum required sketch size is 0.0625KB or 64B", std::runtime_error); +#endif + } } /** @@ -252,6 +272,8 @@ class hyperloglog_ref { /** * @brief Merges the result of `other` estimator reference into `*this` estimator reference. * + * @throw If this->sketch_bytes() != other.sketch_bytes() + * * @tparam CG CUDA Cooperative Group type * @tparam OtherScope Thread scope of `other` estimator * @@ -259,11 +281,9 @@ class hyperloglog_ref { * @param other Other estimator reference to be merged into `*this` */ template - __device__ void merge(CG const& group, hyperloglog_ref const& other) noexcept + __device__ void merge(CG const& group, hyperloglog_ref const& other) { - if (other.precision_ != this->precision_) { - __trap(); // TODO check if this hurts performance - } + if (other.precision_ != this->precision_) { __trap(); } for (int i = group.thread_rank(); i < this->sketch_.size(); i += group.size()) { this->update_max(i, other.sketch_[i]); @@ -274,6 +294,8 @@ class hyperloglog_ref { * @brief Asynchronously merges the result of `other` estimator reference into `*this` * estimator. 
* + * @throw If this->sketch_bytes() != other.sketch_bytes() + * * @tparam OtherScope Thread scope of `other` estimator * * @param other Other estimator reference to be merged into `*this` @@ -281,8 +303,11 @@ class hyperloglog_ref { */ template __host__ void merge_async(hyperloglog_ref const& other, - cuco::cuda_stream_ref stream) noexcept + cuco::cuda_stream_ref stream) { + CUCO_EXPECTS(other.precision == this->precision_, + "Cannot merge estimators with different sketch sizes", + std::runtime_error); auto constexpr block_size = 1024; cuco::hyperloglog_ns::detail::merge<<<1, block_size, 0, stream>>>(other, *this); } @@ -293,6 +318,8 @@ class hyperloglog_ref { * @note This function synchronizes the given stream. For asynchronous execution use * `merge_async`. * + * @throw If this->sketch_bytes() != other.sketch_bytes() + * * @tparam OtherScope Thread scope of `other` estimator * * @param other Other estimator reference to be merged into `*this` diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh index 863a23a83..3cd22b469 100644 --- a/include/cuco/distinct_count_estimator.cuh +++ b/include/cuco/distinct_count_estimator.cuh @@ -62,6 +62,9 @@ class distinct_count_estimator { * * @note This function synchronizes the given stream. * + * @throw If sketch size < 0.0625KB or 64B + * @throw If sketch storage has insufficient alignment + * * @param sketch_size_kb Maximum sketch size in KB * @param hash The hash function used to hash items * @param alloc Allocator used for allocating device storage @@ -137,6 +140,8 @@ class distinct_count_estimator { /** * @brief Asynchronously merges the result of `other` estimator into `*this` estimator. 
* + * @throw If this->sketch_bytes() != other.sketch_bytes() + * * @tparam OtherScope Thread scope of `other` estimator * @tparam OtherAllocator Allocator type of `other` estimator * @@ -145,7 +150,7 @@ class distinct_count_estimator { */ template void merge_async(distinct_count_estimator const& other, - cuco::cuda_stream_ref stream = {}) noexcept; + cuco::cuda_stream_ref stream = {}); /** * @brief Merges the result of `other` estimator into `*this` estimator. @@ -153,6 +158,8 @@ class distinct_count_estimator { * @note This function synchronizes the given stream. For asynchronous execution use * `merge_async`. * + * @throw If this->sketch_bytes() != other.sketch_bytes() + * * @tparam OtherScope Thread scope of `other` estimator * @tparam OtherAllocator Allocator type of `other` estimator * @@ -166,13 +173,15 @@ class distinct_count_estimator { /** * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator. * + * @throw If this->sketch_bytes() != other.sketch_bytes() + * * @tparam OtherScope Thread scope of `other` estimator * * @param other Other estimator reference to be merged into `*this` * @param stream CUDA stream this operation is executed in */ template - void merge_async(ref_type const& other, cuco::cuda_stream_ref stream = {}) noexcept; + void merge_async(ref_type const& other, cuco::cuda_stream_ref stream = {}); /** * @brief Merges the result of `other` estimator reference into `*this` estimator. @@ -180,6 +189,8 @@ class distinct_count_estimator { * @note This function synchronizes the given stream. For asynchronous execution use * `merge_async`. 
* + * @throw If this->sketch_bytes() != other.sketch_bytes() + * * @tparam OtherScope Thread scope of `other` estimator * * @param other Other estimator reference to be merged into `*this` diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh index 25f1834e3..a0806cccf 100644 --- a/include/cuco/distinct_count_estimator_ref.cuh +++ b/include/cuco/distinct_count_estimator_ref.cuh @@ -51,10 +51,12 @@ class distinct_count_estimator_ref { using with_scope = distinct_count_estimator_ref; ///< Ref type with different ///< thread scope - // TODO let storage_type be inferred? /** * @brief Constructs a non-owning `distinct_count_estimator_ref` object. * + * @throw If sketch size < 0.0625KB or 64B + * @throw If sketch storage has insufficient alignment + * * @param sketch_span Reference to sketch storage * @param hash The hash function used to hash items */ @@ -129,6 +131,8 @@ class distinct_count_estimator_ref { /** * @brief Merges the result of `other` estimator reference into `*this` estimator reference. * + * @throw If this->sketch_bytes() != other.sketch_bytes() + * * @tparam CG CUDA Cooperative Group type * @tparam OtherScope Thread scope of `other` estimator * @@ -137,11 +141,13 @@ class distinct_count_estimator_ref { */ template __device__ void merge(CG const& group, - distinct_count_estimator_ref const& other) noexcept; + distinct_count_estimator_ref const& other); /** * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator. 
* + * @throw If this->sketch_bytes() != other.sketch_bytes() + * * @tparam OtherScope Thread scope of `other` estimator * * @param other Other estimator reference to be merged into `*this` @@ -149,11 +155,13 @@ class distinct_count_estimator_ref { */ template __host__ void merge_async(distinct_count_estimator_ref const& other, - cuco::cuda_stream_ref stream = {}) noexcept; + cuco::cuda_stream_ref stream = {}); /** * @brief Merges the result of `other` estimator reference into `*this` estimator. * + * @throw If this->sketch_bytes() != other.sketch_bytes() + * * @note This function synchronizes the given stream. For asynchronous execution use * `merge_async`. * From 55fa31205be9d968c86b514c101492cc65b5b724 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 20 Mar 2024 17:52:50 +0000 Subject: [PATCH 45/78] Cleanup finalizer --- include/cuco/detail/hyperloglog/finalizer.cuh | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/include/cuco/detail/hyperloglog/finalizer.cuh b/include/cuco/detail/hyperloglog/finalizer.cuh index f40a0e751..845453b0d 100644 --- a/include/cuco/detail/hyperloglog/finalizer.cuh +++ b/include/cuco/detail/hyperloglog/finalizer.cuh @@ -29,6 +29,7 @@ namespace cuco::hyperloglog_ns::detail { * * @note Variable names correspond to the definitions given in the HLL++ paper: * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf + * @note Previcion must be >= 4. * */ class finalizer { @@ -39,13 +40,10 @@ class finalizer { /** * @brief Contructs an HLL finalizer object. 
* - * @throws Iff precision vale is not supported - * * @param precision HLL precision parameter */ __host__ __device__ constexpr finalizer(int precision) : precision_{precision}, m_{1 << precision} { - // TODO check if precision >= 4 } /** @@ -58,7 +56,7 @@ class finalizer { */ __host__ __device__ constexpr std::size_t operator()(double z, int v) const noexcept { - auto e = alpha_mm() / z; + double e = this->alpha_mm() / z; if (v > 0) { // Use linear counting for small cardinality estimates. @@ -67,11 +65,11 @@ class finalizer { if (e <= 2.5 * this->m_) { return cuda::std::round(h); } if (this->precision_ < 19) { - e = (h <= threshold(this->precision_)) ? h : bias_corrected_estimate(e); + e = (h <= threshold(this->precision_)) ? h : this->bias_corrected_estimate(e); } } else { // HLL++ is defined only when p < 19, otherwise we need to fallback to HLL. - if (this->precision_ < 19) { e = bias_corrected_estimate(e); } + if (this->precision_ < 19) { e = this->bias_corrected_estimate(e); } } return cuda::std::round(e); @@ -80,32 +78,30 @@ class finalizer { private: __host__ __device__ constexpr double alpha_mm() const noexcept { - if (this->m_ == 16) { - return 0.673 * this->m_ * this->m_; - } else if (this->m_ == 32) { - return 0.697 * this->m_ * this->m_; - } else if (this->m_ == 64) { - return 0.709 * this->m_ * this->m_; - } else { - return (0.7213 / (1.0 + 1.079 / this->m_)) * this->m_ * this->m_; + double const m2 = this->m_ * this->m_; + switch (this->m_) { + case 16: return 0.673 * m2; + case 32: return 0.697 * m2; + case 64: return 0.709 * m2; + default: return (0.7213 / (1.0 + 1.079 / this->m_)) * m2; } } __host__ __device__ constexpr double bias_corrected_estimate(double e) const noexcept { - return (e < 5.0 * this->m_) ? e - bias(e) : e; + return (e < 5.0 * this->m_) ? 
e - this->bias(e) : e; } __host__ __device__ constexpr double bias(double e) const noexcept { - auto const anchor_index = interpolation_anchor_index(e); + auto const anchor_index = this->interpolation_anchor_index(e); int const n = raw_estimate_data_size(this->precision_); auto low = cuda::std::max(anchor_index - k + 1, 0); auto high = cuda::std::min(low + k, n); // Keep moving bounds as long as the (exclusive) high bound is closer to the estimate than // the lower (inclusive) bound. - while (high < n and distance(e, high) < distance(e, low)) { + while (high < n and this->distance(e, high) < this->distance(e, low)) { low += 1; high += 1; } From 2229c6844ec6a02097d2a84ca0444ae52f16af3a Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 20 Mar 2024 17:53:17 +0000 Subject: [PATCH 46/78] Use double reduction --- include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index 87d49d245..f601d0b3b 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -54,7 +54,7 @@ template class hyperloglog_ref { using register_type = int; ///< Register array storage // We use `int` here since this is the smallest type that supports native `atomicMax` on GPUs - using fp_type = float; ///< Floating point type used for reduction + using fp_type = double; ///< Floating point type used for reduction using hash_value_type = decltype(cuda::std::declval()(cuda::std::declval())); ///< Hash value type public: @@ -374,6 +374,7 @@ class hyperloglog_ref { auto const warp_zeroes = cooperative_groups::reduce(warp, thread_zeroes, cooperative_groups::plus()); // TODO warp sync needed? 
+ // TODO use invoke_one if (warp.thread_rank() == 0) { block_sum.fetch_add(warp_sum, cuda::std::memory_order_relaxed); block_zeroes.fetch_add(warp_zeroes, cuda::std::memory_order_relaxed); From 156a843f12fe57215c336d0ebd8dee449b886663 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 20 Mar 2024 17:56:46 +0000 Subject: [PATCH 47/78] Use .estimate() in device ref example --- .../device_ref_example.cu | 45 +++++++++++++++++-- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/examples/distinct_count_estimator/device_ref_example.cu b/examples/distinct_count_estimator/device_ref_example.cu index c8716e421..933726641 100644 --- a/examples/distinct_count_estimator/device_ref_example.cu +++ b/examples/distinct_count_estimator/device_ref_example.cu @@ -69,10 +69,10 @@ __global__ void piggyback_kernel(RefType ref, InputIt first, std::size_t n) block.sync(); // We can also compute the local estimate on the device - auto const local_estimate = local_ref.estimate(block); + // auto const local_estimate = local_ref.estimate(block); if (block.thread_rank() == 0) { // The local estimate should approximately be `num_items`/`gridDim.x` - printf("Estimate for block %d = %llu\n", blockIdx.x, local_estimate); + // printf("Estimate for block %d = %llu\n", blockIdx.x, local_estimate); } // In the end, we merge the shared memory estimator into the global estimator which gives us the @@ -80,10 +80,40 @@ __global__ void piggyback_kernel(RefType ref, InputIt first, std::size_t n) ref.merge(block, local_ref); } +template +__global__ void device_estimate_kernel(cuco::sketch_size_kb sketch_size_kb, + InputIt in, + size_t n, + OutputIt out) +{ + extern __shared__ std::byte local_sketch[]; + + auto const block = cooperative_groups::this_thread_block(); + + // only a single block computes the estimate + if (block.group_index().x == 0) { + Ref estimator(cuda::std::span(local_sketch, Ref::sketch_bytes(sketch_size_kb))); + + 
estimator.clear(block); + block.sync(); + + for (int i = block.thread_rank(); i < n; i += block.num_threads()) { + estimator.add(*(in + i)); + } + block.sync(); + // we can compute the final estimate on the device and return the result to the host + auto const estimate = estimator.estimate(block); + + if (block.thread_rank() == 0) { *out = estimate; } + } +} + int main(void) { using T = int; + using estimator_type = cuco::distinct_count_estimator; constexpr std::size_t num_items = 1ull << 28; // 1GB + auto const sketch_size_kb = 32_KB; thrust::device_vector items(num_items); @@ -91,7 +121,7 @@ int main(void) thrust::sequence(items.begin(), items.end(), 0); // Initialize the estimator - cuco::distinct_count_estimator estimator; + estimator_type estimator(sketch_size_kb); // Add all items to the estimator estimator.add(items.begin(), items.end()); @@ -111,7 +141,14 @@ int main(void) // Calculate the cardinality estimate from the custom kernel std::size_t const estimated_cardinality_custom = estimator.estimate(); - if (estimated_cardinality_bulk == estimated_cardinality_custom) { + thrust::device_vector device_estimate(1); + device_estimate_kernel> + <<<1, 512, sketch_bytes>>>(sketch_size_kb, items.begin(), num_items, device_estimate.begin()); + + std::size_t const estimated_cardinality_device = device_estimate[0]; + + if (estimated_cardinality_custom == estimated_cardinality_bulk and + estimated_cardinality_device == estimated_cardinality_bulk) { std::cout << "Success! 
Cardinality estimates are identical" << std::endl; } From 80dde95525a6a5fcfcd6348d262182762e532e25 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 20 Mar 2024 18:00:56 +0000 Subject: [PATCH 48/78] Add device ref test --- tests/CMakeLists.txt | 3 +- .../device_ref_test.cu | 94 +++++++++++++++++++ 2 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 tests/distinct_count_estimator/device_ref_test.cu diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 496a014f3..5110c2cbd 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -118,4 +118,5 @@ ConfigureTest(DYNAMIC_BITSET_TEST ################################################################################################### # - distinct_count_estimator ---------------------------------------------------------------------- ConfigureTest(DISTINCT_COUNT_ESTIMATOR_TEST - distinct_count_estimator/unique_sequence_test.cu) + distinct_count_estimator/unique_sequence_test.cu + distinct_count_estimator/device_ref_test.cu) diff --git a/tests/distinct_count_estimator/device_ref_test.cu b/tests/distinct_count_estimator/device_ref_test.cu new file mode 100644 index 000000000..33fe8993f --- /dev/null +++ b/tests/distinct_count_estimator/device_ref_test.cu @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include + +template +__global__ void estimate_kernel(cuco::sketch_size_kb sketch_size_kb, + InputIt in, + size_t n, + OutputIt out) +{ + extern __shared__ std::byte local_sketch[]; + + auto const block = cooperative_groups::this_thread_block(); + + // only a single block computes the estimate + if (block.group_index().x == 0) { + Ref estimator(cuda::std::span(local_sketch, Ref::sketch_bytes(sketch_size_kb))); + + estimator.clear(block); + block.sync(); + + for (int i = block.thread_rank(); i < n; i += block.num_threads()) { + estimator.add(*(in + i)); + } + block.sync(); + auto const estimate = estimator.estimate(block); + if (block.thread_rank() == 0) { *out = estimate; } + } +} + +TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: device ref", + "", + ((typename T, typename Hash), T, Hash), + (int32_t, cuco::xxhash_64), + (int64_t, cuco::xxhash_64), + (__int128_t, cuco::xxhash_64<__int128_t>)) +{ + using estimator_type = cuco::distinct_count_estimator; + + auto num_items_pow2 = GENERATE(25, 26, 28); + auto hll_precision = GENERATE(8, 10, 12, 13); + auto sketch_size_kb = 4 * (1ull << hll_precision) / 1024; + INFO("hll_precision=" << hll_precision); + INFO("sketch_size_kb=" << sketch_size_kb); + INFO("num_items=2^" << num_items_pow2); + auto num_items = 1ull << num_items_pow2; + + thrust::device_vector items(num_items); + + // Generate `num_items` distinct items + thrust::sequence(items.begin(), items.end(), 0); + + // Initialize the estimator + estimator_type estimator{cuco::sketch_size_kb(sketch_size_kb)}; + + // Add all items to the estimator + estimator.add(items.begin(), items.end()); + + auto const host_estimate = estimator.estimate(); + + thrust::device_vector device_estimate(1); + estimate_kernel> + <<<1, 512, estimator.sketch_bytes()>>>( + cuco::sketch_size_kb(sketch_size_kb), items.begin(), num_items, device_estimate.begin()); + + 
REQUIRE(device_estimate[0] == host_estimate); +} From 730bf73c1fbb2d3dc265d5d1f7698d12ae839c05 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 20 Mar 2024 22:09:24 +0000 Subject: [PATCH 49/78] Restructure to reduce fp error --- include/cuco/detail/hyperloglog/finalizer.cuh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/include/cuco/detail/hyperloglog/finalizer.cuh b/include/cuco/detail/hyperloglog/finalizer.cuh index 845453b0d..705464ac0 100644 --- a/include/cuco/detail/hyperloglog/finalizer.cuh +++ b/include/cuco/detail/hyperloglog/finalizer.cuh @@ -78,12 +78,11 @@ class finalizer { private: __host__ __device__ constexpr double alpha_mm() const noexcept { - double const m2 = this->m_ * this->m_; switch (this->m_) { - case 16: return 0.673 * m2; - case 32: return 0.697 * m2; - case 64: return 0.709 * m2; - default: return (0.7213 / (1.0 + 1.079 / this->m_)) * m2; + case 16: return 0.673 * this->m_ * this->m_; + case 32: return 0.697 * this->m_ * this->m_; + case 64: return 0.709 * this->m_ * this->m_; + default: return (0.7213 / (1.0 + 1.079 / this->m_)) * this->m_ * this->m_; } } From c50e7954662b118b57f8a8df2d18272b1549ec57 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 20 Mar 2024 22:16:44 +0000 Subject: [PATCH 50/78] Rename parameter for other estimator ref --- .../distinct_count_estimator.inl | 10 +++++----- include/cuco/detail/hyperloglog/hyperloglog.cuh | 12 ++++++------ include/cuco/distinct_count_estimator.cuh | 8 ++++---- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl index 9ea4816d5..5b105af5c 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl +++ 
b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl @@ -77,17 +77,17 @@ void distinct_count_estimator::merge( template template void distinct_count_estimator::merge_async( - ref_type const& other, cuco::cuda_stream_ref stream) + ref_type const& other_ref, cuco::cuda_stream_ref stream) { - this->impl_->merge_async(other, stream); + this->impl_->merge_async(other_ref, stream); } template template -void distinct_count_estimator::merge(ref_type const& other, - cuco::cuda_stream_ref stream) +void distinct_count_estimator::merge( + ref_type const& other_ref, cuco::cuda_stream_ref stream) { - this->impl_->merge(other, stream); + this->impl_->merge(other_ref, stream); } template diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh index 23079e4db..b07b5e83b 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -192,13 +192,13 @@ class hyperloglog { * * @tparam OtherScope Thread scope of `other` estimator * - * @param other Other estimator reference to be merged into `*this` + * @param other_ref Other estimator reference to be merged into `*this` * @param stream CUDA stream this operation is executed in */ template - void merge_async(ref_type const& other, cuco::cuda_stream_ref stream) + void merge_async(ref_type const& other_ref, cuco::cuda_stream_ref stream) { - this->ref_.merge_async(other, stream); + this->ref_.merge_async(other_ref, stream); } /** @@ -211,13 +211,13 @@ class hyperloglog { * * @tparam OtherScope Thread scope of `other` estimator * - * @param other Other estimator reference to be merged into `*this` + * @param other_ref Other estimator reference to be merged into `*this` * @param stream CUDA stream this operation is executed in */ template - void merge(ref_type const& other, cuco::cuda_stream_ref stream) + void merge(ref_type const& other_ref, cuco::cuda_stream_ref stream) { - this->ref_.merge(other, stream); + 
this->ref_.merge(other_ref, stream); } /** diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh index 3cd22b469..8e4cc097a 100644 --- a/include/cuco/distinct_count_estimator.cuh +++ b/include/cuco/distinct_count_estimator.cuh @@ -177,11 +177,11 @@ class distinct_count_estimator { * * @tparam OtherScope Thread scope of `other` estimator * - * @param other Other estimator reference to be merged into `*this` + * @param other_ref Other estimator reference to be merged into `*this` * @param stream CUDA stream this operation is executed in */ template - void merge_async(ref_type const& other, cuco::cuda_stream_ref stream = {}); + void merge_async(ref_type const& other_ref, cuco::cuda_stream_ref stream = {}); /** * @brief Merges the result of `other` estimator reference into `*this` estimator. @@ -193,11 +193,11 @@ class distinct_count_estimator { * * @tparam OtherScope Thread scope of `other` estimator * - * @param other Other estimator reference to be merged into `*this` + * @param other_ref Other estimator reference to be merged into `*this` * @param stream CUDA stream this operation is executed in */ template - void merge(ref_type const& other, cuco::cuda_stream_ref stream = {}); + void merge(ref_type const& other_ref, cuco::cuda_stream_ref stream = {}); /** * @brief Compute the estimated distinct items count. 
From d5595dae09bda28954dd3c0d3bf7eb124cf3f052 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 20 Mar 2024 22:28:40 +0000 Subject: [PATCH 51/78] Update benchmark --- benchmarks/distinct_count_estimator_bench.cu | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu index c5ae8a6b3..f071fbf6b 100644 --- a/benchmarks/distinct_count_estimator_bench.cu +++ b/benchmarks/distinct_count_estimator_bench.cu @@ -49,7 +49,7 @@ template } template -[[nodiscard]] double relative_error(nvbench::state& state, std::size_t num_samples = 5) +[[nodiscard]] double relative_error(nvbench::state& state, std::size_t num_samples) { using T = typename Estimator::value_type; @@ -87,8 +87,9 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list(num_items, "InputSize"); - auto const err = relative_error(state); - auto& summ = state.add_summary("MeanRelativeError"); + auto const err_samples = (cuda::std::is_same_v) ? 
1 : 5; + auto const err = relative_error(state, err_samples); + auto& summ = state.add_summary("MeanRelativeError"); summ.set_string("hint", "MRelErr"); summ.set_string("short_name", "MeanRelativeError"); summ.set_string("description", "Mean relatve approximation error."); @@ -146,17 +147,19 @@ using ESTIMATOR_RANGE = nvbench::type_list>; NVBENCH_BENCH_TYPES(distinct_count_estimator_e2e, - NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list)) + NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list)) .set_name("distinct_count_estimator_e2e") .set_type_axes_names({"Estimator", "Distribution"}) .add_int64_power_of_two_axis("NumInputs", {28, 29, 30}) - .add_int64_axis("SketchSizeKB", {8, 16, 32}) + .add_int64_axis("SketchSizeKB", {8, 16, 32, 64, 128, 256}) // 256KB uses gmem fallback kernel + .add_int64_axis("Multiplicity", {1}) .set_max_noise(defaults::MAX_NOISE); NVBENCH_BENCH_TYPES(distinct_count_estimator_add, - NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list)) + NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list)) .set_name("distinct_count_estimator::add_async") .set_type_axes_names({"Estimator", "Distribution"}) .add_int64_power_of_two_axis("NumInputs", {28, 29, 30}) - .add_int64_axis("SketchSizeKB", {8, 16, 32, 256}) // 256KB uses gmem fallback kernel + .add_int64_axis("SketchSizeKB", {8, 16, 32, 64, 128, 256}) + .add_int64_axis("Multiplicity", {1}) .set_max_noise(defaults::MAX_NOISE); \ No newline at end of file From 16ad77a5efbb66bfa69d9d37dc6011a196436d6e Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 20 Mar 2024 22:43:39 +0000 Subject: [PATCH 52/78] Rebind allocator to register_type to ensure proper alignment --- .../cuco/detail/hyperloglog/hyperloglog.cuh | 20 +++++++++++-------- .../detail/hyperloglog/hyperloglog_ref.cuh | 6 +++--- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh 
b/include/cuco/detail/hyperloglog/hyperloglog.cuh index b07b5e83b..3c962536b 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -48,11 +48,12 @@ class hyperloglog { using ref_type = hyperloglog_ref; ///< Non-owning reference ///< type - using value_type = typename ref_type<>::value_type; ///< Type of items to count - using hash_type = typename ref_type<>::hash_type; ///< Hash function type + using value_type = typename ref_type<>::value_type; ///< Type of items to count + using hash_type = typename ref_type<>::hash_type; ///< Hash function type + using register_type = typename ref_type<>::register_type; ///< HLL register type using allocator_type = - typename std::allocator_traits::template rebind_alloc; ///< Allocator - ///< type + typename std::allocator_traits::template rebind_alloc; ///< Allocator + ///< type /** * @brief Constructs a `hyperloglog` host object. @@ -72,9 +73,12 @@ class hyperloglog { Allocator const& alloc, cuco::cuda_stream_ref stream) : allocator_{alloc}, - deleter_{sketch_bytes(sketch_size_kb), this->allocator_}, - sketch_{this->allocator_.allocate(sketch_bytes(sketch_size_kb)), this->deleter_}, - ref_{cuda::std::span{this->sketch_.get(), sketch_bytes(sketch_size_kb)}, hash} + deleter_{sketch_bytes(sketch_size_kb) / sizeof(register_type), this->allocator_}, + sketch_{this->allocator_.allocate(sketch_bytes(sketch_size_kb) / sizeof(register_type)), + this->deleter_}, + ref_{cuda::std::span{reinterpret_cast(this->sketch_.get()), + sketch_bytes(sketch_size_kb)}, + hash} { this->ref_.clear_async(stream); } @@ -291,7 +295,7 @@ class hyperloglog { private: allocator_type allocator_; ///< Storage allocator custom_deleter deleter_; ///< Storage deleter - std::unique_ptr> + std::unique_ptr> sketch_; ///< Sketch storage ref_type<> ref_; //< Ref type diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index 
f601d0b3b..b85b3b5eb 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -52,7 +52,6 @@ namespace cuco::detail { */ template class hyperloglog_ref { - using register_type = int; ///< Register array storage // We use `int` here since this is the smallest type that supports native `atomicMax` on GPUs using fp_type = double; ///< Floating point type used for reduction using hash_value_type = @@ -60,8 +59,9 @@ class hyperloglog_ref { public: static constexpr auto thread_scope = Scope; ///< CUDA thread scope - using value_type = T; ///< Type of items to count - using hash_type = Hash; ///< Hash function type + using value_type = T; ///< Type of items to count + using hash_type = Hash; ///< Hash function type + using register_type = int; ///< HLL register type template using with_scope = hyperloglog_ref; ///< Ref type with different From b501a32b95488c291c5562dbdb23a50026cd5d02 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 20 Mar 2024 22:45:27 +0000 Subject: [PATCH 53/78] Use cudaMemcpyDefault --- include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index b85b3b5eb..c84f0c76d 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -411,7 +411,7 @@ class hyperloglog_ref { CUCO_CUDA_TRY(cudaMemcpyAsync(host_sketch.data(), this->sketch_.data(), sizeof(register_type) * num_regs, - cudaMemcpyDeviceToHost, + cudaMemcpyDefault, stream)); stream.synchronize(); From 0bf0a88104061c1c68a84abbe6bcc4cdcce6ab47 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 20 Mar 2024 22:53:49 +0000 Subject: [PATCH 54/78] Mention alignment requirements in device_ref_example --- 
examples/distinct_count_estimator/device_ref_example.cu | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/distinct_count_estimator/device_ref_example.cu b/examples/distinct_count_estimator/device_ref_example.cu index 933726641..92c5169d9 100644 --- a/examples/distinct_count_estimator/device_ref_example.cu +++ b/examples/distinct_count_estimator/device_ref_example.cu @@ -39,6 +39,14 @@ __global__ void piggyback_kernel(RefType ref, InputIt first, std::size_t n) // Shared memory storage for the block-local estimator extern __shared__ std::byte local_sketch[]; + // The following check is optional since the base address of dynamic shared memory is guaranteed + // to meet the alignment requirements + /* + auto const alignment = + 1ull << cuda::std::countr_zero(reinterpret_cast(local_sketch)); + assert(alignment >= local_ref_type::sketch_alignment()); + */ + auto const loop_stride = gridDim.x * blockDim.x; auto idx = blockDim.x * blockIdx.x + threadIdx.x; auto const block = cooperative_groups::this_thread_block(); From a36136054b9771b39ff3cffdc015c3490aade6e1 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 20 Mar 2024 23:47:18 +0000 Subject: [PATCH 55/78] Pass T instead of Estimator to benchmark --- benchmarks/distinct_count_estimator_bench.cu | 26 +++++++++----------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu index f071fbf6b..272cfea88 100644 --- a/benchmarks/distinct_count_estimator_bench.cu +++ b/benchmarks/distinct_count_estimator_bench.cu @@ -76,10 +76,10 @@ template /** * @brief A benchmark evaluating `cuco::distinct_count_estimator` end-to-end performance */ -template -void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list) +template +void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list) { - using T = typename Estimator::value_type; + 
using estimator_type = cuco::distinct_count_estimator; auto const num_items = state.get_int64("NumInputs"); auto const sketch_size_kb = state.get_int64("SketchSizeKB"); @@ -88,7 +88,7 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list(num_items, "InputSize"); auto const err_samples = (cuda::std::is_same_v) ? 1 : 5; - auto const err = relative_error(state, err_samples); + auto const err = relative_error(state, err_samples); auto& summ = state.add_summary("MeanRelativeError"); summ.set_string("hint", "MRelErr"); summ.set_string("short_name", "MeanRelativeError"); @@ -100,7 +100,7 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list(state), items.begin(), items.end()); - Estimator estimator{cuco::sketch_size_kb(sketch_size_kb)}; + estimator_type estimator{cuco::sketch_size_kb(sketch_size_kb)}; std::size_t estimated_cardinality = 0; state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { @@ -116,10 +116,10 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list -void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list) +template +void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list) { - using T = typename Estimator::value_type; + using estimator_type = cuco::distinct_count_estimator; auto const num_items = state.get_int64("NumInputs"); auto const sketch_size_kb = state.get_int64("SketchSizeKB"); @@ -132,7 +132,7 @@ void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list(num_items, "InputSize"); - Estimator estimator{cuco::sketch_size_kb(sketch_size_kb)}; + estimator_type estimator{cuco::sketch_size_kb(sketch_size_kb)}; state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { timer.start(); estimator.add_async(items.begin(), items.end(), {launch.get_stream()}); @@ -142,12 +142,10 @@ void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list, - 
cuco::distinct_count_estimator, - cuco::distinct_count_estimator<__int128_t>>; +using TYPE_RANGE = nvbench::type_list; NVBENCH_BENCH_TYPES(distinct_count_estimator_e2e, - NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list)) + NVBENCH_TYPE_AXES(TYPE_RANGE, nvbench::type_list)) .set_name("distinct_count_estimator_e2e") .set_type_axes_names({"Estimator", "Distribution"}) .add_int64_power_of_two_axis("NumInputs", {28, 29, 30}) @@ -156,7 +154,7 @@ NVBENCH_BENCH_TYPES(distinct_count_estimator_e2e, .set_max_noise(defaults::MAX_NOISE); NVBENCH_BENCH_TYPES(distinct_count_estimator_add, - NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list)) + NVBENCH_TYPE_AXES(TYPE_RANGE, nvbench::type_list)) .set_name("distinct_count_estimator::add_async") .set_type_axes_names({"Estimator", "Distribution"}) .add_int64_power_of_two_axis("NumInputs", {28, 29, 30}) From 66870a7fa6bb97799807c2aa3567a85eb7152ddc Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 20 Mar 2024 23:51:51 +0000 Subject: [PATCH 56/78] Fix typo in benchmark script --- benchmarks/distinct_count_estimator_bench.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu index 272cfea88..76a664eaa 100644 --- a/benchmarks/distinct_count_estimator_bench.cu +++ b/benchmarks/distinct_count_estimator_bench.cu @@ -147,7 +147,7 @@ using TYPE_RANGE = nvbench::type_list)) .set_name("distinct_count_estimator_e2e") - .set_type_axes_names({"Estimator", "Distribution"}) + .set_type_axes_names({"T", "Distribution"}) .add_int64_power_of_two_axis("NumInputs", {28, 29, 30}) .add_int64_axis("SketchSizeKB", {8, 16, 32, 64, 128, 256}) // 256KB uses gmem fallback kernel .add_int64_axis("Multiplicity", {1}) @@ -156,7 +156,7 @@ NVBENCH_BENCH_TYPES(distinct_count_estimator_e2e, NVBENCH_BENCH_TYPES(distinct_count_estimator_add, NVBENCH_TYPE_AXES(TYPE_RANGE, nvbench::type_list)) 
.set_name("distinct_count_estimator::add_async") - .set_type_axes_names({"Estimator", "Distribution"}) + .set_type_axes_names({"T", "Distribution"}) .add_int64_power_of_two_axis("NumInputs", {28, 29, 30}) .add_int64_axis("SketchSizeKB", {8, 16, 32, 64, 128, 256}) .add_int64_axis("Multiplicity", {1}) From b990dcae5b55caebe95d76509f18f9fb6a507fc3 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 21 Mar 2024 14:10:45 +0000 Subject: [PATCH 57/78] Rename hash function --- .../distinct_count_estimator/distinct_count_estimator.inl | 6 +++--- .../distinct_count_estimator_ref.inl | 5 +++-- include/cuco/detail/hyperloglog/hyperloglog.cuh | 4 ++-- include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 6 +++--- include/cuco/distinct_count_estimator.cuh | 3 ++- include/cuco/distinct_count_estimator_ref.cuh | 3 ++- 6 files changed, 15 insertions(+), 12 deletions(-) diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl index 5b105af5c..6538e1588 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl @@ -101,13 +101,13 @@ template typename distinct_count_estimator::ref_type<> distinct_count_estimator::ref() const noexcept { - return {this->sketch(), this->hash()}; + return {this->sketch(), this->hash_function()}; } template -auto distinct_count_estimator::hash() const noexcept +auto distinct_count_estimator::hash_function() const noexcept { - return this->impl_->hash(); + return this->impl_->hash_function(); } template diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl index 3be39ac44..535e40b32 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl +++ 
b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl @@ -107,9 +107,10 @@ __host__ std::size_t distinct_count_estimator_ref::estimate( } template -__host__ __device__ auto distinct_count_estimator_ref::hash() const noexcept +__host__ __device__ auto distinct_count_estimator_ref::hash_function() + const noexcept { - return this->impl_.hash(); + return this->impl_.hash_function(); } template diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh index 3c962536b..2b6ca738b 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -49,7 +49,7 @@ class hyperloglog { ///< type using value_type = typename ref_type<>::value_type; ///< Type of items to count - using hash_type = typename ref_type<>::hash_type; ///< Hash function type + using hasher = typename ref_type<>::hasher; ///< Hash function type using register_type = typename ref_type<>::register_type; ///< HLL register type using allocator_type = typename std::allocator_traits::template rebind_alloc; ///< Allocator @@ -250,7 +250,7 @@ class hyperloglog { * * @return The hash function */ - [[nodiscard]] auto hash() const noexcept { return this->ref_.hash(); } + [[nodiscard]] auto hash_function() const noexcept { return this->ref_.hash_function(); } /** * @brief Gets the span of the sketch. 
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index c84f0c76d..c8ace0f23 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -60,7 +60,7 @@ class hyperloglog_ref { static constexpr auto thread_scope = Scope; ///< CUDA thread scope using value_type = T; ///< Type of items to count - using hash_type = Hash; ///< Hash function type + using hasher = Hash; ///< Hash function type using register_type = int; ///< HLL register type template @@ -435,7 +435,7 @@ class hyperloglog_ref { * * @return The hash function */ - [[nodiscard]] __host__ __device__ auto hash() const noexcept { return this->hash_; } + [[nodiscard]] __host__ __device__ auto hash_function() const noexcept { return this->hash_; } /** * @brief Gets the span of the sketch. @@ -524,7 +524,7 @@ class hyperloglog_ref { shmem_bytes); } - hash_type hash_; ///< Hash function used to hash items + hasher hash_; ///< Hash function used to hash items int32_t precision_; ///< HLL precision parameter hash_value_type register_mask_; ///< Mask used to separate register index from count cuda::std::span sketch_; ///< HLL sketch storage diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh index 8e4cc097a..64ed5cc56 100644 --- a/include/cuco/distinct_count_estimator.cuh +++ b/include/cuco/distinct_count_estimator.cuh @@ -54,6 +54,7 @@ class distinct_count_estimator { ///< type using value_type = typename impl_type::value_type; ///< Type of items to count + using hasher = typename impl_type::hasher; ///< Type of hash function using allocator_type = typename impl_type::allocator_type; ///< Allocator type // TODO enable CTAD @@ -222,7 +223,7 @@ class distinct_count_estimator { * * @return The hash function */ - [[nodiscard]] auto hash() const noexcept; + [[nodiscard]] auto hash_function() const noexcept; /** * @brief Gets the span of the 
sketch. diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh index a0806cccf..f6ebfe94b 100644 --- a/include/cuco/distinct_count_estimator_ref.cuh +++ b/include/cuco/distinct_count_estimator_ref.cuh @@ -46,6 +46,7 @@ class distinct_count_estimator_ref { static constexpr auto thread_scope = impl_type::thread_scope; ///< CUDA thread scope using value_type = typename impl_type::value_type; ///< Type of items to count + using hasher = typename impl_type::hasher; ///< Type of hash function template using with_scope = distinct_count_estimator_ref; ///< Ref type with different @@ -200,7 +201,7 @@ class distinct_count_estimator_ref { * * @return The hash function */ - [[nodiscard]] __host__ __device__ auto hash() const noexcept; + [[nodiscard]] __host__ __device__ auto hash_function() const noexcept; /** * @brief Gets the span of the sketch. From c87309e9895ccc82e25b390787a81fed46c5ede0 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 21 Mar 2024 14:15:35 +0000 Subject: [PATCH 58/78] Use placement new to initialize sketch --- include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index c8ace0f23..ade4a8166 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -117,7 +117,7 @@ class hyperloglog_ref { __device__ void clear(CG const& group) noexcept { for (int i = group.thread_rank(); i < this->sketch_.size(); i += group.size()) { - this->sketch_[i] = 0; + new (&(this->sketch_[i])) register_type{}; } } From 03d4b41187cadfdf4d02a04ef3f0e9aced08df01 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 21 Mar 2024 14:19:21 +0000 Subject: [PATCH 59/78] Remove custom_deleter member --- 
include/cuco/detail/hyperloglog/hyperloglog.cuh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh index 2b6ca738b..38dff73f2 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -73,9 +73,9 @@ class hyperloglog { Allocator const& alloc, cuco::cuda_stream_ref stream) : allocator_{alloc}, - deleter_{sketch_bytes(sketch_size_kb) / sizeof(register_type), this->allocator_}, - sketch_{this->allocator_.allocate(sketch_bytes(sketch_size_kb) / sizeof(register_type)), - this->deleter_}, + sketch_{ + this->allocator_.allocate(sketch_bytes(sketch_size_kb) / sizeof(register_type)), + custom_deleter{sketch_bytes(sketch_size_kb) / sizeof(register_type), this->allocator_}}, ref_{cuda::std::span{reinterpret_cast(this->sketch_.get()), sketch_bytes(sketch_size_kb)}, hash} @@ -293,8 +293,7 @@ class hyperloglog { } private: - allocator_type allocator_; ///< Storage allocator - custom_deleter deleter_; ///< Storage deleter + allocator_type allocator_; ///< Storage allocator std::unique_ptr> sketch_; ///< Sketch storage ref_type<> ref_; //< Ref type From 7de06fb05c8da41af8a2b88a6b3ab1b2db3b46d6 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 21 Mar 2024 14:21:34 +0000 Subject: [PATCH 60/78] Rename sketch_size.hpp -> sktech_size.cuh --- include/cuco/detail/hyperloglog/hyperloglog.cuh | 2 +- include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 2 +- include/cuco/distinct_count_estimator.cuh | 2 +- include/cuco/distinct_count_estimator_ref.cuh | 2 +- include/cuco/{sketch_size.hpp => sketch_size.cuh} | 0 5 files changed, 4 insertions(+), 4 deletions(-) rename include/cuco/{sketch_size.hpp => sketch_size.cuh} (100%) diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh index 38dff73f2..a5c0a0e4a 
100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index ade4a8166..67c38f0b8 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh index 64ed5cc56..51fb82080 100644 --- a/include/cuco/distinct_count_estimator.cuh +++ b/include/cuco/distinct_count_estimator.cuh @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh index f6ebfe94b..d639310aa 100644 --- a/include/cuco/distinct_count_estimator_ref.cuh +++ b/include/cuco/distinct_count_estimator_ref.cuh @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/cuco/sketch_size.hpp b/include/cuco/sketch_size.cuh similarity index 100% rename from include/cuco/sketch_size.hpp rename to include/cuco/sketch_size.cuh From 185d3c4c213edde41da76401b6da48ea6c91923b Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 21 Mar 2024 14:24:19 +0000 Subject: [PATCH 61/78] Use std::abs --- benchmarks/distinct_count_estimator_bench.cu | 3 ++- examples/distinct_count_estimator/host_bulk_example.cu | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu index 76a664eaa..37c5f4f00 100644 --- a/benchmarks/distinct_count_estimator_bench.cu +++ b/benchmarks/distinct_count_estimator_bench.cu @@ -28,6 
+28,7 @@ #include +#include #include using namespace cuco::benchmark; @@ -66,7 +67,7 @@ template estimator.add(items.begin(), items.end()); double estimated_cardinality = estimator.estimate(); double true_cardinality = exact_distinct_count(items.begin(), num_items); - error_sum += abs(true_cardinality - estimated_cardinality) / true_cardinality; + error_sum += std::abs(true_cardinality - estimated_cardinality) / true_cardinality; estimator.clear(); } diff --git a/examples/distinct_count_estimator/host_bulk_example.cu b/examples/distinct_count_estimator/host_bulk_example.cu index add3cb626..96a46a8df 100644 --- a/examples/distinct_count_estimator/host_bulk_example.cu +++ b/examples/distinct_count_estimator/host_bulk_example.cu @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -49,7 +50,8 @@ int main(void) std::cout << "True cardinality: " << num_items << "\nEstimated cardinality: " << estimated_cardinality << "\nRelative error: " - << abs(static_cast(num_items) - static_cast(estimated_cardinality)) / + << std::abs(static_cast(num_items) - + static_cast(estimated_cardinality)) / num_items << std::endl; From 023d0809a683cf484a70950484f78480ff0a9c16 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 21 Mar 2024 14:28:09 +0000 Subject: [PATCH 62/78] Use std::vector instead of thrust::host_vector> --- include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index 67c38f0b8..ade97f7cb 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -26,7 +26,6 @@ #include #include -#include #include #include @@ -37,6 +36,7 @@ #include #include +#include namespace cuco::detail { @@ -405,7 +405,7 @@ class hyperloglog_ref { [[nodiscard]] __host__ std::size_t 
estimate(cuco::cuda_stream_ref stream) const { auto const num_regs = 1ull << this->precision_; - thrust::host_vector host_sketch(num_regs); + std::vector host_sketch(num_regs); // TODO check if storage is host accessible CUCO_CUDA_TRY(cudaMemcpyAsync(host_sketch.data(), From 53cdf376f161717734ffb34283057e9ba540e3c4 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 21 Mar 2024 14:30:57 +0000 Subject: [PATCH 63/78] Add note about shmem alignment --- include/cuco/detail/hyperloglog/kernels.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh index ba4ceb506..c04ad4617 100644 --- a/include/cuco/detail/hyperloglog/kernels.cuh +++ b/include/cuco/detail/hyperloglog/kernels.cuh @@ -44,7 +44,8 @@ CUCO_KERNEL void add_shmem_vectorized(typename RefType::value_type const* first, using vector_type = cuda::std::array; using local_ref_type = typename RefType::with_scope; - // TODO assert alignment + // Base address of dynamic shared memory is guaranteed to be aligned to at least 16 bytes which is + // sufficient for this purpose extern __shared__ std::byte local_sketch[]; auto const loop_stride = cuco::detail::grid_stride(); From 2a81714e7d7e1bfa521eeeaa0045804791eece14 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 21 Mar 2024 14:32:05 +0000 Subject: [PATCH 64/78] Remove comment --- include/cuco/distinct_count_estimator.cuh | 1 - 1 file changed, 1 deletion(-) diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh index 51fb82080..d7318d26a 100644 --- a/include/cuco/distinct_count_estimator.cuh +++ b/include/cuco/distinct_count_estimator.cuh @@ -82,7 +82,6 @@ class distinct_count_estimator { distinct_count_estimator& operator=(distinct_count_estimator const&) = delete; distinct_count_estimator(distinct_count_estimator&&) 
= default; ///< Move constructor - // TODO this is somehow required to pass the Doxygen check. /** * @brief Copy-assignment operator. * From d859b39e7aa302de7e2207f273c0990e27b17a88 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 21 Mar 2024 15:19:03 +0000 Subject: [PATCH 65/78] Remove device-sided error handling since it hurts performance --- .../detail/hyperloglog/hyperloglog_ref.cuh | 34 ++++++++----------- include/cuco/distinct_count_estimator_ref.cuh | 7 ++-- 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index ade97f7cb..7186d785b 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -35,6 +35,7 @@ #include #include +#include // there is no #include #include @@ -70,8 +71,9 @@ class hyperloglog_ref { /** * @brief Constructs a non-owning `hyperloglog_ref` object. * - * @throw If sketch size < 0.0625KB or 64B - * @throw If sketch storage has insufficient alignment + * @throw If sketch size < 0.0625KB or 64B. Throws if called from host; UB if called from device. + * @throw If sketch storage has insufficient alignment. Throws if called from host; UB if called. + * from device. 
* * @param sketch_span Reference to sketch storage * @param hash The hash function used to hash items @@ -86,24 +88,15 @@ class hyperloglog_ref { sketch_{reinterpret_cast(sketch_span.data()), this->sketch_bytes() / sizeof(register_type)} { +#ifndef __CUDA_ARCH__ auto const alignment = 1ull << cuda::std::countr_zero(reinterpret_cast(sketch_span.data())); + CUCO_EXPECTS( + alignment >= sketch_alignment(), "Insufficient sketch alignment", std::runtime_error); - if (alignment < sketch_alignment()) { -#ifdef __CUDA_ARCH__ - __trap(); -#else - CUCO_FAIL("Insufficient sketch alignment", std::runtime_error); + CUCO_EXPECTS( + this->precision_ >= 4, "Minimum required sketch size is 0.0625KB or 64B", std::runtime_error); #endif - } - - if (this->precision_ < 4) { -#ifdef __CUDA_ARCH__ - __trap(); -#else - CUCO_FAIL("Minimum required sketch size is 0.0625KB or 64B", std::runtime_error); -#endif - } } /** @@ -272,7 +265,7 @@ class hyperloglog_ref { /** * @brief Merges the result of `other` estimator reference into `*this` estimator reference. 
* - * @throw If this->sketch_bytes() != other.sketch_bytes() + * @throw If this->sketch_bytes() != other.sketch_bytes() then behavior is undefined * * @tparam CG CUDA Cooperative Group type * @tparam OtherScope Thread scope of `other` estimator @@ -283,7 +276,8 @@ class hyperloglog_ref { template __device__ void merge(CG const& group, hyperloglog_ref const& other) { - if (other.precision_ != this->precision_) { __trap(); } + // TODO find a better way to do error handling in device code + // if (other.precision_ != this->precision_) { __trap(); } for (int i = group.thread_rank(); i < this->sketch_.size(); i += group.size()) { this->update_max(i, other.sketch_[i]); @@ -468,7 +462,9 @@ class hyperloglog_ref { [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes( cuco::sketch_size_kb sketch_size_kb) noexcept { - return cuda::std::bit_floor(static_cast(sketch_size_kb * 1024)); + // minimum precision is 4 or 64 bytes + return std::max(static_cast(sizeof(register_type) * 1ull << 4), + cuda::std::bit_floor(static_cast(sketch_size_kb * 1024))); } /** diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh index d639310aa..bc0b9da61 100644 --- a/include/cuco/distinct_count_estimator_ref.cuh +++ b/include/cuco/distinct_count_estimator_ref.cuh @@ -55,8 +55,9 @@ class distinct_count_estimator_ref { /** * @brief Constructs a non-owning `distinct_count_estimator_ref` object. * - * @throw If sketch size < 0.0625KB or 64B - * @throw If sketch storage has insufficient alignment + * @throw If sketch size < 0.0625KB or 64B. Throws if called from host; UB if called from device. + * @throw If sketch storage has insufficient alignment. Throws if called from host; UB if called + * from device. 
* * @param sketch_span Reference to sketch storage * @param hash The hash function used to hash items @@ -132,7 +133,7 @@ class distinct_count_estimator_ref { /** * @brief Merges the result of `other` estimator reference into `*this` estimator reference. * - * @throw If this->sketch_bytes() != other.sketch_bytes() + * @throw If this->sketch_bytes() != other.sketch_bytes() then behavior is undefined * * @tparam CG CUDA Cooperative Group type * @tparam OtherScope Thread scope of `other` estimator From 43be0f000828ee17602e9202a30f5294d794a4b1 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 21 Mar 2024 23:46:19 +0000 Subject: [PATCH 66/78] Constexpr all the things! --- .../distinct_count_estimator.inl | 28 +++++++-------- .../distinct_count_estimator_ref.inl | 32 ++++++++--------- .../cuco/detail/hyperloglog/hyperloglog.cuh | 34 +++++++++++-------- .../detail/hyperloglog/hyperloglog_ref.cuh | 34 +++++++++++-------- include/cuco/distinct_count_estimator.cuh | 30 ++++++++-------- include/cuco/distinct_count_estimator_ref.cuh | 31 +++++++++-------- 6 files changed, 101 insertions(+), 88 deletions(-) diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl index 6538e1588..5454165a6 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl @@ -27,21 +27,22 @@ constexpr distinct_count_estimator::distinct_count_es } template -void distinct_count_estimator::clear_async( +constexpr void distinct_count_estimator::clear_async( cuco::cuda_stream_ref stream) noexcept { this->impl_->clear_async(stream); } template -void distinct_count_estimator::clear(cuco::cuda_stream_ref stream) +constexpr void distinct_count_estimator::clear( + cuco::cuda_stream_ref stream) { this->impl_->clear(stream); } template template -void 
distinct_count_estimator::add_async( +constexpr void distinct_count_estimator::add_async( InputIt first, InputIt last, cuco::cuda_stream_ref stream) noexcept { this->impl_->add_async(first, last, stream); @@ -49,16 +50,15 @@ void distinct_count_estimator::add_async( template template -void distinct_count_estimator::add(InputIt first, - InputIt last, - cuco::cuda_stream_ref stream) +constexpr void distinct_count_estimator::add( + InputIt first, InputIt last, cuco::cuda_stream_ref stream) { this->impl_->add(first, last, stream); } template template -void distinct_count_estimator::merge_async( +constexpr void distinct_count_estimator::merge_async( distinct_count_estimator const& other, cuco::cuda_stream_ref stream) { @@ -67,7 +67,7 @@ void distinct_count_estimator::merge_async( template template -void distinct_count_estimator::merge( +constexpr void distinct_count_estimator::merge( distinct_count_estimator const& other, cuco::cuda_stream_ref stream) { @@ -76,7 +76,7 @@ void distinct_count_estimator::merge( template template -void distinct_count_estimator::merge_async( +constexpr void distinct_count_estimator::merge_async( ref_type const& other_ref, cuco::cuda_stream_ref stream) { this->impl_->merge_async(other_ref, stream); @@ -84,34 +84,34 @@ void distinct_count_estimator::merge_async( template template -void distinct_count_estimator::merge( +constexpr void distinct_count_estimator::merge( ref_type const& other_ref, cuco::cuda_stream_ref stream) { this->impl_->merge(other_ref, stream); } template -std::size_t distinct_count_estimator::estimate( +constexpr std::size_t distinct_count_estimator::estimate( cuco::cuda_stream_ref stream) const { return this->impl_->estimate(stream); } template -typename distinct_count_estimator::ref_type<> +constexpr typename distinct_count_estimator::ref_type<> distinct_count_estimator::ref() const noexcept { return {this->sketch(), this->hash_function()}; } template -auto distinct_count_estimator::hash_function() const noexcept 
+constexpr auto distinct_count_estimator::hash_function() const noexcept { return this->impl_->hash_function(); } template -cuda::std::span distinct_count_estimator::sketch() +constexpr cuda::std::span distinct_count_estimator::sketch() const noexcept { return this->impl_->sketch(); diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl index 535e40b32..a607c3ce9 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl @@ -26,51 +26,51 @@ __host__ template template -__device__ void distinct_count_estimator_ref::clear(CG const& group) noexcept +__device__ constexpr void distinct_count_estimator_ref::clear( + CG const& group) noexcept { this->impl_.clear(group); } template -__host__ void distinct_count_estimator_ref::clear_async( +__host__ constexpr void distinct_count_estimator_ref::clear_async( cuco::cuda_stream_ref stream) noexcept { this->impl_.clear_async(stream); } template -__host__ void distinct_count_estimator_ref::clear(cuco::cuda_stream_ref stream) +__host__ constexpr void distinct_count_estimator_ref::clear( + cuco::cuda_stream_ref stream) { this->impl_.clear(stream); } template -__device__ void distinct_count_estimator_ref::add(T const& item) noexcept +__device__ constexpr void distinct_count_estimator_ref::add(T const& item) noexcept { this->impl_.add(item); } template template -__host__ void distinct_count_estimator_ref::add_async(InputIt first, - InputIt last, - cuco::cuda_stream_ref stream) +__host__ constexpr void distinct_count_estimator_ref::add_async( + InputIt first, InputIt last, cuco::cuda_stream_ref stream) { this->impl_.add_async(first, last, stream); } template template -__host__ void distinct_count_estimator_ref::add(InputIt first, - InputIt last, - cuco::cuda_stream_ref stream) +__host__ constexpr void 
distinct_count_estimator_ref::add( + InputIt first, InputIt last, cuco::cuda_stream_ref stream) { this->impl_.add(first, last, stream); } template template -__device__ void distinct_count_estimator_ref::merge( +__device__ constexpr void distinct_count_estimator_ref::merge( CG const& group, distinct_count_estimator_ref const& other) { this->impl_.merge(group, other.impl_); @@ -78,7 +78,7 @@ __device__ void distinct_count_estimator_ref::merge( template template -__host__ void distinct_count_estimator_ref::merge_async( +__host__ constexpr void distinct_count_estimator_ref::merge_async( distinct_count_estimator_ref const& other, cuco::cuda_stream_ref stream) { this->impl_.merge_async(other, stream); @@ -86,7 +86,7 @@ __host__ void distinct_count_estimator_ref::merge_async( template template -__host__ void distinct_count_estimator_ref::merge( +__host__ constexpr void distinct_count_estimator_ref::merge( distinct_count_estimator_ref const& other, cuco::cuda_stream_ref stream) { this->impl_.merge(other, stream); @@ -100,21 +100,21 @@ __device__ std::size_t distinct_count_estimator_ref::estimate( } template -__host__ std::size_t distinct_count_estimator_ref::estimate( +__host__ constexpr std::size_t distinct_count_estimator_ref::estimate( cuco::cuda_stream_ref stream) const { return this->impl_.estimate(stream); } template -__host__ __device__ auto distinct_count_estimator_ref::hash_function() +__host__ __device__ constexpr auto distinct_count_estimator_ref::hash_function() const noexcept { return this->impl_.hash_function(); } template -__host__ __device__ cuda::std::span +__host__ __device__ constexpr cuda::std::span distinct_count_estimator_ref::sketch() const noexcept { return this->impl_.sketch(); diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh index a5c0a0e4a..1d24eb3fc 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -101,7 +101,10 @@ 
class hyperloglog { * * @param stream CUDA stream this operation is executed in */ - void clear_async(cuco::cuda_stream_ref stream) noexcept { this->ref_.clear_async(stream); } + constexpr void clear_async(cuco::cuda_stream_ref stream) noexcept + { + this->ref_.clear_async(stream); + } /** * @brief Resets the estimator, i.e., clears the current count estimate. @@ -111,7 +114,7 @@ class hyperloglog { * * @param stream CUDA stream this operation is executed in */ - void clear(cuco::cuda_stream_ref stream) { this->ref_.clear(stream); } + constexpr void clear(cuco::cuda_stream_ref stream) { this->ref_.clear(stream); } /** * @brief Asynchronously adds to be counted items to the estimator. @@ -125,7 +128,7 @@ class hyperloglog { * @param stream CUDA stream this operation is executed in */ template - void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream) + constexpr void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream) { this->ref_.add_async(first, last, stream); } @@ -145,7 +148,7 @@ class hyperloglog { * @param stream CUDA stream this operation is executed in */ template - void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream) + constexpr void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream) { this->ref_.add(first, last, stream); } @@ -162,8 +165,8 @@ class hyperloglog { * @param stream CUDA stream this operation is executed in */ template - void merge_async(hyperloglog const& other, - cuco::cuda_stream_ref stream) + constexpr void merge_async(hyperloglog const& other, + cuco::cuda_stream_ref stream) { this->ref_.merge_async(other.ref(), stream); } @@ -183,8 +186,8 @@ class hyperloglog { * @param stream CUDA stream this operation is executed in */ template - void merge(hyperloglog const& other, - cuco::cuda_stream_ref stream) + constexpr void merge(hyperloglog const& other, + cuco::cuda_stream_ref stream) { this->ref_.merge(other.ref(), stream); } @@ -200,7 +203,7 @@ class hyperloglog { * @param stream 
CUDA stream this operation is executed in */ template - void merge_async(ref_type const& other_ref, cuco::cuda_stream_ref stream) + constexpr void merge_async(ref_type const& other_ref, cuco::cuda_stream_ref stream) { this->ref_.merge_async(other_ref, stream); } @@ -219,7 +222,7 @@ class hyperloglog { * @param stream CUDA stream this operation is executed in */ template - void merge(ref_type const& other_ref, cuco::cuda_stream_ref stream) + constexpr void merge(ref_type const& other_ref, cuco::cuda_stream_ref stream) { this->ref_.merge(other_ref, stream); } @@ -233,7 +236,7 @@ class hyperloglog { * * @return Approximate distinct items count */ - [[nodiscard]] std::size_t estimate(cuco::cuda_stream_ref stream) const + [[nodiscard]] constexpr std::size_t estimate(cuco::cuda_stream_ref stream) const { return this->ref_.estimate(stream); } @@ -243,21 +246,24 @@ class hyperloglog { * * @return Device ref object of the current `distinct_count_estimator` host object */ - [[nodiscard]] ref_type<> ref() const noexcept { return this->ref_; } + [[nodiscard]] constexpr ref_type<> ref() const noexcept { return this->ref_; } /** * @brief Get hash function. * * @return The hash function */ - [[nodiscard]] auto hash_function() const noexcept { return this->ref_.hash_function(); } + [[nodiscard]] constexpr auto hash_function() const noexcept { return this->ref_.hash_function(); } /** * @brief Gets the span of the sketch. * * @return The cuda::std::span of the sketch */ - [[nodiscard]] cuda::std::span sketch() const noexcept { return this->ref_.sketch(); } + [[nodiscard]] constexpr cuda::std::span sketch() const noexcept + { + return this->ref_.sketch(); + } /** * @brief Gets the number of bytes required for the sketch storage. 
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index 7186d785b..10dc14273 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -107,7 +107,7 @@ class hyperloglog_ref { * @param group CUDA Cooperative group this operation is executed in */ template - __device__ void clear(CG const& group) noexcept + __device__ constexpr void clear(CG const& group) noexcept { for (int i = group.thread_rank(); i < this->sketch_.size(); i += group.size()) { new (&(this->sketch_[i])) register_type{}; @@ -122,7 +122,7 @@ class hyperloglog_ref { * * @param stream CUDA stream this operation is executed in */ - __host__ void clear(cuco::cuda_stream_ref stream) + __host__ constexpr void clear(cuco::cuda_stream_ref stream) { this->clear_async(stream); stream.synchronize(); @@ -133,7 +133,7 @@ class hyperloglog_ref { * * @param stream CUDA stream this operation is executed in */ - __host__ void clear_async(cuco::cuda_stream_ref stream) noexcept + __host__ constexpr void clear_async(cuco::cuda_stream_ref stream) noexcept { auto constexpr block_size = 1024; cuco::hyperloglog_ns::detail::clear<<<1, block_size, 0, stream>>>(*this); @@ -144,7 +144,7 @@ class hyperloglog_ref { * * @param item The item to be counted */ - __device__ void add(T const& item) noexcept + __device__ constexpr void add(T const& item) noexcept { auto const h = this->hash_(item); auto const reg = h & this->register_mask_; @@ -165,7 +165,7 @@ class hyperloglog_ref { * @param stream CUDA stream this operation is executed in */ template - __host__ void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream) + __host__ constexpr void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream) { auto const num_items = cuco::detail::distance(first, last); if (num_items == 0) { return; } @@ -256,7 +256,7 @@ class hyperloglog_ref { * @param stream CUDA stream this 
operation is executed in */ template - __host__ void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream) + __host__ constexpr void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream) { this->add_async(first, last, stream); stream.synchronize(); @@ -274,7 +274,8 @@ class hyperloglog_ref { * @param other Other estimator reference to be merged into `*this` */ template - __device__ void merge(CG const& group, hyperloglog_ref const& other) + __device__ constexpr void merge(CG const& group, + hyperloglog_ref const& other) { // TODO find a better way to do error handling in device code // if (other.precision_ != this->precision_) { __trap(); } @@ -296,8 +297,8 @@ class hyperloglog_ref { * @param stream CUDA stream this operation is executed in */ template - __host__ void merge_async(hyperloglog_ref const& other, - cuco::cuda_stream_ref stream) + __host__ constexpr void merge_async(hyperloglog_ref const& other, + cuco::cuda_stream_ref stream) { CUCO_EXPECTS(other.precision == this->precision_, "Cannot merge estimators with different sketch sizes", @@ -320,8 +321,8 @@ class hyperloglog_ref { * @param stream CUDA stream this operation is executed in */ template - __host__ void merge(hyperloglog_ref const& other, - cuco::cuda_stream_ref stream) + __host__ constexpr void merge(hyperloglog_ref const& other, + cuco::cuda_stream_ref stream) { this->merge_async(other, stream); stream.synchronize(); @@ -396,7 +397,7 @@ class hyperloglog_ref { * * @return Approximate distinct items count */ - [[nodiscard]] __host__ std::size_t estimate(cuco::cuda_stream_ref stream) const + [[nodiscard]] __host__ constexpr std::size_t estimate(cuco::cuda_stream_ref stream) const { auto const num_regs = 1ull << this->precision_; std::vector host_sketch(num_regs); @@ -429,14 +430,17 @@ class hyperloglog_ref { * * @return The hash function */ - [[nodiscard]] __host__ __device__ auto hash_function() const noexcept { return this->hash_; } + [[nodiscard]] __host__ __device__ constexpr 
auto hash_function() const noexcept + { + return this->hash_; + } /** * @brief Gets the span of the sketch. * * @return The cuda::std::span of the sketch */ - [[nodiscard]] __host__ __device__ cuda::std::span sketch() const noexcept + [[nodiscard]] __host__ __device__ constexpr cuda::std::span sketch() const noexcept { return cuda::std::span(reinterpret_cast(this->sketch_.data()), this->sketch_bytes()); @@ -447,7 +451,7 @@ class hyperloglog_ref { * * @return The number of bytes required for the sketch */ - [[nodiscard]] __host__ __device__ std::size_t sketch_bytes() const noexcept + [[nodiscard]] __host__ __device__ constexpr std::size_t sketch_bytes() const noexcept { return (1ull << this->precision_) * sizeof(register_type); } diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh index d7318d26a..011194ad5 100644 --- a/include/cuco/distinct_count_estimator.cuh +++ b/include/cuco/distinct_count_estimator.cuh @@ -94,7 +94,7 @@ class distinct_count_estimator { * * @param stream CUDA stream this operation is executed in */ - void clear_async(cuco::cuda_stream_ref stream = {}) noexcept; + constexpr void clear_async(cuco::cuda_stream_ref stream = {}) noexcept; /** * @brief Resets the estimator, i.e., clears the current count estimate. @@ -104,7 +104,7 @@ class distinct_count_estimator { * * @param stream CUDA stream this operation is executed in */ - void clear(cuco::cuda_stream_ref stream = {}); + constexpr void clear(cuco::cuda_stream_ref stream = {}); /** * @brief Asynchronously adds to be counted items to the estimator. @@ -118,7 +118,7 @@ class distinct_count_estimator { * @param stream CUDA stream this operation is executed in */ template - void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {}) noexcept; + constexpr void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {}) noexcept; /** * @brief Adds to be counted items to the estimator. 
@@ -135,7 +135,7 @@ class distinct_count_estimator { * @param stream CUDA stream this operation is executed in */ template - void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {}); + constexpr void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {}); /** * @brief Asynchronously merges the result of `other` estimator into `*this` estimator. @@ -149,8 +149,9 @@ class distinct_count_estimator { * @param stream CUDA stream this operation is executed in */ template - void merge_async(distinct_count_estimator const& other, - cuco::cuda_stream_ref stream = {}); + constexpr void merge_async( + distinct_count_estimator const& other, + cuco::cuda_stream_ref stream = {}); /** * @brief Merges the result of `other` estimator into `*this` estimator. @@ -167,8 +168,8 @@ class distinct_count_estimator { * @param stream CUDA stream this operation is executed in */ template - void merge(distinct_count_estimator const& other, - cuco::cuda_stream_ref stream = {}); + constexpr void merge(distinct_count_estimator const& other, + cuco::cuda_stream_ref stream = {}); /** * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator. @@ -181,7 +182,8 @@ class distinct_count_estimator { * @param stream CUDA stream this operation is executed in */ template - void merge_async(ref_type const& other_ref, cuco::cuda_stream_ref stream = {}); + constexpr void merge_async(ref_type const& other_ref, + cuco::cuda_stream_ref stream = {}); /** * @brief Merges the result of `other` estimator reference into `*this` estimator. @@ -197,7 +199,7 @@ class distinct_count_estimator { * @param stream CUDA stream this operation is executed in */ template - void merge(ref_type const& other_ref, cuco::cuda_stream_ref stream = {}); + constexpr void merge(ref_type const& other_ref, cuco::cuda_stream_ref stream = {}); /** * @brief Compute the estimated distinct items count. 
@@ -208,28 +210,28 @@ class distinct_count_estimator { * * @return Approximate distinct items count */ - [[nodiscard]] std::size_t estimate(cuco::cuda_stream_ref stream = {}) const; + [[nodiscard]] constexpr std::size_t estimate(cuco::cuda_stream_ref stream = {}) const; /** * @brief Get device ref. * * @return Device ref object of the current `distinct_count_estimator` host object */ - [[nodiscard]] ref_type<> ref() const noexcept; + [[nodiscard]] constexpr ref_type<> ref() const noexcept; /** * @brief Get hash function. * * @return The hash function */ - [[nodiscard]] auto hash_function() const noexcept; + [[nodiscard]] constexpr auto hash_function() const noexcept; /** * @brief Gets the span of the sketch. * * @return The cuda::std::span of the sketch */ - [[nodiscard]] cuda::std::span sketch() const noexcept; + [[nodiscard]] constexpr cuda::std::span sketch() const noexcept; /** * @brief Gets the number of bytes required for the sketch storage. diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh index bc0b9da61..74b60dbb3 100644 --- a/include/cuco/distinct_count_estimator_ref.cuh +++ b/include/cuco/distinct_count_estimator_ref.cuh @@ -73,14 +73,14 @@ class distinct_count_estimator_ref { * @param group CUDA Cooperative group this operation is executed in */ template - __device__ void clear(CG const& group) noexcept; + __device__ constexpr void clear(CG const& group) noexcept; /** * @brief Asynchronously resets the estimator, i.e., clears the current count estimate. * * @param stream CUDA stream this operation is executed in */ - __host__ void clear_async(cuco::cuda_stream_ref stream = {}) noexcept; + __host__ constexpr void clear_async(cuco::cuda_stream_ref stream = {}) noexcept; /** * @brief Resets the estimator, i.e., clears the current count estimate. 
@@ -90,14 +90,14 @@ class distinct_count_estimator_ref { * * @param stream CUDA stream this operation is executed in */ - __host__ void clear(cuco::cuda_stream_ref stream = {}); + __host__ constexpr void clear(cuco::cuda_stream_ref stream = {}); /** * @brief Adds an item to the estimator. * * @param item The item to be counted */ - __device__ void add(T const& item) noexcept; + __device__ constexpr void add(T const& item) noexcept; /** * @brief Asynchronously adds to be counted items to the estimator. @@ -111,7 +111,7 @@ class distinct_count_estimator_ref { * @param stream CUDA stream this operation is executed in */ template - __host__ void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {}); + __host__ constexpr void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {}); /** * @brief Adds to be counted items to the estimator. @@ -128,7 +128,7 @@ class distinct_count_estimator_ref { * @param stream CUDA stream this operation is executed in */ template - __host__ void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {}); + __host__ constexpr void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {}); /** * @brief Merges the result of `other` estimator reference into `*this` estimator reference. @@ -142,8 +142,8 @@ class distinct_count_estimator_ref { * @param other Other estimator reference to be merged into `*this` */ template - __device__ void merge(CG const& group, - distinct_count_estimator_ref const& other); + __device__ constexpr void merge(CG const& group, + distinct_count_estimator_ref const& other); /** * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator. 
@@ -156,8 +156,9 @@ class distinct_count_estimator_ref { * @param stream CUDA stream this operation is executed in */ template - __host__ void merge_async(distinct_count_estimator_ref const& other, - cuco::cuda_stream_ref stream = {}); + __host__ constexpr void merge_async( + distinct_count_estimator_ref const& other, + cuco::cuda_stream_ref stream = {}); /** * @brief Merges the result of `other` estimator reference into `*this` estimator. @@ -173,8 +174,8 @@ class distinct_count_estimator_ref { * @param stream CUDA stream this operation is executed in */ template - __host__ void merge(distinct_count_estimator_ref const& other, - cuco::cuda_stream_ref stream = {}); + __host__ constexpr void merge(distinct_count_estimator_ref const& other, + cuco::cuda_stream_ref stream = {}); /** * @brief Compute the estimated distinct items count. @@ -195,21 +196,21 @@ class distinct_count_estimator_ref { * * @return Approximate distinct items count */ - [[nodiscard]] __host__ std::size_t estimate(cuco::cuda_stream_ref stream = {}) const; + [[nodiscard]] __host__ constexpr std::size_t estimate(cuco::cuda_stream_ref stream = {}) const; /** * @brief Gets the hash function. * * @return The hash function */ - [[nodiscard]] __host__ __device__ auto hash_function() const noexcept; + [[nodiscard]] __host__ __device__ constexpr auto hash_function() const noexcept; /** * @brief Gets the span of the sketch. * * @return The cuda::std::span of the sketch */ - [[nodiscard]] __host__ __device__ cuda::std::span sketch() const noexcept; + [[nodiscard]] __host__ __device__ constexpr cuda::std::span sketch() const noexcept; /** * @brief Gets the number of bytes required for the sketch storage. 
From fbd6dab0066d535659d95cd6ae82058b557e0517 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 22 Mar 2024 00:47:26 +0000 Subject: [PATCH 67/78] Add constructor overload which takes the desired standard deviation --- .../host_bulk_example.cu | 6 +- .../distinct_count_estimator.inl | 17 +++++ .../distinct_count_estimator_ref.inl | 8 +++ .../cuco/detail/hyperloglog/hyperloglog.cuh | 66 ++++++++++++++++--- .../detail/hyperloglog/hyperloglog_ref.cuh | 29 +++++++- include/cuco/distinct_count_estimator.cuh | 29 +++++++- include/cuco/distinct_count_estimator_ref.cuh | 13 +++- include/cuco/standard_deviation.cuh | 45 +++++++++++++ 8 files changed, 197 insertions(+), 16 deletions(-) create mode 100644 include/cuco/standard_deviation.cuh diff --git a/examples/distinct_count_estimator/host_bulk_example.cu b/examples/distinct_count_estimator/host_bulk_example.cu index 96a46a8df..56ee90a42 100644 --- a/examples/distinct_count_estimator/host_bulk_example.cu +++ b/examples/distinct_count_estimator/host_bulk_example.cu @@ -36,8 +36,12 @@ int main(void) // Generate `num_items` distinct items thrust::sequence(items.begin(), items.end(), 0); + // We define the desired standard deviation of the approximation error + // 0.0122197 is the default value and corresponds to a 32KB sketch size + auto const sd = cuco::standard_deviation{0.0122197}; + // Initialize the estimator - cuco::distinct_count_estimator estimator; + cuco::distinct_count_estimator estimator{sd}; // Add all items to the estimator estimator.add(items.begin(), items.end()); diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl index 5454165a6..ed5d9792f 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl @@ -26,6 +26,16 @@ constexpr 
distinct_count_estimator::distinct_count_es { } +template +constexpr distinct_count_estimator::distinct_count_estimator( + cuco::standard_deviation standard_deviation, + Hash const& hash, + Allocator const& alloc, + cuco::cuda_stream_ref stream) + : impl_{std::make_unique(standard_deviation, hash, alloc, stream)} +{ +} + template constexpr void distinct_count_estimator::clear_async( cuco::cuda_stream_ref stream) noexcept @@ -130,6 +140,13 @@ constexpr size_t distinct_count_estimator::sketch_byt return impl_type::sketch_bytes(sketch_size_kb); } +template +constexpr size_t distinct_count_estimator::sketch_bytes( + cuco::standard_deviation standard_deviation) noexcept +{ + return impl_type::sketch_bytes(standard_deviation); +} + template constexpr size_t distinct_count_estimator::sketch_alignment() noexcept { diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl index a607c3ce9..97649d5bc 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl @@ -135,6 +135,14 @@ distinct_count_estimator_ref::sketch_bytes( return impl_type::sketch_bytes(sketch_size_kb); } +template +__host__ __device__ constexpr std::size_t +distinct_count_estimator_ref::sketch_bytes( + cuco::standard_deviation standard_deviation) noexcept +{ + return impl_type::sketch_bytes(standard_deviation); +} + template __host__ __device__ constexpr std::size_t distinct_count_estimator_ref::sketch_alignment() noexcept diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh index 1d24eb3fc..13106cc08 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -55,13 +56,34 @@ class hyperloglog { 
typename std::allocator_traits::template rebind_alloc; ///< Allocator ///< type + private: /** * @brief Constructs a `hyperloglog` host object. * * @note This function synchronizes the given stream. * - * @throw If sketch size < 0.0625KB or 64B - * @throw If sketch storage has insufficient alignment + * @param sketch_size_b Sketch size in bytes + * @param hash The hash function used to hash items + * @param alloc Allocator used for allocating device storage + * @param stream CUDA stream used to initialize the object + */ + constexpr hyperloglog(std::size_t sketch_size_b, + Hash const& hash, + Allocator const& alloc, + cuco::cuda_stream_ref stream) + : allocator_{alloc}, + sketch_{this->allocator_.allocate(sketch_size_b / sizeof(register_type)), + custom_deleter{sketch_size_b / sizeof(register_type), this->allocator_}}, + ref_{cuda::std::span{reinterpret_cast(this->sketch_.get()), sketch_size_b}, hash} + { + this->ref_.clear_async(stream); + } + + public: + /** + * @brief Constructs a `hyperloglog` host object. + * + * @note This function synchronizes the given stream. * * @param sketch_size_kb Maximum sketch size in KB * @param hash The hash function used to hash items @@ -72,15 +94,26 @@ class hyperloglog { Hash const& hash, Allocator const& alloc, cuco::cuda_stream_ref stream) - : allocator_{alloc}, - sketch_{ - this->allocator_.allocate(sketch_bytes(sketch_size_kb) / sizeof(register_type)), - custom_deleter{sketch_bytes(sketch_size_kb) / sizeof(register_type), this->allocator_}}, - ref_{cuda::std::span{reinterpret_cast(this->sketch_.get()), - sketch_bytes(sketch_size_kb)}, - hash} + : hyperloglog{sketch_bytes(sketch_size_kb), hash, alloc, stream} + { + } + + /** + * @brief Constructs a `hyperloglog` host object. + * + * @note This function synchronizes the given stream. 
+ * + * @param standard_deviation Desired standard deviation for the approximation error + * @param hash The hash function used to hash items + * @param alloc Allocator used for allocating device storage + * @param stream CUDA stream used to initialize the object + */ + constexpr hyperloglog(cuco::standard_deviation standard_deviation, + Hash const& hash, + Allocator const& alloc, + cuco::cuda_stream_ref stream) + : hyperloglog{sketch_bytes(standard_deviation), hash, alloc, stream} { - this->ref_.clear_async(stream); } ~hyperloglog() = default; @@ -288,6 +321,19 @@ class hyperloglog { return ref_type<>::sketch_bytes(sketch_size_kb); } + /** + * @brief Gets the number of bytes required for the sketch storage. + * + * @param standard_deviation Upper bound standard deviation for approximation error + * + * @return The number of bytes required for the sketch + */ + [[nodiscard]] static constexpr std::size_t sketch_bytes( + cuco::standard_deviation standard_deviation) noexcept + { + return ref_type<>::sketch_bytes(standard_deviation); + } + /** * @brief Gets the alignment required for the sketch storage. * diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index 10dc14273..99af829d2 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -71,7 +72,8 @@ class hyperloglog_ref { /** * @brief Constructs a non-owning `hyperloglog_ref` object. * - * @throw If sketch size < 0.0625KB or 64B. Throws if called from host; UB if called from device. + * @throw If sketch size < 0.0625KB or 64B or standard deviation > 0.2765. Throws if called from + * host; UB if called from device. * @throw If sketch storage has insufficient alignment. Throws if called from host; UB if called. * from device. 
* @@ -471,6 +473,31 @@ class hyperloglog_ref { cuda::std::bit_floor(static_cast(sketch_size_kb * 1024))); } + /** + * @brief Gets the number of bytes required for the sketch storage. + * + * @param standard_deviation Upper bound standard deviation for approximation error + * + * @return The number of bytes required for the sketch + */ + [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes( + cuco::standard_deviation standard_deviation) noexcept + { + // implementation taken from + // https://github.com/apache/spark/blob/6a27789ad7d59cd133653a49be0bb49729542abe/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/HyperLogLogPlusPlusHelper.scala#L43 + + // minimum precision is 4 or 64 bytes + auto const precision = std::max( + static_cast(4), + static_cast( + cuda::std::ceil(2.0 * cuda::std::log(1.106 / standard_deviation) / cuda::std::log(2.0)))); + + // inverse of this function (omitting the minimum precision constraint) is + // standard_deviation = 1.106 / exp((precision * log(2.0)) / 2.0) + + return sizeof(register_type) * (1ull << precision); + } + /** * @brief Gets the alignment required for the sketch storage. * diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh index 011194ad5..a43590a43 100644 --- a/include/cuco/distinct_count_estimator.cuh +++ b/include/cuco/distinct_count_estimator.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -63,9 +64,6 @@ class distinct_count_estimator { * * @note This function synchronizes the given stream. 
* - * @throw If sketch size < 0.0625KB or 64B - * @throw If sketch storage has insufficient alignment - * * @param sketch_size_kb Maximum sketch size in KB * @param hash The hash function used to hash items * @param alloc Allocator used for allocating device storage @@ -76,6 +74,21 @@ class distinct_count_estimator { Allocator const& alloc = {}, cuco::cuda_stream_ref stream = {}); + /** + * @brief Constructs a `distinct_count_estimator` host object. + * + * @note This function synchronizes the given stream. + * + * @param standard_deviation Desired standard deviation for the approximation error + * @param hash The hash function used to hash items + * @param alloc Allocator used for allocating device storage + * @param stream CUDA stream used to initialize the object + */ + constexpr distinct_count_estimator(cuco::standard_deviation standard_deviation, + Hash const& hash = {}, + Allocator const& alloc = {}, + cuco::cuda_stream_ref stream = {}); + ~distinct_count_estimator() = default; distinct_count_estimator(distinct_count_estimator const&) = delete; @@ -250,6 +263,16 @@ class distinct_count_estimator { [[nodiscard]] static constexpr std::size_t sketch_bytes( cuco::sketch_size_kb sketch_size_kb) noexcept; + /** + * @brief Gets the number of bytes required for the sketch storage. + * + * @param standard_deviation Upper bound standard deviation for approximation error + * + * @return The number of bytes required for the sketch + */ + [[nodiscard]] static constexpr std::size_t sketch_bytes( + cuco::standard_deviation standard_deviation) noexcept; + /** * @brief Gets the alignment required for the sketch storage. 
* diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh index 74b60dbb3..d2eb6ab58 100644 --- a/include/cuco/distinct_count_estimator_ref.cuh +++ b/include/cuco/distinct_count_estimator_ref.cuh @@ -55,7 +55,8 @@ class distinct_count_estimator_ref { /** * @brief Constructs a non-owning `distinct_count_estimator_ref` object. * - * @throw If sketch size < 0.0625KB or 64B. Throws if called from host; UB if called from device. + * @throw If sketch size < 0.0625KB or 64B or standard deviation > 0.2765. Throws if called from + * host; UB if called from device. * @throw If sketch storage has insufficient alignment. Throws if called from host; UB if called * from device. * @@ -229,6 +230,16 @@ class distinct_count_estimator_ref { [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes( cuco::sketch_size_kb sketch_size_kb) noexcept; + /** + * @brief Gets the number of bytes required for the sketch storage. + * + * @param standard_deviation Upper bound standard deviation for approximation error + * + * @return The number of bytes required for the sketch + */ + [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes( + cuco::standard_deviation standard_deviation) noexcept; + /** * @brief Gets the alignment required for the sketch storage. * diff --git a/include/cuco/standard_deviation.cuh b/include/cuco/standard_deviation.cuh new file mode 100644 index 000000000..9486784b9 --- /dev/null +++ b/include/cuco/standard_deviation.cuh @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +namespace cuco { + +/** + * @brief Strong type for specifying the desired standard deviation of + * cuco::distinct_count_estimator(_ref). + */ +class standard_deviation { + public: + /** + * @brief Constructs a standard_deviation object. + * + * @param value The desired standard deviation + */ + __host__ __device__ explicit constexpr standard_deviation(double value) noexcept : value_{value} + { + } + + /** + * @brief Conversion to value type. + * + * @return Standard deviation + */ + __host__ __device__ constexpr operator double() const noexcept { return this->value_; } + + private: + double value_; ///< Standard deviation +}; +} // namespace cuco \ No newline at end of file From dfe1a070cdc643dcfd03a446f272c0619bfdbda5 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 22 Mar 2024 00:49:17 +0000 Subject: [PATCH 68/78] Remove stray include --- benchmarks/utils.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/utils.hpp b/benchmarks/utils.hpp index 97ca4988f..392cafe06 100644 --- a/benchmarks/utils.hpp +++ b/benchmarks/utils.hpp @@ -21,8 +21,6 @@ #include -#include // thread_scope - namespace cuco::benchmark { template From 2629adc95346b45d62c7b192c0c24f3b5c0a319c Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Sat, 23 Mar 2024 02:04:43 +0000 Subject: [PATCH 69/78] Bugfixes --- .../detail/hyperloglog/hyperloglog_ref.cuh | 34 +++++++++++-------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git 
a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index 99af829d2..1abb6d38e 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -84,7 +84,7 @@ class hyperloglog_ref { Hash const& hash) : hash_{hash}, precision_{cuda::std::countr_zero( - sketch_bytes(cuco::sketch_size_kb(static_cast(sketch_span.size() / 1024))) / + sketch_bytes(cuco::sketch_size_kb(static_cast(sketch_span.size() / 1024.0))) / sizeof(register_type))}, register_mask_{(1ull << this->precision_) - 1}, sketch_{reinterpret_cast(sketch_span.data()), @@ -152,6 +152,10 @@ class hyperloglog_ref { auto const reg = h & this->register_mask_; auto const zeroes = cuda::std::countl_zero(h | this->register_mask_) + 1; // __clz + // reversed order (same one as Spark uses) + // auto const reg = h >> ((sizeof(hash_value_type) * 8) - this->precision_); + // auto const zeroes = cuda::std::countl_zero(h << this->precision_) + 1; + this->update_max(reg, zeroes); } @@ -207,19 +211,21 @@ class hyperloglog_ref { } if (kernel != nullptr and this->try_reserve_shmem(kernel, shmem_bytes)) { - // We make use of the occupancy calculator to get the minimum number of blocks which still - // saturates the GPU. This reduces the shmem initialization overhead and atomic contention on - // the final register array during the merge phase. 
- CUCO_CUDA_TRY( - cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, shmem_bytes)); - - auto const ptr = thrust::raw_pointer_cast(&first[0]); - void* kernel_args[] = { - (void*)(&ptr), // TODO can't use reinterpret_cast since it can't cast away const - (void*)(&num_items), - reinterpret_cast(this)}; - CUCO_CUDA_TRY( - cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream)); + if constexpr (thrust::is_contiguous_iterator_v) { + // We make use of the occupancy calculator to get the minimum number of blocks which still + // saturates the GPU. This reduces the shmem initialization overhead and atomic contention + // on the final register array during the merge phase. + CUCO_CUDA_TRY( + cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, shmem_bytes)); + + auto const ptr = thrust::raw_pointer_cast(&first[0]); + void* kernel_args[] = { + (void*)(&ptr), // TODO can't use reinterpret_cast since it can't cast away const + (void*)(&num_items), + reinterpret_cast(this)}; + CUCO_CUDA_TRY( + cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream)); + } } else { kernel = reinterpret_cast( cuco::hyperloglog_ns::detail::add_shmem); From 1ad97e27fc26d2e898d6d092957e3b1ffd922ec0 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 27 Mar 2024 15:19:29 +0000 Subject: [PATCH 70/78] Fix merge --- .../distinct_count_estimator/distinct_count_estimator.inl | 8 ++++---- include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl index ed5d9792f..2d5ad3a47 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl @@ -72,7 +72,7 @@ constexpr 
void distinct_count_estimator::merge_async( distinct_count_estimator const& other, cuco::cuda_stream_ref stream) { - this->impl_->merge_async(other, stream); + this->impl_->merge_async(*(other.impl_), stream); } template @@ -81,7 +81,7 @@ constexpr void distinct_count_estimator::merge( distinct_count_estimator const& other, cuco::cuda_stream_ref stream) { - this->impl_->merge(other, stream); + this->impl_->merge(*(other.impl_), stream); } template @@ -89,7 +89,7 @@ template constexpr void distinct_count_estimator::merge_async( ref_type const& other_ref, cuco::cuda_stream_ref stream) { - this->impl_->merge_async(other_ref, stream); + this->impl_->merge_async(other_ref.impl_, stream); } template @@ -97,7 +97,7 @@ template constexpr void distinct_count_estimator::merge( ref_type const& other_ref, cuco::cuda_stream_ref stream) { - this->impl_->merge(other_ref, stream); + this->impl_->merge(other_ref.impl_, stream); } template diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index 1abb6d38e..dfd2df499 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -308,7 +308,7 @@ class hyperloglog_ref { __host__ constexpr void merge_async(hyperloglog_ref const& other, cuco::cuda_stream_ref stream) { - CUCO_EXPECTS(other.precision == this->precision_, + CUCO_EXPECTS(other.precision_ == this->precision_, "Cannot merge estimators with different sketch sizes", std::runtime_error); auto constexpr block_size = 1024; From 3b0da20b25e4e95ba9e308b8145fd975fae189f7 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 27 Mar 2024 15:27:54 +0000 Subject: [PATCH 71/78] Add Spark parity tests --- tests/CMakeLists.txt | 1 + .../spark_parity_test.cu | 187 ++++++++++++++++++ 2 files changed, 188 insertions(+) create mode 100644 tests/distinct_count_estimator/spark_parity_test.cu diff --git 
a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 0acaae3c4..a37f2d4e2 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -120,4 +120,5 @@ ConfigureTest(DYNAMIC_BITSET_TEST # - distinct_count_estimator ---------------------------------------------------------------------- ConfigureTest(DISTINCT_COUNT_ESTIMATOR_TEST distinct_count_estimator/unique_sequence_test.cu + distinct_count_estimator/spark_parity_test.cu distinct_count_estimator/device_ref_test.cu) diff --git a/tests/distinct_count_estimator/spark_parity_test.cu b/tests/distinct_count_estimator/spark_parity_test.cu new file mode 100644 index 000000000..4aaaa8a66 --- /dev/null +++ b/tests/distinct_count_estimator/spark_parity_test.cu @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include // std::memcpy + +/** + * @file spark_parity_test.cu + * @brief Unit test to ensure parity with Spark's HLL implementation + * + * The following unit tests mimic Spark's unit tests which can be found here: + * https://github.com/apache/spark/blob/d10dbaa31a44878df5c7e144f111e18261346531/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/HyperLogLogPlusPlusSuite.scala + * + */ + +// TODO implement this test once add_if is available +// TEST_CASE("distinct_count_estimator: Spark parity: add nulls", "") + +TEST_CASE("distinct_count_estimator: Spark parity: deterministic cardinality estimation", "") +{ + using T = int; + using estimator_type = + cuco::distinct_count_estimator>; + + constexpr size_t repeats = 10; + // This factor determines the error threshold for passing the test + constexpr double tolerance_factor = 3.0; + auto num_items = GENERATE(100, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000); + auto standard_deviation = GENERATE(0.1, 0.05, 0.025, 0.01, 0.001); + + auto expected_hll_precision = std::max( + static_cast(4), + static_cast(std::ceil(2.0 * std::log(1.106 / standard_deviation) / std::log(2.0)))); + auto expected_sketch_bytes = 4 * (1ull << expected_hll_precision); + + INFO("num_items" << num_items); + INFO("standard_deviation=" << standard_deviation); + INFO("expected_hll_precision=" << expected_hll_precision); + INFO("expected_sketch_bytes=" << expected_sketch_bytes); + + auto sd = cuco::standard_deviation(standard_deviation); + auto sb = cuco::sketch_size_kb(expected_sketch_bytes / 1024.0); + + // Validate sketch size calculation + REQUIRE(estimator_type::sketch_bytes(sd) >= 64); + REQUIRE(estimator_type::sketch_bytes(sd) == expected_sketch_bytes); + REQUIRE(estimator_type::sketch_bytes(sd) == estimator_type::sketch_bytes(sb)); + + auto items_begin = + 
thrust::make_transform_iterator(thrust::make_counting_iterator(0), + cuda::proclaim_return_type([repeats] __device__(auto i) { + return static_cast(i / repeats); + })); + + estimator_type estimator{sd}; + + REQUIRE(estimator.estimate() == 0); + + // Add all items to the estimator + estimator.add(items_begin, items_begin + num_items); + + auto const estimate = estimator.estimate(); + + double const relative_error = + std::abs((static_cast(estimate) / static_cast(num_items / repeats)) - 1.0); + // RSD for a given precision is given by the following formula + double const expected_standard_deviation = + 1.04 / std::sqrt(static_cast(1ull << expected_hll_precision)); + + // Check if the error is acceptable + REQUIRE(relative_error < expected_standard_deviation * tolerance_factor); +} + +// the following test is omitted since we refrain from doing randomized unit tests in cuco +// TEST_CASE("distinct_count_estimator: Spark parity: random cardinality estimation", "") + +TEST_CASE("distinct_count_estimator: Spark parity: merging HLL instances", "") +{ + using T = int; + using estimator_type = + cuco::distinct_count_estimator>; + + auto num_items = 1000000; + auto standard_deviation = cuco::standard_deviation(0.05); + + auto items_begin = thrust::make_counting_iterator(0); + + // count lower half of input + estimator_type lower{standard_deviation}; + lower.add(items_begin, items_begin + num_items / 2); + + // count upper half of input + estimator_type upper{standard_deviation}; + upper.add(items_begin + num_items / 2, items_begin + num_items); + + // merge upper into lower so lower has seen the entire input + lower.merge(upper); + + auto reversed_items_begin = thrust::make_transform_iterator( + items_begin, cuda::proclaim_return_type([num_items] __device__(auto i) { + return static_cast(num_items - i); + })); + + // count the entire input vector but in reversed order + estimator_type entire{standard_deviation}; + entire.add(reversed_items_begin, reversed_items_begin + 
num_items); + + auto const entire_sketch = entire.sketch(); + auto const lower_sketch = lower.sketch(); + + // check if sketches are bitwise identical + REQUIRE(cuco::test::equal(entire_sketch.data(), + entire_sketch.data() + entire_sketch.size(), + lower_sketch.data(), + thrust::equal_to{})); +} + +/* +The following unit tests fail since xxhash_64 does not deduplicate different bit patterns for NaN +values and +-0.0. They are thus counted as distinct items. + +TEST_CASE("distinct_count_estimator: Spark parity: add 0.0 and -0.0", "") +{ + using T = double; + using estimator_type = + cuco::distinct_count_estimator>; + + auto standard_deviation = cuco::standard_deviation(0.05); + + auto items = thrust::device_vector({0.0, -0.0}); + + estimator_type estimator{standard_deviation}; + estimator.add(items.begin(), items.end()); + + REQUIRE(estimator.estimate() == 1); +} + +TEST_CASE("distinct_count_estimator: Spark parity: add NaN", "") +{ + using T = double; + using estimator_type = + cuco::distinct_count_estimator>; + + auto standard_deviation = cuco::standard_deviation(0.05); + + // Define the special bit pattern for the NaN. 
+ uint64_t nan_bits = 0x7ff1234512345678ULL; + double special_nan; + std::memcpy(&special_nan, &nan_bits, sizeof(special_nan)); + + auto items = thrust::device_vector({0.0, special_nan}); + + estimator_type estimator{standard_deviation}; + estimator.add(items.begin(), items.end()); + + REQUIRE(estimator.estimate() == 1); +} +*/ From f80509fa998af9f4ac3958e2461f0d46daf50aae Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 27 Mar 2024 15:31:54 +0000 Subject: [PATCH 72/78] Fix error calculation --- benchmarks/distinct_count_estimator_bench.cu | 2 +- examples/distinct_count_estimator/host_bulk_example.cu | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu index 37c5f4f00..eca28e4ed 100644 --- a/benchmarks/distinct_count_estimator_bench.cu +++ b/benchmarks/distinct_count_estimator_bench.cu @@ -67,7 +67,7 @@ template estimator.add(items.begin(), items.end()); double estimated_cardinality = estimator.estimate(); double true_cardinality = exact_distinct_count(items.begin(), num_items); - error_sum += std::abs(true_cardinality - estimated_cardinality) / true_cardinality; + error_sum += std::abs(estimated_cardinality / true_cardinality - 1.0); estimator.clear(); } diff --git a/examples/distinct_count_estimator/host_bulk_example.cu b/examples/distinct_count_estimator/host_bulk_example.cu index 56ee90a42..0cd535e8b 100644 --- a/examples/distinct_count_estimator/host_bulk_example.cu +++ b/examples/distinct_count_estimator/host_bulk_example.cu @@ -53,10 +53,9 @@ int main(void) std::size_t const estimated_cardinality = estimator.estimate(); std::cout << "True cardinality: " << num_items - << "\nEstimated cardinality: " << estimated_cardinality << "\nRelative error: " - << std::abs(static_cast(num_items) - - static_cast(estimated_cardinality)) / - num_items + << "\nEstimated cardinality: " << estimated_cardinality 
<< "\nError: " + << std::abs( + static_cast(estimated_cardinality) / static_cast(num_items) - 1.0) << std::endl; return 0; From b61a2db6967b586630131f603438cd7192d2197f Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 3 Apr 2024 00:25:16 +0000 Subject: [PATCH 73/78] Move include/cuco/sentinel.cuh -> include/cuco/types.cuh --- include/cuco/dynamic_map.cuh | 2 +- include/cuco/static_map.cuh | 2 +- include/cuco/static_map_ref.cuh | 2 +- include/cuco/static_multimap.cuh | 2 +- include/cuco/static_multiset.cuh | 2 +- include/cuco/static_multiset_ref.cuh | 2 +- include/cuco/static_set.cuh | 2 +- include/cuco/static_set_ref.cuh | 2 +- include/cuco/{sentinel.cuh => types.cuh} | 10 ++++++++++ 9 files changed, 18 insertions(+), 8 deletions(-) rename include/cuco/{sentinel.cuh => types.cuh} (70%) diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index bf3c7c8a2..aedb81f1a 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -18,8 +18,8 @@ #include #include -#include #include +#include #include #include diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 02434eb25..324c56ced 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -22,8 +22,8 @@ #include #include #include -#include #include +#include #include #include #include diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh index 953507a6a..80c60711d 100644 --- a/include/cuco/static_map_ref.cuh +++ b/include/cuco/static_map_ref.cuh @@ -20,8 +20,8 @@ #include #include #include -#include #include +#include #include #include diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index 1d6e67df0..abdb747d0 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/cuco/static_multiset.cuh 
b/include/cuco/static_multiset.cuh index b4a684bcc..5d16857cf 100644 --- a/include/cuco/static_multiset.cuh +++ b/include/cuco/static_multiset.cuh @@ -21,9 +21,9 @@ #include #include #include -#include #include #include +#include #include #include #include diff --git a/include/cuco/static_multiset_ref.cuh b/include/cuco/static_multiset_ref.cuh index 975ca915b..a8b5bff62 100644 --- a/include/cuco/static_multiset_ref.cuh +++ b/include/cuco/static_multiset_ref.cuh @@ -20,8 +20,8 @@ #include #include #include -#include #include +#include #include #include diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh index 7c819668e..2aaee75d6 100644 --- a/include/cuco/static_set.cuh +++ b/include/cuco/static_set.cuh @@ -21,9 +21,9 @@ #include #include #include -#include #include #include +#include #include #include #include diff --git a/include/cuco/static_set_ref.cuh b/include/cuco/static_set_ref.cuh index f2f661190..004b6b92f 100644 --- a/include/cuco/static_set_ref.cuh +++ b/include/cuco/static_set_ref.cuh @@ -20,8 +20,8 @@ #include #include #include -#include #include +#include #include #include diff --git a/include/cuco/sentinel.cuh b/include/cuco/types.cuh similarity index 70% rename from include/cuco/sentinel.cuh rename to include/cuco/types.cuh index b71d3e6c5..f0c579f85 100644 --- a/include/cuco/sentinel.cuh +++ b/include/cuco/types.cuh @@ -18,6 +18,16 @@ #include +/** + * @brief Defines various strong type wrappers used across this library. + * + * @note Each strong type inherits from `cuco::detail::strong_type`. `CUCO_DEFINE_STRONG_TYPE` + * and `CUCO_DEFINE_TEMPLATE_STRONG_TYPE` are convenience macros used to define a named type in a + * single line, e.g., `CUCO_DEFINE_STRONG_TYPE(foo, double)` defines `struct foo : public + * cuco::detail::strong_type {...};`, where `cuco::foo{42.0}` is implicitly convertible to + * `double{42.0}`. + */ + namespace cuco { /** * @brief A strong type wrapper `cuco::empty_key` used to denote the empty key sentinel. 
From 9b0ee68e6eabc60015d3f7746e109b80989e7a50 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 3 Apr 2024 00:34:16 +0000 Subject: [PATCH 74/78] Move HLL-related strong types to types.cuh --- .../cuco/detail/hyperloglog/hyperloglog.cuh | 3 +- .../detail/hyperloglog/hyperloglog_ref.cuh | 3 +- include/cuco/distinct_count_estimator.cuh | 3 +- include/cuco/distinct_count_estimator_ref.cuh | 2 +- include/cuco/sketch_size.cuh | 55 ------------------- include/cuco/standard_deviation.cuh | 45 --------------- include/cuco/types.cuh | 26 +++++++++ 7 files changed, 30 insertions(+), 107 deletions(-) delete mode 100644 include/cuco/sketch_size.cuh delete mode 100644 include/cuco/standard_deviation.cuh diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh index 13106cc08..011d2bee7 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -20,8 +20,7 @@ #include #include #include -#include -#include +#include #include #include diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index dfd2df499..fc10e32b7 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -22,8 +22,7 @@ #include #include #include -#include -#include +#include #include #include diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh index a43590a43..0eecca954 100644 --- a/include/cuco/distinct_count_estimator.cuh +++ b/include/cuco/distinct_count_estimator.cuh @@ -19,8 +19,7 @@ #include #include #include -#include -#include +#include #include #include diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh index d2eb6ab58..44374c6b5 100644 --- a/include/cuco/distinct_count_estimator_ref.cuh +++ 
b/include/cuco/distinct_count_estimator_ref.cuh @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/cuco/sketch_size.cuh b/include/cuco/sketch_size.cuh deleted file mode 100644 index f9dce1aed..000000000 --- a/include/cuco/sketch_size.cuh +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -namespace cuco { - -/** - * @brief Strng type for specifying the sketch size of cuco::distinct_count_estimator(_ref) in KB. - * - * Values can also be given as literals, e.g., 64.3_KB - */ -class sketch_size_kb { - public: - /** - * @brief Constructs a sketch_size_kb object. - * - * @param value The size of a sketch given in KB - */ - __host__ __device__ explicit constexpr sketch_size_kb(double value) noexcept : value_{value} {} - - /** - * @brief Conversion to value type. 
- * - * @return Sketch size in KB - */ - __host__ __device__ constexpr operator double() const noexcept { return this->value_; } - - private: - double value_; ///< Sketch size in KB -}; -} // namespace cuco - -// User-defined literal operators for sketch_size_KB -__host__ __device__ constexpr cuco::sketch_size_kb operator""_KB(long double value) -{ - return cuco::sketch_size_kb{static_cast(value)}; -} - -__host__ __device__ constexpr cuco::sketch_size_kb operator""_KB(unsigned long long int value) -{ - return cuco::sketch_size_kb{static_cast(value)}; -} \ No newline at end of file diff --git a/include/cuco/standard_deviation.cuh b/include/cuco/standard_deviation.cuh deleted file mode 100644 index 9486784b9..000000000 --- a/include/cuco/standard_deviation.cuh +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -namespace cuco { - -/** - * @brief Strong type for specifying the desired standard deviation of - * cuco::distinct_count_estimator(_ref). - */ -class standard_deviation { - public: - /** - * @brief Constructs a standard_deviation object. - * - * @param value The desired standard deviation - */ - __host__ __device__ explicit constexpr standard_deviation(double value) noexcept : value_{value} - { - } - - /** - * @brief Conversion to value type. 
- * - * @return Standard deviation - */ - __host__ __device__ constexpr operator double() const noexcept { return this->value_; } - - private: - double value_; ///< Sketch size in KB -}; -} // namespace cuco \ No newline at end of file diff --git a/include/cuco/types.cuh b/include/cuco/types.cuh index f0c579f85..eddc289df 100644 --- a/include/cuco/types.cuh +++ b/include/cuco/types.cuh @@ -43,4 +43,30 @@ CUCO_DEFINE_TEMPLATE_STRONG_TYPE(empty_value); * @brief A strong type wrapper `cuco::erased_key` used to denote the erased key sentinel. */ CUCO_DEFINE_TEMPLATE_STRONG_TYPE(erased_key); + +/** + * @brief A strong type wrapper `cuco::sketch_size_kb` for specifying the upper-bound sketch size of + * `cuco::distinct_count_estimator(_ref)` in KB. + * + * @note Values can also be specified as literals, e.g., 64.3_KB. + */ +CUCO_DEFINE_STRONG_TYPE(sketch_size_kb, double); + +/** + * @brief A strong type wrapper `cuco::standard_deviation` for specifying the desired standard + * deviation for the cardinality estimate of `cuco::distinct_count_estimator(_ref)`. + */ +CUCO_DEFINE_STRONG_TYPE(standard_deviation, double); + } // namespace cuco + +// User-defined literal operators for `cuco::sketch_size_KB` +__host__ __device__ constexpr cuco::sketch_size_kb operator""_KB(long double value) +{ + return cuco::sketch_size_kb{static_cast(value)}; +} + +__host__ __device__ constexpr cuco::sketch_size_kb operator""_KB(unsigned long long int value) +{ + return cuco::sketch_size_kb{static_cast(value)}; +} From 75cd96789023c82d64d1c5b1bd9b02c18b735de8 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 3 Apr 2024 00:59:03 +0000 Subject: [PATCH 75/78] Apparently Doxygen has become even pickier... 
--- include/cuco/operator.hpp | 12 ++++++------ include/cuco/utility/allocator.hpp | 19 +++++++++++++++++++ include/cuco/utility/cuda_thread_scope.cuh | 12 ++++++++---- include/cuco/utility/traits.hpp | 9 ++++++--- 4 files changed, 39 insertions(+), 13 deletions(-) diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp index 8199e23c6..418148eb5 100644 --- a/include/cuco/operator.hpp +++ b/include/cuco/operator.hpp @@ -24,37 +24,37 @@ inline namespace op { * @brief `insert` operator tag */ struct insert_tag { -} inline constexpr insert; +} inline constexpr insert; ///< `cuco::insert` operator /** * @brief `insert_and_find` operator tag */ struct insert_and_find_tag { -} inline constexpr insert_and_find; +} inline constexpr insert_and_find; ///< `cuco::insert_and_find` operator /** * @brief `insert_or_assign` operator tag */ struct insert_or_assign_tag { -} inline constexpr insert_or_assign; +} inline constexpr insert_or_assign; ///< `cuco::insert_or_assign` operator /** * @brief `erase` operator tag */ struct erase_tag { -} inline constexpr erase; +} inline constexpr erase; ///< `cuco::erase` operator /** * @brief `contains` operator tag */ struct contains_tag { -} inline constexpr contains; +} inline constexpr contains; ///< `cuco::contains` operator /** * @brief `find` operator tag */ struct find_tag { -} inline constexpr find; +} inline constexpr find; ///< `cuco::find` operator } // namespace op } // namespace cuco diff --git a/include/cuco/utility/allocator.hpp b/include/cuco/utility/allocator.hpp index 583571620..060ae0dd8 100644 --- a/include/cuco/utility/allocator.hpp +++ b/include/cuco/utility/allocator.hpp @@ -60,12 +60,31 @@ class cuda_allocator { void deallocate(value_type* p, std::size_t) { CUCO_CUDA_TRY(cudaFree(p)); } }; +/** + * @brief Equality comparison operator. 
+ * + * @tparam T Value type of LHS object + * @tparam U Value type of RHS object + * + * @return `true` iff given arguments are equal + */ template bool operator==(cuda_allocator const&, cuda_allocator const&) noexcept { return true; } +/** + * @brief Inequality comparison operator. + * + * @tparam T Value type of LHS object + * @tparam U Value type of RHS object + * + * @param lhs Left-hand side object to compare + * @param rhs Right-hand side object to compare + * + * @return `true` iff given arguments are not equal + */ template bool operator!=(cuda_allocator const& lhs, cuda_allocator const& rhs) noexcept { diff --git a/include/cuco/utility/cuda_thread_scope.cuh b/include/cuco/utility/cuda_thread_scope.cuh index 4e2242487..906605a14 100644 --- a/include/cuco/utility/cuda_thread_scope.cuh +++ b/include/cuco/utility/cuda_thread_scope.cuh @@ -36,9 +36,13 @@ struct cuda_thread_scope { }; // alias definitions -inline constexpr auto thread_scope_system = cuda_thread_scope{}; -inline constexpr auto thread_scope_device = cuda_thread_scope{}; -inline constexpr auto thread_scope_block = cuda_thread_scope{}; -inline constexpr auto thread_scope_thread = cuda_thread_scope{}; +inline constexpr auto thread_scope_system = + cuda_thread_scope{}; ///< `cuco::thread_scope_system` +inline constexpr auto thread_scope_device = + cuda_thread_scope{}; ///< `cuco::thread_scope_device` +inline constexpr auto thread_scope_block = + cuda_thread_scope{}; ///< `cuco::thread_scope_block` +inline constexpr auto thread_scope_thread = + cuda_thread_scope{}; ///< `cuco::thread_scope_thread` } // namespace cuco diff --git a/include/cuco/utility/traits.hpp b/include/cuco/utility/traits.hpp index dcbfe432a..1325b3a52 100644 --- a/include/cuco/utility/traits.hpp +++ b/include/cuco/utility/traits.hpp @@ -46,7 +46,8 @@ struct is_bitwise_comparable -inline constexpr bool is_bitwise_comparable_v = is_bitwise_comparable::value; +inline constexpr bool is_bitwise_comparable_v = + 
is_bitwise_comparable::value; ///< Shortcut definition /** * @brief Declares that a type `Type` is bitwise comparable. @@ -59,9 +60,11 @@ inline constexpr bool is_bitwise_comparable_v = is_bitwise_comparable::value; } template -inline constexpr bool dependent_bool_value = value; +inline constexpr bool dependent_bool_value = value; ///< Unpacked dependent bool value template -inline constexpr bool dependent_false = dependent_bool_value; +inline constexpr bool dependent_false = + dependent_bool_value; ///< Emits a `false` value which is dependent on the given + ///< argument types } // namespace cuco From 6929f65ced3b51475e37213ad9dcfb51cd53ce59 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 3 Apr 2024 16:30:28 +0000 Subject: [PATCH 76/78] Clean up device ref example --- examples/distinct_count_estimator/device_ref_example.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/distinct_count_estimator/device_ref_example.cu b/examples/distinct_count_estimator/device_ref_example.cu index 92c5169d9..ab4d1929f 100644 --- a/examples/distinct_count_estimator/device_ref_example.cu +++ b/examples/distinct_count_estimator/device_ref_example.cu @@ -31,7 +31,7 @@ */ template -__global__ void piggyback_kernel(RefType ref, InputIt first, std::size_t n) +__global__ void fused_kernel(RefType ref, InputIt first, std::size_t n) { // Transform the reference type (with device scope) to a reference type with block scope using local_ref_type = typename RefType::with_scope; @@ -67,7 +67,7 @@ __global__ void piggyback_kernel(RefType ref, InputIt first, std::size_t n) /* Here we can add some custom workload that takes the input `item`. - The idea is that cardinality estimation can be fused/piggy-backed with any other workload that + The idea is that cardinality estimation can be fused with any other workload that traverses the data. 
Since `local_ref.add` can run close to the SOL of the DRAM bandwidth, we get the estimate "for free" while performing other computations over the data. */ @@ -144,7 +144,7 @@ int main(void) auto const sketch_bytes = estimator.sketch_bytes(); // Call the custom kernel and pass a non-owning reference to the estimator to the GPU - piggyback_kernel<<<10, 512, sketch_bytes>>>(estimator.ref(), items.begin(), num_items); + fused_kernel<<<10, 512, sketch_bytes>>>(estimator.ref(), items.begin(), num_items); // Calculate the cardinality estimate from the custom kernel std::size_t const estimated_cardinality_custom = estimator.estimate(); From a496f9ee781d5521f915800d4626db55f534d2b1 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 3 Apr 2024 16:33:16 +0000 Subject: [PATCH 77/78] Update godbolt links --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e15558bc2..b31df5bac 100644 --- a/README.md +++ b/README.md @@ -239,7 +239,7 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection `cuco::distinct_count_estimator` implements the well-established [HyperLogLog++ algorithm](https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf) for approximating the count of distinct items in a multiset/stream. 
#### Examples: -- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/EG7cMssxo)) -- [Device-ref APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/va8eE9dqb)) +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/ahjEoWM1E)) +- [Device-ref APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/qebYY8Goj)) From 7fecd7b71a7f547561c85a10ba039734aa09381d Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 3 Apr 2024 16:36:02 +0000 Subject: [PATCH 78/78] Clean up unique sequence unit test --- tests/distinct_count_estimator/unique_sequence_test.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/distinct_count_estimator/unique_sequence_test.cu b/tests/distinct_count_estimator/unique_sequence_test.cu index 7d6321de6..3c1558b30 100644 --- a/tests/distinct_count_estimator/unique_sequence_test.cu +++ b/tests/distinct_count_estimator/unique_sequence_test.cu @@ -45,7 +45,6 @@ TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence", auto num_items = 1ull << num_items_pow2; // This factor determines the error threshold for passing the test - // TODO might be too high double constexpr tolerance_factor = 2.5; // RSD for a given precision is given by the following formula double const relative_standard_deviation = @@ -76,7 +75,7 @@ TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence", REQUIRE(estimator.estimate() == 0); double const relative_error = - std::abs(static_cast(num_items) - static_cast(estimate)) / 
num_items; + std::abs((static_cast(estimate) / static_cast(num_items)) - 1.0); // Check if the error is acceptable REQUIRE(relative_error < tolerance_factor * relative_standard_deviation);