From 836e77adf94182048c769d23a3256139ed4cd5dc Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 24 Jan 2024 00:58:40 +0000
Subject: [PATCH 01/78] First draft

---
 examples/CMakeLists.txt                       |    1 +
 .../host_bulk_example.cu                      |   61 +
 .../distinct_count_estimator.inl              |  106 +
 .../distinct_count_estimator_ref.inl          |   57 +
 include/cuco/detail/hyperloglog/finalizer.cuh |   79 +
 .../cuco/detail/hyperloglog/hyperloglog.cuh   |  196 ++
 .../detail/hyperloglog/hyperloglog_ref.cuh    |  189 ++
 include/cuco/detail/hyperloglog/kernels.cuh   |   75 +
 include/cuco/detail/hyperloglog/storage.cuh   |   24 +
 include/cuco/detail/hyperloglog/tuning.cuh    | 2577 +++++++++++++++++
 include/cuco/distinct_count_estimator.cuh     |   94 +
 include/cuco/distinct_count_estimator_ref.cuh |   61 +
 12 files changed, 3520 insertions(+)
 create mode 100644 examples/distinct_count_estimator/host_bulk_example.cu
 create mode 100644 include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
 create mode 100644 include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
 create mode 100644 include/cuco/detail/hyperloglog/finalizer.cuh
 create mode 100644 include/cuco/detail/hyperloglog/hyperloglog.cuh
 create mode 100644 include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
 create mode 100644 include/cuco/detail/hyperloglog/kernels.cuh
 create mode 100644 include/cuco/detail/hyperloglog/storage.cuh
 create mode 100644 include/cuco/detail/hyperloglog/tuning.cuh
 create mode 100644 include/cuco/distinct_count_estimator.cuh
 create mode 100644 include/cuco/distinct_count_estimator_ref.cuh
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index a3d0ae247..f6e753cf2 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -41,3 +41,4 @@ ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/sta
 ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu")
 ConfigureExample(STATIC_MAP_COUNT_BY_KEY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/count_by_key_example.cu")
 ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/host_bulk_example.cu")
+ConfigureExample(DISTINCT_COUNT_ESTIMATOR_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/distinct_count_estimator/host_bulk_example.cu")
diff --git a/examples/distinct_count_estimator/host_bulk_example.cu b/examples/distinct_count_estimator/host_bulk_example.cu
new file mode 100644
index 000000000..18085e72f
--- /dev/null
+++ b/examples/distinct_count_estimator/host_bulk_example.cu
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cuco/distinct_count_estimator.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/sequence.h>
+
+#include <cstddef>
+#include <iostream>
+
+int main()
+{
+  using T                         = int;
+  std::size_t constexpr num_items = 1ull << 30;  // 2^30 items * sizeof(int) = 4 GiB of input
+
+  thrust::device_vector<T> items(num_items);
+  // fill the vector with the distinct values 0, 1, ..., num_items - 1
+  thrust::sequence(items.begin(), items.end(), 0);
+
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  cuco::distinct_count_estimator<T> estimator;
+  cudaEventRecord(start);
+  // timed region, part 1: add all items to the estimator
+  estimator.add(items.begin(), items.end());
+  // timed region, part 2: once all items have been seen, compute the cardinality estimate
+  std::size_t const estimated_cardinality = estimator.estimate();
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+
+  float milliseconds = 0;
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  float input_size_gb = num_items * sizeof(T) / 1073741824.0f;
+  float throughput    = input_size_gb / (milliseconds / 1000.0f);
+
+  std::cout << "True cardinality:\t" << num_items << "\nEstimated cardinality:\t"
+            << estimated_cardinality << "\nRelative error:\t"
+            << abs(static_cast<double>(num_items) - static_cast<double>(estimated_cardinality)) /
+                 num_items
+            << "\nData size:\t" << input_size_gb << "GB"
+            << "\nElapsed time:\t" << milliseconds << "ms"
+            << "\nMemory throughput:\t" << throughput << "GB/s" << std::endl;
+
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+}
\ No newline at end of file
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
new file mode 100644
index 000000000..7013bc956
--- /dev/null
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace cuco {
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+constexpr distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::distinct_count_estimator(
+  cuco::cuda_thread_scope<Scope> scope,
+  Hash const& hash,
+  Allocator const& alloc,
+  cuco::cuda_stream_ref stream)
+  : impl_{std::make_unique<impl_type>(scope, hash, alloc, stream)}
+{  // nothing else to do: all four arguments are handed to the heap-allocated impl object
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+void distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::clear_async(
+  cuco::cuda_stream_ref stream) noexcept
+{
+  impl_->clear_async(stream);  // pure delegation to the implementation object
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+void distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::clear(
+  cuco::cuda_stream_ref stream)
+{
+  impl_->clear(stream);  // pure delegation to the implementation object
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+template <class InputIt>
+void distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::add_async(
+  InputIt first, InputIt last, cuco::cuda_stream_ref stream) noexcept
+{
+  impl_->add_async(first, last, stream);  // the range [first, last) is passed through unchanged
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+template <class InputIt>
+void distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::add(
+  InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+{
+  impl_->add(first, last, stream);  // the range [first, last) is passed through unchanged
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+template <cuda::thread_scope OtherScope, class OtherAllocator>
+void distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::merge_async(
+  distinct_count_estimator<T, Precision, OtherScope, Hash, OtherAllocator> const& other,
+  cuco::cuda_stream_ref stream) noexcept
+{
+  impl_->merge_async(other, stream);  // NOTE(review): confirm impl_ accepts the wrapper type
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+template <cuda::thread_scope OtherScope, class OtherAllocator>
+void distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::merge(
+  distinct_count_estimator<T, Precision, OtherScope, Hash, OtherAllocator> const& other,
+  cuco::cuda_stream_ref stream)
+{
+  impl_->merge(other, stream);  // NOTE(review): confirm impl_ accepts the wrapper type
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+template <cuda::thread_scope OtherScope>
+void distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::merge_async(
+  ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream) noexcept
+{
+  impl_->merge_async(other, stream);  // NOTE(review): confirm impl_ accepts this ref type
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+template <cuda::thread_scope OtherScope>
+void distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::merge(
+  ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream)
+{
+  impl_->merge(other, stream);  // NOTE(review): confirm impl_ accepts this ref type
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+std::size_t distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::estimate(
+  cuco::cuda_stream_ref stream) const
+{
+  return impl_->estimate(stream);  // the impl's result is returned unmodified
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+typename distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::template ref_type<>
+distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::ref() const noexcept
+{  // `::template` in the return type is required: `ref_type` is a dependent member template
+  return this->impl_->ref();
+}
+}  // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
new file mode 100644
index 000000000..1359033d0
--- /dev/null
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace cuco {
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+__host__ __device__ constexpr distinct_count_estimator_ref<T, Precision, Scope, Hash>::
+  distinct_count_estimator_ref(storage_type& storage,
+                               cuco::cuda_thread_scope<Scope> scope,
+                               Hash const& hash) noexcept
+  : impl_{storage, scope, hash}
+{  // nothing else to do: all three arguments are forwarded to the wrapped implementation ref
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+template <class CG>
+__device__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::clear(
+  CG const& group) noexcept
+{
+  impl_.clear(group);  // pure delegation to the wrapped implementation ref
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+__device__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::add(T const& item) noexcept
+{
+  impl_.add(item);  // pure delegation to the wrapped implementation ref
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+template <class CG, cuda::thread_scope OtherScope>
+__device__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::merge(
+  CG const& group,
+  distinct_count_estimator_ref<T, Precision, OtherScope, Hash> const& other) noexcept
+{
+  impl_.merge(group, other);  // NOTE(review): confirm impl_ accepts the wrapper ref type
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+__device__ std::size_t distinct_count_estimator_ref<T, Precision, Scope, Hash>::estimate(
+  cooperative_groups::thread_block const& group) const noexcept
+{
+  return this->impl_.estimate(group);  // the result must be returned, not just computed
+}
+}  // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/detail/hyperloglog/finalizer.cuh b/include/cuco/detail/hyperloglog/finalizer.cuh
new file mode 100644
index 000000000..9f5c9a20d
--- /dev/null
+++ b/include/cuco/detail/hyperloglog/finalizer.cuh
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuco/detail/hyperloglog/tuning.cuh>
+
+#include <cuda/std/cmath>
+
+namespace cuco::hyperloglog_ns::detail {
+template <int32_t Precision>
+class finalizer {
+  // this minimum number of registers is required by HLL++
+  static_assert(Precision >= 4, "Precision must be greater or equal to 4");
+
+ public:
+  // Turns the accumulated register statistics into a rounded cardinality estimate.
+  //   z: sum over all registers of 2^(-register value)
+  //   v: number of registers whose value is zero
+  // Small cardinalities use linear counting, all others the bias-corrected raw estimate.
+  __host__ __device__ static double constexpr finalize(double z, int v) noexcept
+  {
+    // Raw HyperLogLog estimate alpha * m^2 / z; how it is refined below depends on whether
+    // any register is still empty.
+    auto const raw = alpha_mm() / z;
+
+    // Without empty registers linear counting is not applicable.
+    if (v <= 0) { return cuda::std::round(bias_corrected_estimate(raw)); }
+
+    // Use linear counting for small cardinality estimates.
+    double const linear = m * log(static_cast<double>(m) / v);
+    // HLL++ is defined only when p < 19, otherwise we need to fallback to HLL.
+    // The threshold `2.5 * m` is from the original HLL algorithm.
+    bool const use_linear =
+      (Precision < 19 and linear <= thresholds[Precision - 4]) or raw <= 2.5 * m;
+    return cuda::std::round(use_linear ? linear : bias_corrected_estimate(raw));
+  }
+
+ private:
+  // number of registers
+  static auto constexpr m = (1 << Precision);
+
+  __host__ __device__ static double constexpr alpha_mm() noexcept
+  {
+    // general-case alpha; the three smallest register counts use tabulated constants instead
+    double alpha = 0.7213 / (1.0 + 1.079 / m);
+    if constexpr (m == 16) {
+      alpha = 0.673;
+    } else if constexpr (m == 32) {
+      alpha = 0.697;
+    } else if constexpr (m == 64) {
+      alpha = 0.709;
+    }
+    return alpha * m * m;
+  }
+
+  __host__ __device__ static double constexpr bias_corrected_estimate(double e) noexcept
+  {
+    // the correction is applied only for p < 19 and only to estimates below 5m
+    bool const correct = (Precision < 19) and (e < 5.0 * m);
+    return correct ? e - bias(e) : e;
+  }
+
+  // TODO implement HLL++ bias correction
+  __host__ __device__ static double constexpr bias(double e) noexcept { return e * 0; }
+};
+}  // namespace cuco::hyperloglog_ns::detail
\ No newline at end of file
diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
new file mode 100644
index 000000000..bd3871261
--- /dev/null
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuco/cuda_stream_ref.hpp>
+#include <cuco/detail/hyperloglog/finalizer.cuh>
+#include <cuco/detail/hyperloglog/hyperloglog_ref.cuh>
+#include <cuco/detail/hyperloglog/kernels.cuh>
+#include <cuco/detail/hyperloglog/storage.cuh>
+#include <cuco/hash_functions.cuh>
+#include <cuco/utility/allocator.hpp>
+#include <cuco/utility/cuda_thread_scope.cuh>
+
+#include <cstddef>
+#include <iterator>
+#include <memory>
+
+namespace cuco::detail {
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+class hyperloglog {
+ public:
+  static constexpr auto thread_scope = Scope;  ///< CUDA thread scope
+  static constexpr auto precision    = Precision;  ///< The sketch holds 2^precision registers
+
+  using allocator_type = Allocator;  ///< Allocator type
+  using storage_type   = detail::hyperloglog_storage<precision>;
+  using storage_allocator_type =
+    typename std::allocator_traits<Allocator>::template rebind_alloc<storage_type>;
+
+  template <cuda::thread_scope NewScope = thread_scope>
+  using ref_type = hyperloglog_ref<T, Precision, NewScope, Hash>;
+
+  // Allocates one `storage_type` through `alloc` and enqueues its zero-initialization on `stream`.
+  constexpr hyperloglog(cuco::cuda_thread_scope<Scope>,
+                        Hash const& hash,
+                        Allocator const& alloc,
+                        cuco::cuda_stream_ref stream)
+    : hash_{hash},
+      storage_allocator_{alloc},
+      storage_deleter_{storage_allocator_},
+      storage_{storage_allocator_.allocate(1ull), storage_deleter_}
+  {
+    this->clear_async(stream);  // TODO async or sync?
+  }
+
+  hyperloglog(hyperloglog const&)            = delete;
+  hyperloglog& operator=(hyperloglog const&) = delete;
+  hyperloglog(hyperloglog&&)                 = default;
+  hyperloglog& operator=(hyperloglog&&)      = default;
+  ~hyperloglog()                             = default;
+
+  // Enqueues a single-block kernel on `stream` that zeroes all registers; does not wait for it.
+  void clear_async(cuco::cuda_stream_ref stream) noexcept
+  {
+    auto constexpr block_size = 1024;
+    cuco::hyperloglog_ns::detail::clear<<<1, block_size, 0, stream>>>(this->ref());
+  }
+
+  // Same as `clear_async`, then blocks until `stream` has finished.
+  void clear(cuco::cuda_stream_ref stream)
+  {
+    this->clear_async(stream);
+    stream.synchronize();
+  }
+
+  // Enqueues the `add_shmem` kernel for the items in `[first, last)`; returns right away for an
+  // empty range. The launch configuration comes from `cudaOccupancyMaxPotentialBlockSize`.
+  template <class InputIt>
+  void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream) noexcept
+  {
+    auto const num_items = cuco::detail::distance(first, last);  // TODO include
+    if (num_items == 0) { return; }
+
+    // TODO fallback to local memory registers in case they don't fit in shmem
+
+    int grid_size  = 0;
+    int block_size = 0;
+    // TODO check cuda error?
+    cudaOccupancyMaxPotentialBlockSize(
+      &grid_size, &block_size, &cuco::hyperloglog_ns::detail::add_shmem<InputIt, ref_type<>>);
+
+    cuco::hyperloglog_ns::detail::add_shmem<<<grid_size, block_size, 0, stream>>>(
+      first, num_items, this->ref());
+  }
+
+  // Same as `add_async`, then blocks until `stream` has finished.
+  template <class InputIt>
+  void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+  {
+    this->add_async(first, last, stream);
+    stream.synchronize();
+  }
+
+  // Merges the registers of another owning sketch into this one (via its ref); asynchronous.
+  template <cuda::thread_scope OtherScope, class OtherAllocator>
+  void merge_async(hyperloglog<T, Precision, OtherScope, Hash, OtherAllocator> const& other,
+                   cuco::cuda_stream_ref stream = {}) noexcept
+  {
+    this->merge_async(other.ref(), stream);
+  }
+
+  // Same as the matching `merge_async`, then blocks until `stream` has finished.
+  template <cuda::thread_scope OtherScope, class OtherAllocator>
+  void merge(hyperloglog<T, Precision, OtherScope, Hash, OtherAllocator> const& other,
+             cuco::cuda_stream_ref stream = {})
+  {
+    this->merge_async(other, stream);
+    stream.synchronize();
+  }
+
+  // Enqueues a single-block kernel on `stream` that merges `other` into this sketch's registers.
+  template <cuda::thread_scope OtherScope>
+  void merge_async(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream = {}) noexcept
+  {
+    auto constexpr block_size = 1024;
+    cuco::hyperloglog_ns::detail::merge<<<1, block_size, 0, stream>>>(other, this->ref());
+  }
+
+  // Same as the matching `merge_async`, then blocks until `stream` has finished.
+  template <cuda::thread_scope OtherScope>
+  void merge(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream = {})
+  {
+    this->merge_async(other, stream);
+    stream.synchronize();
+  }
+
+  // Copies the registers to the host, waits for `stream`, and finalizes the estimate on the host.
+  // Blocking: the calling thread is stalled until the copy has completed.
+  [[nodiscard]] std::size_t estimate(cuco::cuda_stream_ref stream) const
+  {
+    // TODO this function currently copies the registers to the host and then finalizes the result;
+    // move computation to device? Edit: host computation is faster -.-
+    storage_type registers;
+    // TODO check if storage is host accessible
+    CUCO_CUDA_TRY(cudaMemcpyAsync(
+      &registers, this->storage_.get(), sizeof(storage_type), cudaMemcpyDeviceToHost, stream));
+    stream.synchronize();
+
+    using fp_type = typename ref_type<>::fp_type;
+    fp_type sum   = 0;
+    int zeroes    = 0;
+    // accumulate the harmonic-mean denominator (sum of 2^-register) + count registers with 0s
+    for (std::size_t i = 0; i < registers.size(); ++i) {
+      auto const reg = registers[i];
+      // shift a 64-bit one: register values of 31 and above would overflow an `int` shift
+      sum += fp_type{1} / static_cast<fp_type>(1ull << reg);
+      zeroes += reg == 0;
+    }
+
+    // pass intermediate result to finalizer for bias correction, etc.
+    return cuco::hyperloglog_ns::detail::finalizer<Precision>::finalize(sum, zeroes);
+  }
+
+  // Creates a non-owning ref over this sketch's registers, carrying a copy of the hash function.
+  [[nodiscard]] ref_type<> ref() const noexcept
+  {
+    return ref_type<>{*(this->storage_.get()), {}, this->hash_};
+  }
+
+ private:
+  // Deleter handing the register block back to the allocator. It owns a copy of the allocator
+  // (not a reference into the enclosing object), so it stays valid when a `hyperloglog` is moved.
+  struct storage_deleter {
+    using pointer = typename storage_allocator_type::value_type*;
+
+    storage_deleter(storage_allocator_type const& a) : allocator{a} {}
+
+    storage_deleter(storage_deleter const&) = default;
+
+    void operator()(pointer ptr) { allocator.deallocate(ptr, 1); }
+
+    storage_allocator_type allocator;
+  };
+
+  Hash hash_;
+  storage_allocator_type storage_allocator_;
+  storage_deleter storage_deleter_;
+  std::unique_ptr<storage_type, storage_deleter> storage_;
+
+  template <class T_, int32_t Precision_, cuda::thread_scope Scope_, class Hash_, class Allocator_>
+  friend class hyperloglog;
+};
+}  // namespace cuco::detail
\ No newline at end of file
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
new file mode 100644
index 000000000..ba9333f95
--- /dev/null
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuco/detail/hyperloglog/finalizer.cuh>
+#include <cuco/detail/hyperloglog/storage.cuh>
+#include <cuco/hash_functions.cuh>
+#include <cuco/utility/cuda_thread_scope.cuh>
+#include <cuco/utility/traits.hpp>
+
+#include <cstddef>
+#include <cuda/std/bit>
+
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+
+namespace cuco::detail {
+/**
+ * @brief Non-owning view over a HyperLogLog register array.
+ *
+ * Holds a reference to a `hyperloglog_storage<Precision>` plus a hash function. `Scope` selects
+ * how register updates are performed: a plain read-modify-write for `thread_scope_thread`, the
+ * matching `atomicMax*` flavor for block, device and system scope.
+ */
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+class hyperloglog_ref {
+ public:
+  using fp_type                      = float;
+  static constexpr auto thread_scope = Scope;  ///< CUDA thread scope
+  static constexpr auto precision    = Precision;
+
+  using storage_type = hyperloglog_storage<Precision>;
+  template <cuda::thread_scope NewScope>
+  using with_scope = hyperloglog_ref<T, Precision, NewScope, Hash>;
+
+  /**
+   * @brief Binds the ref to existing register storage; the storage itself is not modified.
+   *
+   * @param storage Register array this ref reads and updates
+   * @param hash Hash function applied to every added item
+   */
+  __host__ __device__ constexpr hyperloglog_ref(storage_type& storage,
+                                                cuco::cuda_thread_scope<Scope> = {},
+                                                Hash const& hash               = {}) noexcept
+    : hash_{hash}, storage_{storage}
+  {
+  }
+
+  /**
+   * @brief Sets every register to zero, with the work split across the threads of `group`.
+   *
+   * No synchronization is performed here; callers must synchronize `group` themselves.
+   */
+  template <class CG>
+  __device__ void clear(CG const& group) noexcept
+  {
+    for (int i = group.thread_rank(); i < this->storage_.size(); i += group.size()) {
+      this->storage_[i] = 0;
+    }
+  }
+
+  /**
+   * @brief Records `item` in the sketch.
+   *
+   * The low `Precision` bits of the hash select a register; that register is raised to the
+   * number of leading zero bits of the hash (with the low bits forced to one) plus one.
+   *
+   * @param item The item to add
+   */
+  __device__ void add(T const& item) noexcept
+  {
+    // TODO static_assert NumBuckets is not too big
+    auto constexpr register_mask = (1 << Precision) - 1;
+    auto const h                 = this->hash_(item);
+    auto const reg               = h & register_mask;
+    auto const zeroes            = cuda::std::countl_zero(h | register_mask) + 1;  // __clz
+
+    if constexpr (Scope == cuda::thread_scope_thread) {
+      this->storage_[reg] = max(this->storage_[reg], zeroes);
+    } else if constexpr (Scope == cuda::thread_scope_block) {
+      atomicMax_block(&(this->storage_[reg]), zeroes);
+    } else if constexpr (Scope == cuda::thread_scope_device) {
+      atomicMax(&(this->storage_[reg]), zeroes);
+    } else if constexpr (Scope == cuda::thread_scope_system) {
+      atomicMax_system(&(this->storage_[reg]), zeroes);
+    } else {
+      static_assert(cuco::dependent_false<decltype(Scope)>, "Unsupported thread scope");
+    }
+  }
+
+  /**
+   * @brief Folds the registers of `other` into this ref by taking the element-wise maximum.
+   *
+   * @param group Cooperative group whose threads share the work
+   * @param other Ref with the same `T`, `Precision` and `Hash`; its thread scope may differ
+   */
+  template <class CG, cuda::thread_scope OtherScope>
+  __device__ void merge(CG const& group,
+                        hyperloglog_ref<T, Precision, OtherScope, Hash> const& other) noexcept
+  {
+    for (int i = group.thread_rank(); i < this->storage_.size(); i += group.size()) {
+      if constexpr (Scope == cuda::thread_scope_thread) {
+        this->storage_[i] = max(this->storage_[i], other.storage_[i]);
+      } else if constexpr (Scope == cuda::thread_scope_block) {
+        atomicMax_block(this->storage_.data() + i, other.storage_[i]);
+      } else if constexpr (Scope == cuda::thread_scope_device) {
+        atomicMax(this->storage_.data() + i, other.storage_[i]);
+      } else if constexpr (Scope == cuda::thread_scope_system) {
+        atomicMax_system(this->storage_.data() + i, other.storage_[i]);
+      } else {
+        static_assert(cuco::dependent_false<decltype(Scope)>, "Unsupported thread scope");
+      }
+    }
+  }
+
+  /**
+   * @brief Computes the cardinality estimate cooperatively within one thread block.
+   *
+   * Contains block-wide barriers, so every thread of `group` must call it.
+   *
+   * @param group The calling thread block
+   * @return The finalized estimate, identical for all threads of the block
+   */
+  [[nodiscard]] __device__ std::size_t estimate(
+    cooperative_groups::thread_block const& group) const noexcept
+  {
+    __shared__ cuda::atomic<fp_type, cuda::thread_scope_block> block_sum;
+    __shared__ cuda::atomic<int, cuda::thread_scope_block> block_zeroes;
+    __shared__ std::size_t estimate;
+
+    // shared variables start out uninitialized, so one thread zeroes the accumulators
+    if (group.thread_rank() == 0) {
+      block_sum.store(0, cuda::std::memory_order_relaxed);
+      block_zeroes.store(0, cuda::std::memory_order_relaxed);
+    }
+    group.sync();
+
+    // warp-sized tile used for the two reductions below
+    auto const tile = cooperative_groups::tiled_partition<32>(group);
+
+    // per-thread partial results over this thread's share of the registers
+    fp_type thread_sum = 0;
+    int thread_zeroes  = 0;
+    for (int i = group.thread_rank(); i < this->storage_.size(); i += group.size()) {
+      auto const reg = this->storage_[i];
+      // shift a 64-bit one: register values of 31 and above would overflow an `int` shift
+      thread_sum += fp_type{1} / static_cast<fp_type>(1ull << reg);
+      thread_zeroes += reg == 0;
+    }
+
+    // CG reduce Z and V
+    cooperative_groups::reduce_update_async(
+      tile, block_sum, thread_sum, cooperative_groups::plus<fp_type>());
+    cooperative_groups::reduce_update_async(
+      tile, block_zeroes, thread_zeroes, cooperative_groups::plus<int>());
+    group.sync();
+
+    if (group.thread_rank() == 0) {
+      auto const z = block_sum.load(cuda::std::memory_order_relaxed);
+      auto const v = block_zeroes.load(cuda::std::memory_order_relaxed);
+      estimate     = cuco::hyperloglog_ns::detail::finalizer<Precision>::finalize(z, v);
+    }
+    group.sync();
+
+    // every thread returns the value thread 0 published through shared memory
+    return estimate;
+  }
+
+ private:
+  Hash hash_;
+  storage_type& storage_;  // TODO is a reference the right choice here??
+
+  template <class T_, int32_t Precision_, cuda::thread_scope Scope_, class Hash_>
+  friend class hyperloglog_ref;
+};
+}  // namespace cuco::detail
\ No newline at end of file
diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh
new file mode 100644
index 000000000..70064abcc
--- /dev/null
+++ b/include/cuco/detail/hyperloglog/kernels.cuh
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuco/detail/utility/cuda.cuh>
+#include <cuco/utility/cuda_thread_scope.cuh>
+
+#include <cstddef>
+
+#include <cooperative_groups.h>
+
+namespace cuco::hyperloglog_ns::detail {
+
+template <class RefType>
+__global__ void clear(RefType ref)
+{
+  // blocks other than the first one do nothing; block 0 clears `ref` cooperatively
+  if (blockIdx.x == 0) { ref.clear(cooperative_groups::this_thread_block()); }
+}
+
+template <class InputIt, class RefType>
+__global__ void add_shmem(InputIt first, cuco::detail::index_type n, RefType ref)
+{
+  // `template` is required here: `with_scope` is a member template of the dependent `RefType`
+  using local_ref_type = typename RefType::template with_scope<cuda::thread_scope_block>;
+
+  // block-private register array in statically sized shared memory
+  __shared__ typename local_ref_type::storage_type local_storage;
+
+  auto const loop_stride = cuco::detail::grid_stride();
+  auto idx               = cuco::detail::global_thread_id();
+  auto const block       = cooperative_groups::this_thread_block();
+
+  local_ref_type local_ref(local_storage);
+  local_ref.clear(block);
+  block.sync();  // the local registers must be zeroed before any thread starts adding
+
+  // each thread adds the items at idx, idx + loop_stride, ... to the block-local sketch
+  for (; idx < n; idx += loop_stride) { local_ref.add(*(first + idx)); }
+  block.sync();  // all local updates must be done before the registers are read for merging
+
+  ref.merge(block, local_ref);  // fold this block's registers into the output sketch
+}
+
+template <class OtherRefType, class RefType>
+__global__ void merge(OtherRefType other_ref, RefType ref)
+{
+  // blocks other than the first one do nothing; block 0 merges `other_ref` into `ref`
+  if (blockIdx.x == 0) { ref.merge(cooperative_groups::this_thread_block(), other_ref); }
+}
+
+// TODO this kernel currently isn't being used
+template <class RefType>
+__global__ void estimate(std::size_t* cardinality, RefType ref)
+{
+  // only the first block computes the estimate; all other blocks exit immediately
+  auto const block = cooperative_groups::this_thread_block();
+  if (block.group_index().x != 0) { return; }
+  auto const result = ref.estimate(block);
+  if (block.thread_rank() == 0) { *cardinality = result; }  // a single thread writes the output
+}
+}  // namespace cuco::hyperloglog_ns::detail
\ No newline at end of file
diff --git a/include/cuco/detail/hyperloglog/storage.cuh b/include/cuco/detail/hyperloglog/storage.cuh
new file mode 100644
index 000000000..195bdbe1c
--- /dev/null
+++ b/include/cuco/detail/hyperloglog/storage.cuh
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuda/std/array>
+
+#include <cstdint>
+
+namespace cuco::detail {
+/**
+ * @brief Fixed-size `int` array used as hyperloglog storage.
+ *
+ * The array holds `2^Precision` elements and the type is aligned to `4 * sizeof(int)` bytes.
+ *
+ * @tparam Precision Base-2 logarithm of the number of array elements
+ */
+template <std::int32_t Precision>
+struct alignas(sizeof(int) * 4) hyperloglog_storage
+  : public cuda::std::array<int, 1ull << Precision> {};
+}  // namespace cuco::detail
diff --git a/include/cuco/detail/hyperloglog/tuning.cuh b/include/cuco/detail/hyperloglog/tuning.cuh
new file mode 100644
index 000000000..f49e43e24
--- /dev/null
+++ b/include/cuco/detail/hyperloglog/tuning.cuh
@@ -0,0 +1,2577 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuda/std/array>
+
+namespace cuco::hyperloglog_ns::detail {
+
+// TODO this will spawn one copy of each array in every TU :(
+// TODO use float instead of double?
+// TODO use __constant__?
+// Declaration specifiers shared by every tuning table in this file. No template arguments are
+// given for `cuda::std::array`, so the element type and size of each table are deduced from its
+// initializer list. The `#ifndef` guard lets a definition made before this point take precedence.
+#ifndef CUCO_HLL_TUNING_ARR_DECL
+#define CUCO_HLL_TUNING_ARR_DECL __device__ static cuda::std::array constexpr
+#endif
+
+// Table of 15 threshold values; the element type (double) and size are deduced from the
+// initializer list.
+// NOTE(review): presumably one entry per precision, starting at p = 4 like the
+// `raw_estimate_data_p<N>` tables below — confirm against the code that indexes this array.
+CUCO_HLL_TUNING_ARR_DECL thresholds{10.0,
+                                    20.0,
+                                    40.0,
+                                    80.0,
+                                    220.0,
+                                    400.0,
+                                    900.0,
+                                    1800.0,
+                                    3100.0,
+                                    6500.0,
+                                    15500.0,
+                                    20000.0,
+                                    50000.0,
+                                    120000.0,
+                                    350000.0};
+
+// HLL++ uses an interpolation method over the raw estimated cardinality to select the optimal bias.
+// Each `raw_estimate_data_p<N>` table below holds the raw-estimate interpolation points for
+// precision N. Parameters/interpolation points taken from
+// https://docs.google.com/document/d/1gyjfMHy43U9OWBXxfaeG-3MjGzejW1dlpyMwEYAAWEI/mobilebasic
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p4{
+  11.0,    11.717,  12.207,  12.7896, 13.2882, 13.8204, 14.3772, 14.9342, 15.5202, 16.161,
+  16.7722, 17.4636, 18.0396, 18.6766, 19.3566, 20.0454, 20.7936, 21.4856, 22.2666, 22.9946,
+  23.766,  24.4692, 25.3638, 26.0764, 26.7864, 27.7602, 28.4814, 29.433,  30.2926, 31.0664,
+  31.9996, 32.7956, 33.5366, 34.5894, 35.5738, 36.2698, 37.3682, 38.0544, 39.2342, 40.0108,
+  40.7966, 41.9298, 42.8704, 43.6358, 44.5194, 45.773,  46.6772, 47.6174, 48.4888, 49.3304,
+  50.2506, 51.4996, 52.3824, 53.3078, 54.3984, 55.5838, 56.6618, 57.2174, 58.3514, 59.0802,
+  60.1482, 61.0376, 62.3598, 62.8078, 63.9744, 64.914,  65.781,  67.1806, 68.0594, 68.8446,
+  69.7928, 70.8248, 71.8324, 72.8598, 73.6246, 74.7014, 75.393,  76.6708, 77.2394};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p5{
+  23.0,     23.1194,  23.8208,  24.2318,  24.77,    25.2436,  25.7774,  26.2848,  26.8224,
+  27.3742,  27.9336,  28.503,   29.0494,  29.6292,  30.2124,  30.798,   31.367,   31.9728,
+  32.5944,  33.217,   33.8438,  34.3696,  35.0956,  35.7044,  36.324,   37.0668,  37.6698,
+  38.3644,  39.049,   39.6918,  40.4146,  41.082,   41.687,   42.5398,  43.2462,  43.857,
+  44.6606,  45.4168,  46.1248,  46.9222,  47.6804,  48.447,   49.3454,  49.9594,  50.7636,
+  51.5776,  52.331,   53.19,    53.9676,  54.7564,  55.5314,  56.4442,  57.3708,  57.9774,
+  58.9624,  59.8796,  60.755,   61.472,   62.2076,  63.1024,  63.8908,  64.7338,  65.7728,
+  66.629,   67.413,   68.3266,  69.1524,  70.2642,  71.1806,  72.0566,  72.9192,  73.7598,
+  74.3516,  75.5802,  76.4386,  77.4916,  78.1524,  79.1892,  79.8414,  80.8798,  81.8376,
+  82.4698,  83.7656,  84.331,   85.5914,  86.6012,  87.7016,  88.5582,  89.3394,  90.3544,
+  91.4912,  92.308,   93.3552,  93.9746,  95.2052,  95.727,   97.1322,  98.3944,  98.7588,
+  100.242,  101.1914, 102.2538, 102.8776, 103.6292, 105.1932, 105.9152, 107.0868, 107.6728,
+  108.7144, 110.3114, 110.8716, 111.245,  112.7908, 113.7064, 114.636,  115.7464, 116.1788,
+  117.7464, 118.4896, 119.6166, 120.5082, 121.7798, 122.9028, 123.4426, 124.8854, 125.705,
+  126.4652, 128.3464, 128.3462, 130.0398, 131.0342, 131.0042, 132.4766, 133.511,  134.7252,
+  135.425,  136.5172, 138.0572, 138.6694, 139.3712, 140.8598, 141.4594, 142.554,  143.4006,
+  144.7374, 146.1634, 146.8994, 147.605,  147.9304, 149.1636, 150.2468, 151.5876, 152.2096,
+  153.7032, 154.7146, 155.807,  156.9228, 157.0372, 158.5852};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p6{
+  46.0,     46.1902,  47.271,   47.8358,  48.8142,  49.2854,  50.317,   51.354,   51.8924,
+  52.9436,  53.4596,  54.5262,  55.6248,  56.1574,  57.2822,  57.837,   58.9636,  60.074,
+  60.7042,  61.7976,  62.4772,  63.6564,  64.7942,  65.5004,  66.686,   67.291,   68.5672,
+  69.8556,  70.4982,  71.8204,  72.4252,  73.7744,  75.0786,  75.8344,  77.0294,  77.8098,
+  79.0794,  80.5732,  81.1878,  82.5648,  83.2902,  84.6784,  85.3352,  86.8946,  88.3712,
+  89.0852,  90.499,   91.2686,  92.6844,  94.2234,  94.9732,  96.3356,  97.2286,  98.7262,
+  100.3284, 101.1048, 102.5962, 103.3562, 105.1272, 106.4184, 107.4974, 109.0822, 109.856,
+  111.48,   113.2834, 114.0208, 115.637,  116.5174, 118.0576, 119.7476, 120.427,  122.1326,
+  123.2372, 125.2788, 126.6776, 127.7926, 129.1952, 129.9564, 131.6454, 133.87,   134.5428,
+  136.2,    137.0294, 138.6278, 139.6782, 141.792,  143.3516, 144.2832, 146.0394, 147.0748,
+  148.4912, 150.849,  151.696,  153.5404, 154.073,  156.3714, 157.7216, 158.7328, 160.4208,
+  161.4184, 163.9424, 165.2772, 166.411,  168.1308, 168.769,  170.9258, 172.6828, 173.7502,
+  175.706,  176.3886, 179.0186, 180.4518, 181.927,  183.4172, 184.4114, 186.033,  188.5124,
+  189.5564, 191.6008, 192.4172, 193.8044, 194.997,  197.4548, 198.8948, 200.2346, 202.3086,
+  203.1548, 204.8842, 206.6508, 206.6772, 209.7254, 210.4752, 212.7228, 214.6614, 215.1676,
+  217.793,  218.0006, 219.9052, 221.66,   223.5588, 225.1636, 225.6882, 227.7126, 229.4502,
+  231.1978, 232.9756, 233.1654, 236.727,  238.1974, 237.7474, 241.1346, 242.3048, 244.1948,
+  245.3134, 246.879,  249.1204, 249.853,  252.6792, 253.857,  254.4486, 257.2362, 257.9534,
+  260.0286, 260.5632, 262.663,  264.723,  265.7566, 267.2566, 267.1624, 270.62,   272.8216,
+  273.2166, 275.2056, 276.2202, 278.3726, 280.3344, 281.9284, 283.9728, 284.1924, 286.4872,
+  287.587,  289.807,  291.1206, 292.769,  294.8708, 296.665,  297.1182, 299.4012, 300.6352,
+  302.1354, 304.1756, 306.1606, 307.3462, 308.5214, 309.4134, 310.8352, 313.9684, 315.837,
+  316.7796, 318.9858};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p7{
+  92.0,     93.4934,  94.9758,  96.4574,  97.9718,  99.4954,  101.5302, 103.0756, 104.6374,
+  106.1782, 107.7888, 109.9522, 111.592,  113.2532, 114.9086, 116.5938, 118.9474, 120.6796,
+  122.4394, 124.2176, 125.9768, 128.4214, 130.2528, 132.0102, 133.8658, 135.7278, 138.3044,
+  140.1316, 142.093,  144.0032, 145.9092, 148.6306, 150.5294, 152.5756, 154.6508, 156.662,
+  159.552,  161.3724, 163.617,  165.5754, 167.7872, 169.8444, 172.7988, 174.8606, 177.2118,
+  179.3566, 181.4476, 184.5882, 186.6816, 189.0824, 191.0258, 193.6048, 196.4436, 198.7274,
+  200.957,  203.147,  205.4364, 208.7592, 211.3386, 213.781,  215.8028, 218.656,  221.6544,
+  223.996,  226.4718, 229.1544, 231.6098, 234.5956, 237.0616, 239.5758, 242.4878, 244.5244,
+  248.2146, 250.724,  252.8722, 255.5198, 258.0414, 261.941,  264.9048, 266.87,   269.4304,
+  272.028,  274.4708, 278.37,   281.0624, 283.4668, 286.5532, 289.4352, 293.2564, 295.2744,
+  298.2118, 300.7472, 304.1456, 307.2928, 309.7504, 312.5528, 315.979,  318.2102, 322.1834,
+  324.3494, 327.325,  330.6614, 332.903,  337.2544, 339.9042, 343.215,  345.2864, 348.0814,
+  352.6764, 355.301,  357.139,  360.658,  363.1732, 366.5902, 369.9538, 373.0828, 375.922,
+  378.9902, 382.7328, 386.4538, 388.1136, 391.2234, 394.0878, 396.708,  401.1556, 404.1852,
+  406.6372, 409.6822, 412.7796, 416.6078, 418.4916, 422.131,  424.5376, 428.1988, 432.211,
+  434.4502, 438.5282, 440.912,  444.0448, 447.7432, 450.8524, 453.7988, 456.7858, 458.8868,
+  463.9886, 466.5064, 468.9124, 472.6616, 475.4682, 478.582,  481.304,  485.2738, 488.6894,
+  490.329,  496.106,  497.6908, 501.1374, 504.5322, 506.8848, 510.3324, 513.4512, 516.179,
+  520.4412, 522.6066, 526.167,  528.7794, 533.379,  536.067,  538.46,   542.9116, 545.692,
+  547.9546, 552.493,  555.2722, 557.335,  562.449,  564.2014, 569.0738, 571.0974, 574.8564,
+  578.2996, 581.409,  583.9704, 585.8098, 589.6528, 594.5998, 595.958,  600.068,  603.3278,
+  608.2016, 609.9632, 612.864,  615.43,   620.7794, 621.272,  625.8644, 629.206,  633.219,
+  634.5154, 638.6102};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p8{
+  184.2152,  187.2454,  190.2096,  193.6652,  196.6312,  199.6822,  203.249,   206.3296,  210.0038,
+  213.2074,  216.4612,  220.27,    223.5178,  227.4412,  230.8032,  234.1634,  238.1688,  241.6074,
+  245.6946,  249.2664,  252.8228,  257.0432,  260.6824,  264.9464,  268.6268,  272.2626,  276.8376,
+  280.4034,  284.8956,  288.8522,  292.7638,  297.3552,  301.3556,  305.7526,  309.9292,  313.8954,
+  318.8198,  322.7668,  327.298,   331.6688,  335.9466,  340.9746,  345.1672,  349.3474,  354.3028,
+  358.8912,  364.114,   368.4646,  372.9744,  378.4092,  382.6022,  387.843,   392.5684,  397.1652,
+  402.5426,  407.4152,  412.5388,  417.3592,  422.1366,  427.486,   432.3918,  437.5076,  442.509,
+  447.3834,  453.3498,  458.0668,  463.7346,  469.1228,  473.4528,  479.7,     484.644,   491.0518,
+  495.5774,  500.9068,  506.432,   512.1666,  517.434,   522.6644,  527.4894,  533.6312,  538.3804,
+  544.292,   550.5496,  556.0234,  562.8206,  566.6146,  572.4188,  579.117,   583.6762,  590.6576,
+  595.7864,  601.509,   607.5334,  612.9204,  619.772,   624.2924,  630.8654,  636.1836,  642.745,
+  649.1316,  655.0386,  660.0136,  666.6342,  671.6196,  678.1866,  684.4282,  689.3324,  695.4794,
+  702.5038,  708.129,   713.528,   720.3204,  726.463,   732.7928,  739.123,   744.7418,  751.2192,
+  756.5102,  762.6066,  769.0184,  775.2224,  781.4014,  787.7618,  794.1436,  798.6506,  805.6378,
+  811.766,   819.7514,  824.5776,  828.7322,  837.8048,  843.6302,  849.9336,  854.4798,  861.3388,
+  867.9894,  873.8196,  880.3136,  886.2308,  892.4588,  899.0816,  905.4076,  912.0064,  917.3878,
+  923.619,   929.998,   937.3482,  943.9506,  947.991,   955.1144,  962.203,   968.8222,  975.7324,
+  981.7826,  988.7666,  994.2648,  1000.3128, 1007.4082, 1013.7536, 1020.3376, 1026.7156, 1031.7478,
+  1037.4292, 1045.393,  1051.2278, 1058.3434, 1062.8726, 1071.884,  1076.806,  1082.9176, 1089.1678,
+  1095.5032, 1102.525,  1107.2264, 1115.315,  1120.93,   1127.252,  1134.1496, 1139.0408, 1147.5448,
+  1153.3296, 1158.1974, 1166.5262, 1174.3328, 1175.657,  1184.4222, 1190.9172, 1197.1292, 1204.4606,
+  1210.4578, 1218.8728, 1225.3336, 1226.6592, 1236.5768, 1241.363,  1249.4074, 1254.6566, 1260.8014,
+  1266.5454, 1274.5192};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p9{
+  369.0,     374.8294,  381.2452,  387.6698,  394.1464,  400.2024,  406.8782,  413.6598,  420.462,
+  427.2826,  433.7102,  440.7416,  447.9366,  455.1046,  462.285,   469.0668,  476.306,   483.8448,
+  491.301,   498.9886,  506.2422,  513.8138,  521.7074,  529.7428,  537.8402,  545.1664,  553.3534,
+  561.594,   569.6886,  577.7876,  585.65,    594.228,   602.8036,  611.1666,  620.0818,  628.0824,
+  637.2574,  646.302,   655.1644,  664.0056,  672.3802,  681.7192,  690.5234,  700.2084,  708.831,
+  718.485,   728.1112,  737.4764,  746.76,    756.3368,  766.5538,  775.5058,  785.2646,  795.5902,
+  804.3818,  814.8998,  824.9532,  835.2062,  845.2798,  854.4728,  864.9582,  875.3292,  886.171,
+  896.781,   906.5716,  916.7048,  927.5322,  937.875,   949.3972,  958.3464,  969.7274,  980.2834,
+  992.1444,  1003.4264, 1013.0166, 1024.018,  1035.0438, 1046.34,   1057.6856, 1068.9836, 1079.0312,
+  1091.677,  1102.3188, 1113.4846, 1124.4424, 1135.739,  1147.1488, 1158.9202, 1169.406,  1181.5342,
+  1193.2834, 1203.8954, 1216.3286, 1226.2146, 1239.6684, 1251.9946, 1262.123,  1275.4338, 1285.7378,
+  1296.076,  1308.9692, 1320.4964, 1333.0998, 1343.9864, 1357.7754, 1368.3208, 1380.4838, 1392.7388,
+  1406.0758, 1416.9098, 1428.9728, 1440.9228, 1453.9292, 1462.617,  1476.05,   1490.2996, 1500.6128,
+  1513.7392, 1524.5174, 1536.6322, 1548.2584, 1562.3766, 1572.423,  1587.1232, 1596.5164, 1610.5938,
+  1622.5972, 1633.1222, 1647.7674, 1658.5044, 1671.57,   1683.7044, 1695.4142, 1708.7102, 1720.6094,
+  1732.6522, 1747.841,  1756.4072, 1769.9786, 1782.3276, 1797.5216, 1808.3186, 1819.0694, 1834.354,
+  1844.575,  1856.2808, 1871.1288, 1880.7852, 1893.9622, 1906.3418, 1920.6548, 1932.9302, 1945.8584,
+  1955.473,  1968.8248, 1980.6446, 1995.9598, 2008.349,  2019.8556, 2033.0334, 2044.0206, 2059.3956,
+  2069.9174, 2082.6084, 2093.7036, 2106.6108, 2118.9124, 2132.301,  2144.7628, 2159.8422, 2171.0212,
+  2183.101,  2193.5112, 2208.052,  2221.3194, 2233.3282, 2247.295,  2257.7222, 2273.342,  2286.5638,
+  2299.6786, 2310.8114, 2322.3312, 2335.516,  2349.874,  2363.5968, 2373.865,  2387.1918, 2401.8328,
+  2414.8496, 2424.544,  2436.7592, 2447.1682, 2464.1958, 2474.3438, 2489.0006, 2497.4526, 2513.6586,
+  2527.19,   2540.7028, 2553.768};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p10{
+  738.1256,  750.4234,  763.1064,  775.4732,  788.4636,  801.0644,  814.488,   827.9654,  841.0832,
+  854.7864,  868.1992,  882.2176,  896.5228,  910.1716,  924.7752,  938.899,   953.6126,  968.6492,
+  982.9474,  998.5214,  1013.1064, 1028.6364, 1044.2468, 1059.4588, 1075.3832, 1091.0584, 1106.8606,
+  1123.3868, 1139.5062, 1156.1862, 1172.463,  1189.339,  1206.1936, 1223.1292, 1240.1854, 1257.2908,
+  1275.3324, 1292.8518, 1310.5204, 1328.4854, 1345.9318, 1364.552,  1381.4658, 1400.4256, 1419.849,
+  1438.152,  1456.8956, 1474.8792, 1494.118,  1513.62,   1532.5132, 1551.9322, 1570.7726, 1590.6086,
+  1610.5332, 1630.5918, 1650.4294, 1669.7662, 1690.4106, 1710.7338, 1730.9012, 1750.4486, 1770.1556,
+  1791.6338, 1812.7312, 1833.6264, 1853.9526, 1874.8742, 1896.8326, 1918.1966, 1939.5594, 1961.07,
+  1983.037,  2003.1804, 2026.071,  2047.4884, 2070.0848, 2091.2944, 2114.333,  2135.9626, 2158.2902,
+  2181.0814, 2202.0334, 2224.4832, 2246.39,   2269.7202, 2292.1714, 2314.2358, 2338.9346, 2360.891,
+  2384.0264, 2408.3834, 2430.1544, 2454.8684, 2476.9896, 2501.4368, 2522.8702, 2548.0408, 2570.6738,
+  2593.5208, 2617.0158, 2640.2302, 2664.0962, 2687.4986, 2714.2588, 2735.3914, 2759.6244, 2781.8378,
+  2808.0072, 2830.6516, 2856.2454, 2877.2136, 2903.4546, 2926.785,  2951.2294, 2976.468,  3000.867,
+  3023.6508, 3049.91,   3073.5984, 3098.162,  3121.5564, 3146.2328, 3170.9484, 3195.5902, 3221.3346,
+  3242.7032, 3271.6112, 3296.5546, 3317.7376, 3345.072,  3369.9518, 3394.326,  3418.1818, 3444.6926,
+  3469.086,  3494.2754, 3517.8698, 3544.248,  3565.3768, 3588.7234, 3616.979,  3643.7504, 3668.6812,
+  3695.72,   3719.7392, 3742.6224, 3770.4456, 3795.6602, 3819.9058, 3844.002,  3869.517,  3895.6824,
+  3920.8622, 3947.1364, 3973.985,  3995.4772, 4021.62,   4046.628,  4074.65,   4096.2256, 4121.831,
+  4146.6406, 4173.276,  4195.0744, 4223.9696, 4251.3708, 4272.9966, 4300.8046, 4326.302,  4353.1248,
+  4374.312,  4403.0322, 4426.819,  4450.0598, 4478.5206, 4504.8116, 4528.8928, 4553.9584, 4578.8712,
+  4603.8384, 4632.3872, 4655.5128, 4675.821,  4704.6222, 4731.9862, 4755.4174, 4781.2628, 4804.332,
+  4832.3048, 4862.8752, 4883.4148, 4906.9544, 4935.3516, 4954.3532, 4984.0248, 5011.217,  5035.3258,
+  5057.3672, 5084.1828};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p11{
+  1477.0,    1501.6014, 1526.5802, 1551.7942, 1577.3042, 1603.2062,  1629.8402,  1656.2292,
+  1682.9462, 1709.9926, 1737.3026, 1765.4252, 1793.0578, 1821.6092,  1849.626,   1878.5568,
+  1908.527,  1937.5154, 1967.1874, 1997.3878, 2027.37,   2058.1972,  2089.5728,  2120.1012,
+  2151.9668, 2183.292,  2216.0772, 2247.8578, 2280.6562, 2313.041,   2345.714,   2380.3112,
+  2414.1806, 2447.9854, 2481.656,  2516.346,  2551.5154, 2586.8378,  2621.7448,  2656.6722,
+  2693.5722, 2729.1462, 2765.4124, 2802.8728, 2838.898,  2876.408,   2913.4926,  2951.4938,
+  2989.6776, 3026.282,  3065.7704, 3104.1012, 3143.7388, 3181.6876,  3221.1872,  3261.5048,
+  3300.0214, 3339.806,  3381.409,  3421.4144, 3461.4294, 3502.2286,  3544.651,   3586.6156,
+  3627.337,  3670.083,  3711.1538, 3753.5094, 3797.01,   3838.6686,  3882.1678,  3922.8116,
+  3967.9978, 4009.9204, 4054.3286, 4097.5706, 4140.6014, 4185.544,   4229.5976,  4274.583,
+  4316.9438, 4361.672,  4406.2786, 4451.8628, 4496.1834, 4543.505,   4589.1816,  4632.5188,
+  4678.2294, 4724.8908, 4769.0194, 4817.052,  4861.4588, 4910.1596,  4956.4344,  5002.5238,
+  5048.13,   5093.6374, 5142.8162, 5187.7894, 5237.3984, 5285.6078,  5331.0858,  5379.1036,
+  5428.6258, 5474.6018, 5522.7618, 5571.5822, 5618.59,   5667.9992,  5714.88,    5763.454,
+  5808.6982, 5860.3644, 5910.2914, 5953.571,  6005.9232, 6055.1914,  6104.5882,  6154.5702,
+  6199.7036, 6251.1764, 6298.7596, 6350.0302, 6398.061,  6448.4694,  6495.933,   6548.0474,
+  6597.7166, 6646.9416, 6695.9208, 6742.6328, 6793.5276, 6842.1934,  6894.2372,  6945.3864,
+  6996.9228, 7044.2372, 7094.1374, 7142.2272, 7192.2942, 7238.8338,  7288.9006,  7344.0908,
+  7394.8544, 7443.5176, 7490.4148, 7542.9314, 7595.6738, 7641.9878,  7694.3688,  7743.0448,
+  7797.522,  7845.53,   7899.594,  7950.3132, 7996.455,  8050.9442,  8092.9114,  8153.1374,
+  8197.4472, 8252.8278, 8301.8728, 8348.6776, 8401.4698, 8453.551,   8504.6598,  8553.8944,
+  8604.1276, 8657.6514, 8710.3062, 8758.908,  8807.8706, 8862.1702,  8910.4668,  8960.77,
+  9007.2766, 9063.164,  9121.0534, 9164.1354, 9218.1594, 9267.767,   9319.0594,  9372.155,
+  9419.7126, 9474.3722, 9520.1338, 9572.368,  9622.7702, 9675.8448,  9726.5396,  9778.7378,
+  9827.6554, 9878.1922, 9928.7782, 9978.3984, 10026.578, 10076.5626, 10137.1618, 10177.5244,
+  10229.9176};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p12{
+  2954.0,     3003.4782,  3053.3568,  3104.3666,  3155.324,   3206.9598,  3259.648,   3312.539,
+  3366.1474,  3420.2576,  3474.8376,  3530.6076,  3586.451,   3643.38,    3700.4104,  3757.5638,
+  3815.9676,  3875.193,   3934.838,   3994.8548,  4055.018,   4117.1742,  4178.4482,  4241.1294,
+  4304.4776,  4367.4044,  4431.8724,  4496.3732,  4561.4304,  4627.5326,  4693.949,   4761.5532,
+  4828.7256,  4897.6182,  4965.5186,  5034.4528,  5104.865,   5174.7164,  5244.6828,  5316.6708,
+  5387.8312,  5459.9036,  5532.476,   5604.8652,  5679.6718,  5753.757,   5830.2072,  5905.2828,
+  5980.0434,  6056.6264,  6134.3192,  6211.5746,  6290.0816,  6367.1176,  6447.9796,  6526.5576,
+  6606.1858,  6686.9144,  6766.1142,  6847.0818,  6927.9664,  7010.9096,  7091.0816,  7175.3962,
+  7260.3454,  7344.018,   7426.4214,  7511.3106,  7596.0686,  7679.8094,  7765.818,   7852.4248,
+  7936.834,   8022.363,   8109.5066,  8200.4554,  8288.5832,  8373.366,   8463.4808,  8549.7682,
+  8642.0522,  8728.3288,  8820.9528,  8907.727,   9001.0794,  9091.2522,  9179.988,   9269.852,
+  9362.6394,  9453.642,   9546.9024,  9640.6616,  9732.6622,  9824.3254,  9917.7484,  10007.9392,
+  10106.7508, 10196.2152, 10289.8114, 10383.5494, 10482.3064, 10576.8734, 10668.7872, 10764.7156,
+  10862.0196, 10952.793,  11049.9748, 11146.0702, 11241.4492, 11339.2772, 11434.2336, 11530.741,
+  11627.6136, 11726.311,  11821.5964, 11918.837,  12015.3724, 12113.0162, 12213.0424, 12306.9804,
+  12408.4518, 12504.8968, 12604.586,  12700.9332, 12798.705,  12898.5142, 12997.0488, 13094.788,
+  13198.475,  13292.7764, 13392.9698, 13486.8574, 13590.1616, 13686.5838, 13783.6264, 13887.2638,
+  13992.0978, 14081.0844, 14189.9956, 14280.0912, 14382.4956, 14486.4384, 14588.1082, 14686.2392,
+  14782.276,  14888.0284, 14985.1864, 15088.8596, 15187.0998, 15285.027,  15383.6694, 15495.8266,
+  15591.3736, 15694.2008, 15790.3246, 15898.4116, 15997.4522, 16095.5014, 16198.8514, 16291.7492,
+  16402.6424, 16499.1266, 16606.2436, 16697.7186, 16796.3946, 16902.3376, 17005.7672, 17100.814,
+  17206.8282, 17305.8262, 17416.0744, 17508.4092, 17617.0178, 17715.4554, 17816.758,  17920.1748,
+  18012.9236, 18119.7984, 18223.2248, 18324.2482, 18426.6276, 18525.0932, 18629.8976, 18733.2588,
+  18831.0466, 18940.1366, 19032.2696, 19131.729,  19243.4864, 19349.6932, 19442.866,  19547.9448,
+  19653.2798, 19754.4034, 19854.0692, 19965.1224, 20065.1774, 20158.2212, 20253.353,  20366.3264,
+  20463.22};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p13{
+  5908.5052,  6007.2672,  6107.347,   6208.5794,  6311.2622,  6414.5514,  6519.3376,  6625.6952,
+  6732.5988,  6841.3552,  6950.5972,  7061.3082,  7173.5646,  7287.109,   7401.8216,  7516.4344,
+  7633.3802,  7751.2962,  7870.3784,  7990.292,   8110.79,    8233.4574,  8356.6036,  8482.2712,
+  8607.7708,  8735.099,   8863.1858,  8993.4746,  9123.8496,  9255.6794,  9388.5448,  9522.7516,
+  9657.3106,  9792.6094,  9930.5642,  10068.794,  10206.7256, 10347.81,   10490.3196, 10632.0778,
+  10775.9916, 10920.4662, 11066.124,  11213.073,  11358.0362, 11508.1006, 11659.1716, 11808.7514,
+  11959.4884, 12112.1314, 12265.037,  12420.3756, 12578.933,  12734.311,  12890.0006, 13047.2144,
+  13207.3096, 13368.5144, 13528.024,  13689.847,  13852.7528, 14018.3168, 14180.5372, 14346.9668,
+  14513.5074, 14677.867,  14846.2186, 15017.4186, 15184.9716, 15356.339,  15529.2972, 15697.3578,
+  15871.8686, 16042.187,  16216.4094, 16389.4188, 16565.9126, 16742.3272, 16919.0042, 17094.7592,
+  17273.965,  17451.8342, 17634.4254, 17810.5984, 17988.9242, 18171.051,  18354.7938, 18539.466,
+  18721.0408, 18904.9972, 19081.867,  19271.9118, 19451.8694, 19637.9816, 19821.2922, 20013.1292,
+  20199.3858, 20387.8726, 20572.9514, 20770.7764, 20955.1714, 21144.751,  21329.9952, 21520.709,
+  21712.7016, 21906.3868, 22096.2626, 22286.0524, 22475.051,  22665.5098, 22862.8492, 23055.5294,
+  23249.6138, 23437.848,  23636.273,  23826.093,  24020.3296, 24213.3896, 24411.7392, 24602.9614,
+  24805.7952, 24998.1552, 25193.9588, 25389.0166, 25585.8392, 25780.6976, 25981.2728, 26175.977,
+  26376.5252, 26570.1964, 26773.387,  26962.9812, 27163.0586, 27368.164,  27565.0534, 27758.7428,
+  27961.1276, 28163.2324, 28362.3816, 28565.7668, 28758.644,  28956.9768, 29163.4722, 29354.7026,
+  29561.1186, 29767.9948, 29959.9986, 30164.0492, 30366.9818, 30562.5338, 30762.9928, 30976.1592,
+  31166.274,  31376.722,  31570.3734, 31770.809,  31974.8934, 32179.5286, 32387.5442, 32582.3504,
+  32794.076,  32989.9528, 33191.842,  33392.4684, 33595.659,  33801.8672, 34000.3414, 34200.0922,
+  34402.6792, 34610.0638, 34804.0084, 35011.13,   35218.669,  35418.6634, 35619.0792, 35830.6534,
+  36028.4966, 36229.7902, 36438.6422, 36630.7764, 36833.3102, 37048.6728, 37247.3916, 37453.5904,
+  37669.3614, 37854.5526, 38059.305,  38268.0936, 38470.2516, 38674.7064, 38876.167,  39068.3794,
+  39281.9144, 39492.8566, 39684.8628, 39898.4108, 40093.1836, 40297.6858, 40489.7086, 40717.2424};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p14{
+  11817.475,  12015.0046, 12215.3792, 12417.7504, 12623.1814, 12830.0086, 13040.0072, 13252.503,
+  13466.178,  13683.2738, 13902.0344, 14123.9798, 14347.394,  14573.7784, 14802.6894, 15033.6824,
+  15266.9134, 15502.8624, 15741.4944, 15980.7956, 16223.8916, 16468.6316, 16715.733,  16965.5726,
+  17217.204,  17470.666,  17727.8516, 17986.7886, 18247.6902, 18510.9632, 18775.304,  19044.7486,
+  19314.4408, 19587.202,  19862.2576, 20135.924,  20417.0324, 20697.9788, 20979.6112, 21265.0274,
+  21550.723,  21841.6906, 22132.162,  22428.1406, 22722.127,  23020.5606, 23319.7394, 23620.4014,
+  23925.2728, 24226.9224, 24535.581,  24845.505,  25155.9618, 25470.3828, 25785.9702, 26103.7764,
+  26420.4132, 26742.0186, 27062.8852, 27388.415,  27714.6024, 28042.296,  28365.4494, 28701.1526,
+  29031.8008, 29364.2156, 29704.497,  30037.1458, 30380.111,  30723.8168, 31059.5114, 31404.9498,
+  31751.6752, 32095.2686, 32444.7792, 32794.767,  33145.204,  33498.4226, 33847.6502, 34209.006,
+  34560.849,  34919.4838, 35274.9778, 35635.1322, 35996.3266, 36359.1394, 36722.8266, 37082.8516,
+  37447.7354, 37815.9606, 38191.0692, 38559.4106, 38924.8112, 39294.6726, 39663.973,  40042.261,
+  40416.2036, 40779.2036, 41161.6436, 41540.9014, 41921.1998, 42294.7698, 42678.5264, 43061.3464,
+  43432.375,  43818.432,  44198.6598, 44583.0138, 44970.4794, 45353.924,  45729.858,  46118.2224,
+  46511.5724, 46900.7386, 47280.6964, 47668.1472, 48055.6796, 48446.9436, 48838.7146, 49217.7296,
+  49613.7796, 50010.7508, 50410.0208, 50793.7886, 51190.2456, 51583.1882, 51971.0796, 52376.5338,
+  52763.319,  53165.5534, 53556.5594, 53948.2702, 54346.352,  54748.7914, 55138.577,  55543.4824,
+  55941.1748, 56333.7746, 56745.1552, 57142.7944, 57545.2236, 57935.9956, 58348.5268, 58737.5474,
+  59158.5962, 59542.6896, 59958.8004, 60349.3788, 60755.0212, 61147.6144, 61548.194,  61946.0696,
+  62348.6042, 62763.603,  63162.781,  63560.635,  63974.3482, 64366.4908, 64771.5876, 65176.7346,
+  65597.3916, 65995.915,  66394.0384, 66822.9396, 67203.6336, 67612.2032, 68019.0078, 68420.0388,
+  68821.22,   69235.8388, 69640.0724, 70055.155,  70466.357,  70863.4266, 71276.2482, 71677.0306,
+  72080.2006, 72493.0214, 72893.5952, 73314.5856, 73714.9852, 74125.3022, 74521.2122, 74933.6814,
+  75341.5904, 75743.0244, 76166.0278, 76572.1322, 76973.1028, 77381.6284, 77800.6092, 78189.328,
+  78607.0962, 79012.2508, 79407.8358, 79825.725,  80238.701,  80646.891,  81035.6436, 81460.0448,
+  81876.3884};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p15{
+  23635.0036,  24030.8034,  24431.4744,  24837.1524,  25246.7928,  25661.326,   26081.3532,
+  26505.2806,  26933.9892,  27367.7098,  27805.318,   28248.799,   28696.4382,  29148.8244,
+  29605.5138,  30066.8668,  30534.2344,  31006.32,    31480.778,   31962.2418,  32447.3324,
+  32938.0232,  33432.731,   33930.728,   34433.9896,  34944.1402,  35457.5588,  35974.5958,
+  36497.3296,  37021.9096,  37554.326,   38088.0826,  38628.8816,  39171.3192,  39723.2326,
+  40274.5554,  40832.3142,  41390.613,   41959.5908,  42532.5466,  43102.0344,  43683.5072,
+  44266.694,   44851.2822,  45440.7862,  46038.0586,  46640.3164,  47241.064,   47846.155,
+  48454.7396,  49076.9168,  49692.542,   50317.4778,  50939.65,    51572.5596,  52210.2906,
+  52843.7396,  53481.3996,  54127.236,   54770.406,   55422.6598,  56078.7958,  56736.7174,
+  57397.6784,  58064.5784,  58730.308,   59404.9784,  60077.0864,  60751.9158,  61444.1386,
+  62115.817,   62808.7742,  63501.4774,  64187.5454,  64883.6622,  65582.7468,  66274.5318,
+  66976.9276,  67688.7764,  68402.138,   69109.6274,  69822.9706,  70543.6108,  71265.5202,
+  71983.3848,  72708.4656,  73433.384,   74158.4664,  74896.4868,  75620.9564,  76362.1434,
+  77098.3204,  77835.7662,  78582.6114,  79323.9902,  80067.8658,  80814.9246,  81567.0136,
+  82310.8536,  83061.9952,  83821.4096,  84580.8608,  85335.547,   86092.5802,  86851.6506,
+  87612.311,   88381.2016,  89146.3296,  89907.8974,  90676.846,   91451.4152,  92224.5518,
+  92995.8686,  93763.5066,  94551.2796,  95315.1944,  96096.1806,  96881.0918,  97665.679,
+  98442.68,    99229.3002,  100011.0994, 100790.6386, 101580.1564, 102377.7484, 103152.1392,
+  103944.2712, 104730.216,  105528.6336, 106324.9398, 107117.6706, 107890.3988, 108695.2266,
+  109485.238,  110294.7876, 111075.0958, 111878.0496, 112695.2864, 113464.5486, 114270.0474,
+  115068.608,  115884.3626, 116673.2588, 117483.3716, 118275.097,  119085.4092, 119879.2808,
+  120687.5868, 121499.9944, 122284.916,  123095.9254, 123912.5038, 124709.0454, 125503.7182,
+  126323.259,  127138.9412, 127943.8294, 128755.646,  129556.5354, 130375.3298, 131161.4734,
+  131971.1962, 132787.5458, 133588.1056, 134431.351,  135220.2906, 136023.398,  136846.6558,
+  137667.0004, 138463.663,  139283.7154, 140074.6146, 140901.3072, 141721.8548, 142543.2322,
+  143356.1096, 144173.7412, 144973.0948, 145794.3162, 146609.5714, 147420.003,  148237.9784,
+  149050.5696, 149854.761,  150663.1966, 151494.0754, 152313.1416, 153112.6902, 153935.7206,
+  154746.9262, 155559.547,  156401.9746, 157228.7036, 158008.7254, 158820.75,   159646.9184,
+  160470.4458, 161279.5348, 162093.3114, 162918.542,  163729.2842};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p16{
+  47271.0,     48062.3584,  48862.7074,  49673.152,   50492.8416,  51322.9514,  52161.03,
+  53009.407,   53867.6348,  54734.206,   55610.5144,  56496.2096,  57390.795,   58297.268,
+  59210.6448,  60134.665,   61068.0248,  62010.4472,  62962.5204,  63923.5742,  64895.0194,
+  65876.4182,  66862.6136,  67862.6968,  68868.8908,  69882.8544,  70911.271,   71944.0924,
+  72990.0326,  74040.692,   75100.6336,  76174.7826,  77252.5998,  78340.2974,  79438.2572,
+  80545.4976,  81657.2796,  82784.6336,  83915.515,   85059.7362,  86205.9368,  87364.4424,
+  88530.3358,  89707.3744,  90885.9638,  92080.197,   93275.5738,  94479.391,   95695.918,
+  96919.2236,  98148.4602,  99382.3474,  100625.6974, 101878.0284, 103141.6278, 104409.4588,
+  105686.2882, 106967.5402, 108261.6032, 109548.1578, 110852.0728, 112162.231,  113479.0072,
+  114806.2626, 116137.9072, 117469.5048, 118813.5186, 120165.4876, 121516.2556, 122875.766,
+  124250.5444, 125621.2222, 127003.2352, 128387.848,  129775.2644, 131181.7776, 132577.3086,
+  133979.9458, 135394.1132, 136800.9078, 138233.217,  139668.5308, 141085.212,  142535.2122,
+  143969.0684, 145420.2872, 146878.1542, 148332.7572, 149800.3202, 151269.66,   152743.6104,
+  154213.0948, 155690.288,  157169.4246, 158672.1756, 160160.059,  161650.6854, 163145.7772,
+  164645.6726, 166159.1952, 167682.1578, 169177.3328, 170700.0118, 172228.8964, 173732.6664,
+  175265.5556, 176787.799,  178317.111,  179856.6914, 181400.865,  182943.4612, 184486.742,
+  186033.4698, 187583.7886, 189148.1868, 190688.4526, 192250.1926, 193810.9042, 195354.2972,
+  196938.7682, 198493.5898, 200079.2824, 201618.912,  203205.5492, 204765.5798, 206356.1124,
+  207929.3064, 209498.7196, 211086.229,  212675.1324, 214256.7892, 215826.2392, 217412.8474,
+  218995.6724, 220618.6038, 222207.1166, 223781.0364, 225387.4332, 227005.7928, 228590.4336,
+  230217.8738, 231805.1054, 233408.9,    234995.3432, 236601.4956, 238190.7904, 239817.2548,
+  241411.2832, 243002.4066, 244640.1884, 246255.3128, 247849.3508, 249479.9734, 251106.8822,
+  252705.027,  254332.9242, 255935.129,  257526.9014, 259154.772,  260777.625,  262390.253,
+  264004.4906, 265643.59,   267255.4076, 268873.426,  270470.7252, 272106.4804, 273722.4456,
+  275337.794,  276945.7038, 278592.9154, 280204.3726, 281841.1606, 283489.171,  285130.1716,
+  286735.3362, 288364.7164, 289961.1814, 291595.5524, 293285.683,  294899.6668, 296499.3434,
+  298128.0462, 299761.8946, 301394.2424, 302997.6748, 304615.1478, 306269.7724, 307886.114,
+  309543.1028, 311153.2862, 312782.8546, 314421.2008, 316033.2438, 317692.9636, 319305.2648,
+  320948.7406, 322566.3364, 324228.4224, 325847.1542};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p17{
+  94542.0,     96125.811,   97728.019,   99348.558,   100987.9705, 102646.7565, 104324.5125,
+  106021.7435, 107736.7865, 109469.272,  111223.9465, 112995.219,  114787.432,  116593.152,
+  118422.71,   120267.2345, 122134.6765, 124020.937,  125927.2705, 127851.255,  129788.9485,
+  131751.016,  133726.8225, 135722.592,  137736.789,  139770.568,  141821.518,  143891.343,
+  145982.1415, 148095.387,  150207.526,  152355.649,  154515.6415, 156696.05,   158887.7575,
+  161098.159,  163329.852,  165569.053,  167837.4005, 170121.6165, 172420.4595, 174732.6265,
+  177062.77,   179412.502,  181774.035,  184151.939,  186551.6895, 188965.691,  191402.8095,
+  193857.949,  196305.0775, 198774.6715, 201271.2585, 203764.78,   206299.3695, 208818.1365,
+  211373.115,  213946.7465, 216532.076,  219105.541,  221714.5375, 224337.5135, 226977.5125,
+  229613.0655, 232270.2685, 234952.2065, 237645.3555, 240331.1925, 243034.517,  245756.0725,
+  248517.6865, 251232.737,  254011.3955, 256785.995,  259556.44,   262368.335,  265156.911,
+  267965.266,  270785.583,  273616.0495, 276487.4835, 279346.639,  282202.509,  285074.3885,
+  287942.2855, 290856.018,  293774.0345, 296678.5145, 299603.6355, 302552.6575, 305492.9785,
+  308466.8605, 311392.581,  314347.538,  317319.4295, 320285.9785, 323301.7325, 326298.3235,
+  329301.3105, 332301.987,  335309.791,  338370.762,  341382.923,  344431.1265, 347464.1545,
+  350507.28,   353619.2345, 356631.2005, 359685.203,  362776.7845, 365886.488,  368958.2255,
+  372060.6825, 375165.4335, 378237.935,  381328.311,  384430.5225, 387576.425,  390683.242,
+  393839.648,  396977.8425, 400101.9805, 403271.296,  406409.8425, 409529.5485, 412678.7,
+  415847.423,  419020.8035, 422157.081,  425337.749,  428479.6165, 431700.902,  434893.1915,
+  438049.582,  441210.5415, 444379.2545, 447577.356,  450741.931,  453959.548,  457137.0935,
+  460329.846,  463537.4815, 466732.3345, 469960.5615, 473164.681,  476347.6345, 479496.173,
+  482813.1645, 486025.6995, 489249.4885, 492460.1945, 495675.8805, 498908.0075, 502131.802,
+  505374.3855, 508550.9915, 511806.7305, 515026.776,  518217.0005, 521523.9855, 524705.9855,
+  527950.997,  531210.0265, 534472.497,  537750.7315, 540926.922,  544207.094,  547429.4345,
+  550666.3745, 553975.3475, 557150.7185, 560399.6165, 563662.697,  566916.7395, 570146.1215,
+  573447.425,  576689.6245, 579874.5745, 583202.337,  586503.0255, 589715.635,  592910.161,
+  596214.3885, 599488.035,  602740.92,   605983.0685, 609248.67,   612491.3605, 615787.912,
+  619107.5245, 622307.9555, 625577.333,  628840.4385, 632085.2155, 635317.6135, 638691.7195,
+  641887.467,  645139.9405, 648441.546,  651666.252,  654941.845};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p18{
+  189084.0,    192250.913,  195456.774,  198696.946,  201977.762,  205294.444,  208651.754,
+  212042.099,  215472.269,  218941.91,   222443.912,  225996.845,  229568.199,  233193.568,
+  236844.457,  240543.233,  244279.475,  248044.27,   251854.588,  255693.2,    259583.619,
+  263494.621,  267445.385,  271454.061,  275468.769,  279549.456,  283646.446,  287788.198,
+  291966.099,  296181.164,  300431.469,  304718.618,  309024.004,  313393.508,  317760.803,
+  322209.731,  326675.061,  331160.627,  335654.47,   340241.442,  344841.833,  349467.132,
+  354130.629,  358819.432,  363574.626,  368296.587,  373118.482,  377914.93,   382782.301,
+  387680.669,  392601.981,  397544.323,  402529.115,  407546.018,  412593.658,  417638.657,
+  422762.865,  427886.169,  433017.167,  438213.273,  443441.254,  448692.421,  453937.533,
+  459239.049,  464529.569,  469910.083,  475274.03,   480684.473,  486070.26,   491515.237,
+  496995.651,  502476.617,  507973.609,  513497.19,   519083.233,  524726.509,  530305.505,
+  535945.728,  541584.404,  547274.055,  552967.236,  558667.862,  564360.216,  570128.148,
+  575965.08,   581701.952,  587532.523,  593361.144,  599246.128,  605033.418,  610958.779,
+  616837.117,  622772.818,  628672.04,   634675.369,  640574.831,  646585.739,  652574.547,
+  658611.217,  664642.684,  670713.914,  676737.681,  682797.313,  688837.897,  694917.874,
+  701009.882,  707173.648,  713257.254,  719415.392,  725636.761,  731710.697,  737906.209,
+  744103.074,  750313.39,   756504.185,  762712.579,  768876.985,  775167.859,  781359.0,
+  787615.959,  793863.597,  800245.477,  806464.582,  812785.294,  819005.925,  825403.057,
+  831676.197,  837936.284,  844266.968,  850642.711,  856959.756,  863322.774,  869699.931,
+  876102.478,  882355.787,  888694.463,  895159.952,  901536.143,  907872.631,  914293.672,
+  920615.14,   927130.974,  933409.404,  939922.178,  946331.47,   952745.93,   959209.264,
+  965590.224,  972077.284,  978501.961,  984953.19,   991413.271,  997817.479,  1004222.658,
+  1010725.676, 1017177.138, 1023612.529, 1030098.236, 1036493.719, 1043112.207, 1049537.036,
+  1056008.096, 1062476.184, 1068942.337, 1075524.95,  1081932.864, 1088426.025, 1094776.005,
+  1101327.448, 1107901.673, 1114423.639, 1120884.602, 1127324.923, 1133794.24,  1140328.886,
+  1146849.376, 1153346.682, 1159836.502, 1166478.703, 1172953.304, 1179391.502, 1185950.982,
+  1192544.052, 1198913.41,  1205430.994, 1212015.525, 1218674.042, 1225121.683, 1231551.101,
+  1238126.379, 1244673.795, 1251260.649, 1257697.86,  1264320.983, 1270736.319, 1277274.694,
+  1283804.95,  1290211.514, 1296858.568, 1303455.691};
+
+// Meta array storing interpolation points for estimates for Precision=4..18
+__device__ static cuda::std::array constexpr raw_estimate_data{raw_estimate_data_p4.data(),
+                                                               raw_estimate_data_p5.data(),
+                                                               raw_estimate_data_p6.data(),
+                                                               raw_estimate_data_p7.data(),
+                                                               raw_estimate_data_p8.data(),
+                                                               raw_estimate_data_p9.data(),
+                                                               raw_estimate_data_p10.data(),
+                                                               raw_estimate_data_p11.data(),
+                                                               raw_estimate_data_p12.data(),
+                                                               raw_estimate_data_p13.data(),
+                                                               raw_estimate_data_p14.data(),
+                                                               raw_estimate_data_p15.data(),
+                                                               raw_estimate_data_p16.data(),
+                                                               raw_estimate_data_p17.data(),
+                                                               raw_estimate_data_p18.data()};
+
+CUCO_HLL_TUNING_ARR_DECL bias_data_p4{10.0,
+                                      9.717,
+                                      9.207,
+                                      8.7896,
+                                      8.2882,
+                                      7.8204,
+                                      7.3772,
+                                      6.9342,
+                                      6.5202,
+                                      6.161,
+                                      5.7722,
+                                      5.4636,
+                                      5.0396,
+                                      4.6766,
+                                      4.3566,
+                                      4.0454,
+                                      3.7936,
+                                      3.4856,
+                                      3.2666,
+                                      2.9946,
+                                      2.766,
+                                      2.4692,
+                                      2.3638,
+                                      2.0764,
+                                      1.7864,
+                                      1.7602,
+                                      1.4814,
+                                      1.433,
+                                      1.2926,
+                                      1.0664,
+                                      0.999600000000001,
+                                      0.7956,
+                                      0.5366,
+                                      0.589399999999998,
+                                      0.573799999999999,
+                                      0.269799999999996,
+                                      0.368200000000002,
+                                      0.0544000000000011,
+                                      0.234200000000001,
+                                      0.0108000000000033,
+                                      -0.203400000000002,
+                                      -0.0701999999999998,
+                                      -0.129600000000003,
+                                      -0.364199999999997,
+                                      -0.480600000000003,
+                                      -0.226999999999997,
+                                      -0.322800000000001,
+                                      -0.382599999999996,
+                                      -0.511200000000002,
+                                      -0.669600000000003,
+                                      -0.749400000000001,
+                                      -0.500399999999999,
+                                      -0.617600000000003,
+                                      -0.6922,
+                                      -0.601599999999998,
+                                      -0.416200000000003,
+                                      -0.338200000000001,
+                                      -0.782600000000002,
+                                      -0.648600000000002,
+                                      -0.919800000000002,
+                                      -0.851799999999997,
+                                      -0.962400000000002,
+                                      -0.6402,
+                                      -1.1922,
+                                      -1.0256,
+                                      -1.086,
+                                      -1.21899999999999,
+                                      -0.819400000000002,
+                                      -0.940600000000003,
+                                      -1.1554,
+                                      -1.2072,
+                                      -1.1752,
+                                      -1.16759999999999,
+                                      -1.14019999999999,
+                                      -1.3754,
+                                      -1.29859999999999,
+                                      -1.607,
+                                      -1.3292,
+                                      -1.7606};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p5{22.0,
+                                      21.1194,
+                                      20.8208,
+                                      20.2318,
+                                      19.77,
+                                      19.2436,
+                                      18.7774,
+                                      18.2848,
+                                      17.8224,
+                                      17.3742,
+                                      16.9336,
+                                      16.503,
+                                      16.0494,
+                                      15.6292,
+                                      15.2124,
+                                      14.798,
+                                      14.367,
+                                      13.9728,
+                                      13.5944,
+                                      13.217,
+                                      12.8438,
+                                      12.3696,
+                                      12.0956,
+                                      11.7044,
+                                      11.324,
+                                      11.0668,
+                                      10.6698,
+                                      10.3644,
+                                      10.049,
+                                      9.6918,
+                                      9.4146,
+                                      9.082,
+                                      8.687,
+                                      8.5398,
+                                      8.2462,
+                                      7.857,
+                                      7.6606,
+                                      7.4168,
+                                      7.1248,
+                                      6.9222,
+                                      6.6804,
+                                      6.447,
+                                      6.3454,
+                                      5.9594,
+                                      5.7636,
+                                      5.5776,
+                                      5.331,
+                                      5.19,
+                                      4.9676,
+                                      4.7564,
+                                      4.5314,
+                                      4.4442,
+                                      4.3708,
+                                      3.9774,
+                                      3.9624,
+                                      3.8796,
+                                      3.755,
+                                      3.472,
+                                      3.2076,
+                                      3.1024,
+                                      2.8908,
+                                      2.7338,
+                                      2.7728,
+                                      2.629,
+                                      2.413,
+                                      2.3266,
+                                      2.1524,
+                                      2.2642,
+                                      2.1806,
+                                      2.0566,
+                                      1.9192,
+                                      1.7598,
+                                      1.3516,
+                                      1.5802,
+                                      1.43859999999999,
+                                      1.49160000000001,
+                                      1.1524,
+                                      1.1892,
+                                      0.841399999999993,
+                                      0.879800000000003,
+                                      0.837599999999995,
+                                      0.469800000000006,
+                                      0.765600000000006,
+                                      0.331000000000003,
+                                      0.591399999999993,
+                                      0.601200000000006,
+                                      0.701599999999999,
+                                      0.558199999999999,
+                                      0.339399999999998,
+                                      0.354399999999998,
+                                      0.491200000000006,
+                                      0.308000000000007,
+                                      0.355199999999996,
+                                      -0.0254000000000048,
+                                      0.205200000000005,
+                                      -0.272999999999996,
+                                      0.132199999999997,
+                                      0.394400000000005,
+                                      -0.241200000000006,
+                                      0.242000000000004,
+                                      0.191400000000002,
+                                      0.253799999999998,
+                                      -0.122399999999999,
+                                      -0.370800000000003,
+                                      0.193200000000004,
+                                      -0.0848000000000013,
+                                      0.0867999999999967,
+                                      -0.327200000000005,
+                                      -0.285600000000002,
+                                      0.311400000000006,
+                                      -0.128399999999999,
+                                      -0.754999999999995,
+                                      -0.209199999999996,
+                                      -0.293599999999998,
+                                      -0.364000000000004,
+                                      -0.253600000000006,
+                                      -0.821200000000005,
+                                      -0.253600000000006,
+                                      -0.510400000000004,
+                                      -0.383399999999995,
+                                      -0.491799999999998,
+                                      -0.220200000000006,
+                                      -0.0972000000000008,
+                                      -0.557400000000001,
+                                      -0.114599999999996,
+                                      -0.295000000000002,
+                                      -0.534800000000004,
+                                      0.346399999999988,
+                                      -0.65379999999999,
+                                      0.0398000000000138,
+                                      0.0341999999999985,
+                                      -0.995800000000003,
+                                      -0.523400000000009,
+                                      -0.489000000000004,
+                                      -0.274799999999999,
+                                      -0.574999999999989,
+                                      -0.482799999999997,
+                                      0.0571999999999946,
+                                      -0.330600000000004,
+                                      -0.628800000000012,
+                                      -0.140199999999993,
+                                      -0.540600000000012,
+                                      -0.445999999999998,
+                                      -0.599400000000003,
+                                      -0.262599999999992,
+                                      0.163399999999996,
+                                      -0.100599999999986,
+                                      -0.39500000000001,
+                                      -1.06960000000001,
+                                      -0.836399999999998,
+                                      -0.753199999999993,
+                                      -0.412399999999991,
+                                      -0.790400000000005,
+                                      -0.29679999999999,
+                                      -0.28540000000001,
+                                      -0.193000000000012,
+                                      -0.0772000000000048,
+                                      -0.962799999999987,
+                                      -0.414800000000014};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p6{45.0,
+                                      44.1902,
+                                      43.271,
+                                      42.8358,
+                                      41.8142,
+                                      41.2854,
+                                      40.317,
+                                      39.354,
+                                      38.8924,
+                                      37.9436,
+                                      37.4596,
+                                      36.5262,
+                                      35.6248,
+                                      35.1574,
+                                      34.2822,
+                                      33.837,
+                                      32.9636,
+                                      32.074,
+                                      31.7042,
+                                      30.7976,
+                                      30.4772,
+                                      29.6564,
+                                      28.7942,
+                                      28.5004,
+                                      27.686,
+                                      27.291,
+                                      26.5672,
+                                      25.8556,
+                                      25.4982,
+                                      24.8204,
+                                      24.4252,
+                                      23.7744,
+                                      23.0786,
+                                      22.8344,
+                                      22.0294,
+                                      21.8098,
+                                      21.0794,
+                                      20.5732,
+                                      20.1878,
+                                      19.5648,
+                                      19.2902,
+                                      18.6784,
+                                      18.3352,
+                                      17.8946,
+                                      17.3712,
+                                      17.0852,
+                                      16.499,
+                                      16.2686,
+                                      15.6844,
+                                      15.2234,
+                                      14.9732,
+                                      14.3356,
+                                      14.2286,
+                                      13.7262,
+                                      13.3284,
+                                      13.1048,
+                                      12.5962,
+                                      12.3562,
+                                      12.1272,
+                                      11.4184,
+                                      11.4974,
+                                      11.0822,
+                                      10.856,
+                                      10.48,
+                                      10.2834,
+                                      10.0208,
+                                      9.637,
+                                      9.51739999999999,
+                                      9.05759999999999,
+                                      8.74760000000001,
+                                      8.42700000000001,
+                                      8.1326,
+                                      8.2372,
+                                      8.2788,
+                                      7.6776,
+                                      7.79259999999999,
+                                      7.1952,
+                                      6.9564,
+                                      6.6454,
+                                      6.87,
+                                      6.5428,
+                                      6.19999999999999,
+                                      6.02940000000001,
+                                      5.62780000000001,
+                                      5.6782,
+                                      5.792,
+                                      5.35159999999999,
+                                      5.28319999999999,
+                                      5.0394,
+                                      5.07480000000001,
+                                      4.49119999999999,
+                                      4.84899999999999,
+                                      4.696,
+                                      4.54040000000001,
+                                      4.07300000000001,
+                                      4.37139999999999,
+                                      3.7216,
+                                      3.7328,
+                                      3.42080000000001,
+                                      3.41839999999999,
+                                      3.94239999999999,
+                                      3.27719999999999,
+                                      3.411,
+                                      3.13079999999999,
+                                      2.76900000000001,
+                                      2.92580000000001,
+                                      2.68279999999999,
+                                      2.75020000000001,
+                                      2.70599999999999,
+                                      2.3886,
+                                      3.01859999999999,
+                                      2.45179999999999,
+                                      2.92699999999999,
+                                      2.41720000000001,
+                                      2.41139999999999,
+                                      2.03299999999999,
+                                      2.51240000000001,
+                                      2.5564,
+                                      2.60079999999999,
+                                      2.41720000000001,
+                                      1.80439999999999,
+                                      1.99700000000001,
+                                      2.45480000000001,
+                                      1.8948,
+                                      2.2346,
+                                      2.30860000000001,
+                                      2.15479999999999,
+                                      1.88419999999999,
+                                      1.6508,
+                                      0.677199999999999,
+                                      1.72540000000001,
+                                      1.4752,
+                                      1.72280000000001,
+                                      1.66139999999999,
+                                      1.16759999999999,
+                                      1.79300000000001,
+                                      1.00059999999999,
+                                      0.905200000000008,
+                                      0.659999999999997,
+                                      1.55879999999999,
+                                      1.1636,
+                                      0.688199999999995,
+                                      0.712600000000009,
+                                      0.450199999999995,
+                                      1.1978,
+                                      0.975599999999986,
+                                      0.165400000000005,
+                                      1.727,
+                                      1.19739999999999,
+                                      -0.252600000000001,
+                                      1.13460000000001,
+                                      1.3048,
+                                      1.19479999999999,
+                                      0.313400000000001,
+                                      0.878999999999991,
+                                      1.12039999999999,
+                                      0.853000000000009,
+                                      1.67920000000001,
+                                      0.856999999999999,
+                                      0.448599999999999,
+                                      1.2362,
+                                      0.953399999999988,
+                                      1.02859999999998,
+                                      0.563199999999995,
+                                      0.663000000000011,
+                                      0.723000000000013,
+                                      0.756599999999992,
+                                      0.256599999999992,
+                                      -0.837600000000009,
+                                      0.620000000000005,
+                                      0.821599999999989,
+                                      0.216600000000028,
+                                      0.205600000000004,
+                                      0.220199999999977,
+                                      0.372599999999977,
+                                      0.334400000000016,
+                                      0.928400000000011,
+                                      0.972800000000007,
+                                      0.192400000000021,
+                                      0.487199999999973,
+                                      -0.413000000000011,
+                                      0.807000000000016,
+                                      0.120600000000024,
+                                      0.769000000000005,
+                                      0.870799999999974,
+                                      0.66500000000002,
+                                      0.118200000000002,
+                                      0.401200000000017,
+                                      0.635199999999998,
+                                      0.135400000000004,
+                                      0.175599999999974,
+                                      1.16059999999999,
+                                      0.34620000000001,
+                                      0.521400000000028,
+                                      -0.586599999999976,
+                                      -1.16480000000001,
+                                      0.968399999999974,
+                                      0.836999999999989,
+                                      0.779600000000016,
+                                      0.985799999999983};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p7{91.0,
+                                      89.4934,
+                                      87.9758,
+                                      86.4574,
+                                      84.9718,
+                                      83.4954,
+                                      81.5302,
+                                      80.0756,
+                                      78.6374,
+                                      77.1782,
+                                      75.7888,
+                                      73.9522,
+                                      72.592,
+                                      71.2532,
+                                      69.9086,
+                                      68.5938,
+                                      66.9474,
+                                      65.6796,
+                                      64.4394,
+                                      63.2176,
+                                      61.9768,
+                                      60.4214,
+                                      59.2528,
+                                      58.0102,
+                                      56.8658,
+                                      55.7278,
+                                      54.3044,
+                                      53.1316,
+                                      52.093,
+                                      51.0032,
+                                      49.9092,
+                                      48.6306,
+                                      47.5294,
+                                      46.5756,
+                                      45.6508,
+                                      44.662,
+                                      43.552,
+                                      42.3724,
+                                      41.617,
+                                      40.5754,
+                                      39.7872,
+                                      38.8444,
+                                      37.7988,
+                                      36.8606,
+                                      36.2118,
+                                      35.3566,
+                                      34.4476,
+                                      33.5882,
+                                      32.6816,
+                                      32.0824,
+                                      31.0258,
+                                      30.6048,
+                                      29.4436,
+                                      28.7274,
+                                      27.957,
+                                      27.147,
+                                      26.4364,
+                                      25.7592,
+                                      25.3386,
+                                      24.781,
+                                      23.8028,
+                                      23.656,
+                                      22.6544,
+                                      21.996,
+                                      21.4718,
+                                      21.1544,
+                                      20.6098,
+                                      19.5956,
+                                      19.0616,
+                                      18.5758,
+                                      18.4878,
+                                      17.5244,
+                                      17.2146,
+                                      16.724,
+                                      15.8722,
+                                      15.5198,
+                                      15.0414,
+                                      14.941,
+                                      14.9048,
+                                      13.87,
+                                      13.4304,
+                                      13.028,
+                                      12.4708,
+                                      12.37,
+                                      12.0624,
+                                      11.4668,
+                                      11.5532,
+                                      11.4352,
+                                      11.2564,
+                                      10.2744,
+                                      10.2118,
+                                      9.74720000000002,
+                                      10.1456,
+                                      9.2928,
+                                      8.75040000000001,
+                                      8.55279999999999,
+                                      8.97899999999998,
+                                      8.21019999999999,
+                                      8.18340000000001,
+                                      7.3494,
+                                      7.32499999999999,
+                                      7.66140000000001,
+                                      6.90300000000002,
+                                      7.25439999999998,
+                                      6.9042,
+                                      7.21499999999997,
+                                      6.28640000000001,
+                                      6.08139999999997,
+                                      6.6764,
+                                      6.30099999999999,
+                                      5.13900000000001,
+                                      5.65800000000002,
+                                      5.17320000000001,
+                                      4.59019999999998,
+                                      4.9538,
+                                      5.08280000000002,
+                                      4.92200000000003,
+                                      4.99020000000002,
+                                      4.7328,
+                                      5.4538,
+                                      4.11360000000002,
+                                      4.22340000000003,
+                                      4.08780000000002,
+                                      3.70800000000003,
+                                      4.15559999999999,
+                                      4.18520000000001,
+                                      3.63720000000001,
+                                      3.68220000000002,
+                                      3.77960000000002,
+                                      3.6078,
+                                      2.49160000000001,
+                                      3.13099999999997,
+                                      2.5376,
+                                      3.19880000000001,
+                                      3.21100000000001,
+                                      2.4502,
+                                      3.52820000000003,
+                                      2.91199999999998,
+                                      3.04480000000001,
+                                      2.7432,
+                                      2.85239999999999,
+                                      2.79880000000003,
+                                      2.78579999999999,
+                                      1.88679999999999,
+                                      2.98860000000002,
+                                      2.50639999999999,
+                                      1.91239999999999,
+                                      2.66160000000002,
+                                      2.46820000000002,
+                                      1.58199999999999,
+                                      1.30399999999997,
+                                      2.27379999999999,
+                                      2.68939999999998,
+                                      1.32900000000001,
+                                      3.10599999999999,
+                                      1.69080000000002,
+                                      2.13740000000001,
+                                      2.53219999999999,
+                                      1.88479999999998,
+                                      1.33240000000001,
+                                      1.45119999999997,
+                                      1.17899999999997,
+                                      2.44119999999998,
+                                      1.60659999999996,
+                                      2.16700000000003,
+                                      0.77940000000001,
+                                      2.37900000000002,
+                                      2.06700000000001,
+                                      1.46000000000004,
+                                      2.91160000000002,
+                                      1.69200000000001,
+                                      0.954600000000028,
+                                      2.49300000000005,
+                                      2.2722,
+                                      1.33500000000004,
+                                      2.44899999999996,
+                                      1.20140000000004,
+                                      3.07380000000001,
+                                      2.09739999999999,
+                                      2.85640000000001,
+                                      2.29960000000005,
+                                      2.40899999999999,
+                                      1.97040000000004,
+                                      0.809799999999996,
+                                      1.65279999999996,
+                                      2.59979999999996,
+                                      0.95799999999997,
+                                      2.06799999999998,
+                                      2.32780000000002,
+                                      4.20159999999998,
+                                      1.96320000000003,
+                                      1.86400000000003,
+                                      1.42999999999995,
+                                      3.77940000000001,
+                                      1.27200000000005,
+                                      1.86440000000005,
+                                      2.20600000000002,
+                                      3.21900000000005,
+                                      1.5154,
+                                      2.61019999999996};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p8{183.2152,
+                                      180.2454,
+                                      177.2096,
+                                      173.6652,
+                                      170.6312,
+                                      167.6822,
+                                      164.249,
+                                      161.3296,
+                                      158.0038,
+                                      155.2074,
+                                      152.4612,
+                                      149.27,
+                                      146.5178,
+                                      143.4412,
+                                      140.8032,
+                                      138.1634,
+                                      135.1688,
+                                      132.6074,
+                                      129.6946,
+                                      127.2664,
+                                      124.8228,
+                                      122.0432,
+                                      119.6824,
+                                      116.9464,
+                                      114.6268,
+                                      112.2626,
+                                      109.8376,
+                                      107.4034,
+                                      104.8956,
+                                      102.8522,
+                                      100.7638,
+                                      98.3552,
+                                      96.3556,
+                                      93.7526,
+                                      91.9292,
+                                      89.8954,
+                                      87.8198,
+                                      85.7668,
+                                      83.298,
+                                      81.6688,
+                                      79.9466,
+                                      77.9746,
+                                      76.1672,
+                                      74.3474,
+                                      72.3028,
+                                      70.8912,
+                                      69.114,
+                                      67.4646,
+                                      65.9744,
+                                      64.4092,
+                                      62.6022,
+                                      60.843,
+                                      59.5684,
+                                      58.1652,
+                                      56.5426,
+                                      55.4152,
+                                      53.5388,
+                                      52.3592,
+                                      51.1366,
+                                      49.486,
+                                      48.3918,
+                                      46.5076,
+                                      45.509,
+                                      44.3834,
+                                      43.3498,
+                                      42.0668,
+                                      40.7346,
+                                      40.1228,
+                                      38.4528,
+                                      37.7,
+                                      36.644,
+                                      36.0518,
+                                      34.5774,
+                                      33.9068,
+                                      32.432,
+                                      32.1666,
+                                      30.434,
+                                      29.6644,
+                                      28.4894,
+                                      27.6312,
+                                      26.3804,
+                                      26.292,
+                                      25.5496000000001,
+                                      25.0234,
+                                      24.8206,
+                                      22.6146,
+                                      22.4188,
+                                      22.117,
+                                      20.6762,
+                                      20.6576,
+                                      19.7864,
+                                      19.509,
+                                      18.5334,
+                                      17.9204,
+                                      17.772,
+                                      16.2924,
+                                      16.8654,
+                                      15.1836,
+                                      15.745,
+                                      15.1316,
+                                      15.0386,
+                                      14.0136,
+                                      13.6342,
+                                      12.6196,
+                                      12.1866,
+                                      12.4281999999999,
+                                      11.3324,
+                                      10.4794000000001,
+                                      11.5038,
+                                      10.129,
+                                      9.52800000000002,
+                                      10.3203999999999,
+                                      9.46299999999997,
+                                      9.79280000000006,
+                                      9.12300000000005,
+                                      8.74180000000001,
+                                      9.2192,
+                                      7.51020000000005,
+                                      7.60659999999996,
+                                      7.01840000000004,
+                                      7.22239999999999,
+                                      7.40139999999997,
+                                      6.76179999999999,
+                                      7.14359999999999,
+                                      5.65060000000005,
+                                      5.63779999999997,
+                                      5.76599999999996,
+                                      6.75139999999999,
+                                      5.57759999999996,
+                                      3.73220000000003,
+                                      5.8048,
+                                      5.63019999999995,
+                                      4.93359999999996,
+                                      3.47979999999995,
+                                      4.33879999999999,
+                                      3.98940000000005,
+                                      3.81960000000004,
+                                      3.31359999999995,
+                                      3.23080000000004,
+                                      3.4588,
+                                      3.08159999999998,
+                                      3.4076,
+                                      3.00639999999999,
+                                      2.38779999999997,
+                                      2.61900000000003,
+                                      1.99800000000005,
+                                      3.34820000000002,
+                                      2.95060000000001,
+                                      0.990999999999985,
+                                      2.11440000000005,
+                                      2.20299999999997,
+                                      2.82219999999995,
+                                      2.73239999999998,
+                                      2.7826,
+                                      3.76660000000004,
+                                      2.26480000000004,
+                                      2.31280000000004,
+                                      2.40819999999997,
+                                      2.75360000000001,
+                                      3.33759999999995,
+                                      2.71559999999999,
+                                      1.7478000000001,
+                                      1.42920000000004,
+                                      2.39300000000003,
+                                      2.22779999999989,
+                                      2.34339999999997,
+                                      0.87259999999992,
+                                      3.88400000000001,
+                                      1.80600000000004,
+                                      1.91759999999999,
+                                      1.16779999999994,
+                                      1.50320000000011,
+                                      2.52500000000009,
+                                      0.226400000000012,
+                                      2.31500000000005,
+                                      0.930000000000064,
+                                      1.25199999999995,
+                                      2.14959999999996,
+                                      0.0407999999999902,
+                                      2.5447999999999,
+                                      1.32960000000003,
+                                      0.197400000000016,
+                                      2.52620000000002,
+                                      3.33279999999991,
+                                      -1.34300000000007,
+                                      0.422199999999975,
+                                      0.917200000000093,
+                                      1.12920000000008,
+                                      1.46060000000011,
+                                      1.45779999999991,
+                                      2.8728000000001,
+                                      3.33359999999993,
+                                      -1.34079999999994,
+                                      1.57680000000005,
+                                      0.363000000000056,
+                                      1.40740000000005,
+                                      0.656600000000026,
+                                      0.801400000000058,
+                                      -0.454600000000028,
+                                      1.51919999999996};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p9{368.0,
+                                      361.8294,
+                                      355.2452,
+                                      348.6698,
+                                      342.1464,
+                                      336.2024,
+                                      329.8782,
+                                      323.6598,
+                                      317.462,
+                                      311.2826,
+                                      305.7102,
+                                      299.7416,
+                                      293.9366,
+                                      288.1046,
+                                      282.285,
+                                      277.0668,
+                                      271.306,
+                                      265.8448,
+                                      260.301,
+                                      254.9886,
+                                      250.2422,
+                                      244.8138,
+                                      239.7074,
+                                      234.7428,
+                                      229.8402,
+                                      225.1664,
+                                      220.3534,
+                                      215.594,
+                                      210.6886,
+                                      205.7876,
+                                      201.65,
+                                      197.228,
+                                      192.8036,
+                                      188.1666,
+                                      184.0818,
+                                      180.0824,
+                                      176.2574,
+                                      172.302,
+                                      168.1644,
+                                      164.0056,
+                                      160.3802,
+                                      156.7192,
+                                      152.5234,
+                                      149.2084,
+                                      145.831,
+                                      142.485,
+                                      139.1112,
+                                      135.4764,
+                                      131.76,
+                                      129.3368,
+                                      126.5538,
+                                      122.5058,
+                                      119.2646,
+                                      116.5902,
+                                      113.3818,
+                                      110.8998,
+                                      107.9532,
+                                      105.2062,
+                                      102.2798,
+                                      99.4728,
+                                      96.9582,
+                                      94.3292,
+                                      92.171,
+                                      89.7809999999999,
+                                      87.5716,
+                                      84.7048,
+                                      82.5322,
+                                      79.875,
+                                      78.3972,
+                                      75.3464,
+                                      73.7274,
+                                      71.2834,
+                                      70.1444,
+                                      68.4263999999999,
+                                      66.0166,
+                                      64.018,
+                                      62.0437999999999,
+                                      60.3399999999999,
+                                      58.6856,
+                                      57.9836,
+                                      55.0311999999999,
+                                      54.6769999999999,
+                                      52.3188,
+                                      51.4846,
+                                      49.4423999999999,
+                                      47.739,
+                                      46.1487999999999,
+                                      44.9202,
+                                      43.4059999999999,
+                                      42.5342000000001,
+                                      41.2834,
+                                      38.8954000000001,
+                                      38.3286000000001,
+                                      36.2146,
+                                      36.6684,
+                                      35.9946,
+                                      33.123,
+                                      33.4338,
+                                      31.7378000000001,
+                                      29.076,
+                                      28.9692,
+                                      27.4964,
+                                      27.0998,
+                                      25.9864,
+                                      26.7754,
+                                      24.3208,
+                                      23.4838,
+                                      22.7388000000001,
+                                      24.0758000000001,
+                                      21.9097999999999,
+                                      20.9728,
+                                      19.9228000000001,
+                                      19.9292,
+                                      16.617,
+                                      17.05,
+                                      18.2996000000001,
+                                      15.6128000000001,
+                                      15.7392,
+                                      14.5174,
+                                      13.6322,
+                                      12.2583999999999,
+                                      13.3766000000001,
+                                      11.423,
+                                      13.1232,
+                                      9.51639999999998,
+                                      10.5938000000001,
+                                      9.59719999999993,
+                                      8.12220000000002,
+                                      9.76739999999995,
+                                      7.50440000000003,
+                                      7.56999999999994,
+                                      6.70440000000008,
+                                      6.41419999999994,
+                                      6.71019999999999,
+                                      5.60940000000005,
+                                      4.65219999999999,
+                                      6.84099999999989,
+                                      3.4072000000001,
+                                      3.97859999999991,
+                                      3.32760000000007,
+                                      5.52160000000003,
+                                      3.31860000000006,
+                                      2.06940000000009,
+                                      4.35400000000004,
+                                      1.57500000000005,
+                                      0.280799999999999,
+                                      2.12879999999996,
+                                      -0.214799999999968,
+                                      -0.0378000000000611,
+                                      -0.658200000000079,
+                                      0.654800000000023,
+                                      -0.0697999999999865,
+                                      0.858400000000074,
+                                      -2.52700000000004,
+                                      -2.1751999999999,
+                                      -3.35539999999992,
+                                      -1.04019999999991,
+                                      -0.651000000000067,
+                                      -2.14439999999991,
+                                      -1.96659999999997,
+                                      -3.97939999999994,
+                                      -0.604400000000169,
+                                      -3.08260000000018,
+                                      -3.39159999999993,
+                                      -5.29640000000018,
+                                      -5.38920000000007,
+                                      -5.08759999999984,
+                                      -4.69900000000007,
+                                      -5.23720000000003,
+                                      -3.15779999999995,
+                                      -4.97879999999986,
+                                      -4.89899999999989,
+                                      -7.48880000000008,
+                                      -5.94799999999987,
+                                      -5.68060000000014,
+                                      -6.67180000000008,
+                                      -4.70499999999993,
+                                      -7.27779999999984,
+                                      -4.6579999999999,
+                                      -4.4362000000001,
+                                      -4.32139999999981,
+                                      -5.18859999999995,
+                                      -6.66879999999992,
+                                      -6.48399999999992,
+                                      -5.1260000000002,
+                                      -4.4032000000002,
+                                      -6.13500000000022,
+                                      -5.80819999999994,
+                                      -4.16719999999987,
+                                      -4.15039999999999,
+                                      -7.45600000000013,
+                                      -7.24080000000004,
+                                      -9.83179999999993,
+                                      -5.80420000000004,
+                                      -8.6561999999999,
+                                      -6.99940000000015,
+                                      -10.5473999999999,
+                                      -7.34139999999979,
+                                      -6.80999999999995,
+                                      -6.29719999999998,
+                                      -6.23199999999997};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p10{737.1256,
+                                       724.4234,
+                                       711.1064,
+                                       698.4732,
+                                       685.4636,
+                                       673.0644,
+                                       660.488,
+                                       647.9654,
+                                       636.0832,
+                                       623.7864,
+                                       612.1992,
+                                       600.2176,
+                                       588.5228,
+                                       577.1716,
+                                       565.7752,
+                                       554.899,
+                                       543.6126,
+                                       532.6492,
+                                       521.9474,
+                                       511.5214,
+                                       501.1064,
+                                       490.6364,
+                                       480.2468,
+                                       470.4588,
+                                       460.3832,
+                                       451.0584,
+                                       440.8606,
+                                       431.3868,
+                                       422.5062,
+                                       413.1862,
+                                       404.463,
+                                       395.339,
+                                       386.1936,
+                                       378.1292,
+                                       369.1854,
+                                       361.2908,
+                                       353.3324,
+                                       344.8518,
+                                       337.5204,
+                                       329.4854,
+                                       321.9318,
+                                       314.552,
+                                       306.4658,
+                                       299.4256,
+                                       292.849,
+                                       286.152,
+                                       278.8956,
+                                       271.8792,
+                                       265.118,
+                                       258.62,
+                                       252.5132,
+                                       245.9322,
+                                       239.7726,
+                                       233.6086,
+                                       227.5332,
+                                       222.5918,
+                                       216.4294,
+                                       210.7662,
+                                       205.4106,
+                                       199.7338,
+                                       194.9012,
+                                       188.4486,
+                                       183.1556,
+                                       178.6338,
+                                       173.7312,
+                                       169.6264,
+                                       163.9526,
+                                       159.8742,
+                                       155.8326,
+                                       151.1966,
+                                       147.5594,
+                                       143.07,
+                                       140.037,
+                                       134.1804,
+                                       131.071,
+                                       127.4884,
+                                       124.0848,
+                                       120.2944,
+                                       117.333,
+                                       112.9626,
+                                       110.2902,
+                                       107.0814,
+                                       103.0334,
+                                       99.4832000000001,
+                                       96.3899999999999,
+                                       93.7202000000002,
+                                       90.1714000000002,
+                                       87.2357999999999,
+                                       85.9346,
+                                       82.8910000000001,
+                                       80.0264000000002,
+                                       78.3834000000002,
+                                       75.1543999999999,
+                                       73.8683999999998,
+                                       70.9895999999999,
+                                       69.4367999999999,
+                                       64.8701999999998,
+                                       65.0408000000002,
+                                       61.6738,
+                                       59.5207999999998,
+                                       57.0158000000001,
+                                       54.2302,
+                                       53.0962,
+                                       50.4985999999999,
+                                       52.2588000000001,
+                                       47.3914,
+                                       45.6244000000002,
+                                       42.8377999999998,
+                                       43.0072,
+                                       40.6516000000001,
+                                       40.2453999999998,
+                                       35.2136,
+                                       36.4546,
+                                       33.7849999999999,
+                                       33.2294000000002,
+                                       32.4679999999998,
+                                       30.8670000000002,
+                                       28.6507999999999,
+                                       28.9099999999999,
+                                       27.5983999999999,
+                                       26.1619999999998,
+                                       24.5563999999999,
+                                       23.2328000000002,
+                                       21.9484000000002,
+                                       21.5902000000001,
+                                       21.3346000000001,
+                                       17.7031999999999,
+                                       20.6111999999998,
+                                       19.5545999999999,
+                                       15.7375999999999,
+                                       17.0720000000001,
+                                       16.9517999999998,
+                                       15.326,
+                                       13.1817999999998,
+                                       14.6925999999999,
+                                       13.0859999999998,
+                                       13.2754,
+                                       10.8697999999999,
+                                       11.248,
+                                       7.3768,
+                                       4.72339999999986,
+                                       7.97899999999981,
+                                       8.7503999999999,
+                                       7.68119999999999,
+                                       9.7199999999998,
+                                       7.73919999999998,
+                                       5.6224000000002,
+                                       7.44560000000001,
+                                       6.6601999999998,
+                                       5.9058,
+                                       4.00199999999995,
+                                       4.51699999999983,
+                                       4.68240000000014,
+                                       3.86220000000003,
+                                       5.13639999999987,
+                                       5.98500000000013,
+                                       2.47719999999981,
+                                       2.61999999999989,
+                                       1.62800000000016,
+                                       4.65000000000009,
+                                       0.225599999999758,
+                                       0.831000000000131,
+                                       -0.359400000000278,
+                                       1.27599999999984,
+                                       -2.92559999999958,
+                                       -0.0303999999996449,
+                                       2.37079999999969,
+                                       -2.0033999999996,
+                                       0.804600000000391,
+                                       0.30199999999968,
+                                       1.1247999999996,
+                                       -2.6880000000001,
+                                       0.0321999999996478,
+                                       -1.18099999999959,
+                                       -3.9402,
+                                       -1.47940000000017,
+                                       -0.188400000000001,
+                                       -2.10720000000038,
+                                       -2.04159999999956,
+                                       -3.12880000000041,
+                                       -4.16160000000036,
+                                       -0.612799999999879,
+                                       -3.48719999999958,
+                                       -8.17900000000009,
+                                       -5.37780000000021,
+                                       -4.01379999999972,
+                                       -5.58259999999973,
+                                       -5.73719999999958,
+                                       -7.66799999999967,
+                                       -5.69520000000011,
+                                       -1.1247999999996,
+                                       -5.58520000000044,
+                                       -8.04560000000038,
+                                       -4.64840000000004,
+                                       -11.6468000000004,
+                                       -7.97519999999986,
+                                       -5.78300000000036,
+                                       -7.67420000000038,
+                                       -10.6328000000003,
+                                       -9.81720000000041};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p11{1476.0,
+                                       1449.6014,
+                                       1423.5802,
+                                       1397.7942,
+                                       1372.3042,
+                                       1347.2062,
+                                       1321.8402,
+                                       1297.2292,
+                                       1272.9462,
+                                       1248.9926,
+                                       1225.3026,
+                                       1201.4252,
+                                       1178.0578,
+                                       1155.6092,
+                                       1132.626,
+                                       1110.5568,
+                                       1088.527,
+                                       1066.5154,
+                                       1045.1874,
+                                       1024.3878,
+                                       1003.37,
+                                       982.1972,
+                                       962.5728,
+                                       942.1012,
+                                       922.9668,
+                                       903.292,
+                                       884.0772,
+                                       864.8578,
+                                       846.6562,
+                                       828.041,
+                                       809.714,
+                                       792.3112,
+                                       775.1806,
+                                       757.9854,
+                                       740.656,
+                                       724.346,
+                                       707.5154,
+                                       691.8378,
+                                       675.7448,
+                                       659.6722,
+                                       645.5722,
+                                       630.1462,
+                                       614.4124,
+                                       600.8728,
+                                       585.898,
+                                       572.408,
+                                       558.4926,
+                                       544.4938,
+                                       531.6776,
+                                       517.282,
+                                       505.7704,
+                                       493.1012,
+                                       480.7388,
+                                       467.6876,
+                                       456.1872,
+                                       445.5048,
+                                       433.0214,
+                                       420.806,
+                                       411.409,
+                                       400.4144,
+                                       389.4294,
+                                       379.2286,
+                                       369.651,
+                                       360.6156,
+                                       350.337,
+                                       342.083,
+                                       332.1538,
+                                       322.5094,
+                                       315.01,
+                                       305.6686,
+                                       298.1678,
+                                       287.8116,
+                                       280.9978,
+                                       271.9204,
+                                       265.3286,
+                                       257.5706,
+                                       249.6014,
+                                       242.544,
+                                       235.5976,
+                                       229.583,
+                                       220.9438,
+                                       214.672,
+                                       208.2786,
+                                       201.8628,
+                                       195.1834,
+                                       191.505,
+                                       186.1816,
+                                       178.5188,
+                                       172.2294,
+                                       167.8908,
+                                       161.0194,
+                                       158.052,
+                                       151.4588,
+                                       148.1596,
+                                       143.4344,
+                                       138.5238,
+                                       133.13,
+                                       127.6374,
+                                       124.8162,
+                                       118.7894,
+                                       117.3984,
+                                       114.6078,
+                                       109.0858,
+                                       105.1036,
+                                       103.6258,
+                                       98.6018000000004,
+                                       95.7618000000002,
+                                       93.5821999999998,
+                                       88.5900000000001,
+                                       86.9992000000002,
+                                       82.8800000000001,
+                                       80.4539999999997,
+                                       74.6981999999998,
+                                       74.3644000000004,
+                                       73.2914000000001,
+                                       65.5709999999999,
+                                       66.9232000000002,
+                                       65.1913999999997,
+                                       62.5882000000001,
+                                       61.5702000000001,
+                                       55.7035999999998,
+                                       56.1764000000003,
+                                       52.7596000000003,
+                                       53.0302000000001,
+                                       49.0609999999997,
+                                       48.4694,
+                                       44.933,
+                                       46.0474000000004,
+                                       44.7165999999997,
+                                       41.9416000000001,
+                                       39.9207999999999,
+                                       35.6328000000003,
+                                       35.5276000000003,
+                                       33.1934000000001,
+                                       33.2371999999996,
+                                       33.3864000000003,
+                                       33.9228000000003,
+                                       30.2371999999996,
+                                       29.1373999999996,
+                                       25.2272000000003,
+                                       24.2942000000003,
+                                       19.8338000000003,
+                                       18.9005999999999,
+                                       23.0907999999999,
+                                       21.8544000000002,
+                                       19.5176000000001,
+                                       15.4147999999996,
+                                       16.9314000000004,
+                                       18.6737999999996,
+                                       12.9877999999999,
+                                       14.3688000000002,
+                                       12.0447999999997,
+                                       15.5219999999999,
+                                       12.5299999999997,
+                                       14.5940000000001,
+                                       14.3131999999996,
+                                       9.45499999999993,
+                                       12.9441999999999,
+                                       3.91139999999996,
+                                       13.1373999999996,
+                                       5.44720000000052,
+                                       9.82779999999912,
+                                       7.87279999999919,
+                                       3.67760000000089,
+                                       5.46980000000076,
+                                       5.55099999999948,
+                                       5.65979999999945,
+                                       3.89439999999922,
+                                       3.1275999999998,
+                                       5.65140000000065,
+                                       6.3062000000009,
+                                       3.90799999999945,
+                                       1.87060000000019,
+                                       5.17020000000048,
+                                       2.46680000000015,
+                                       0.770000000000437,
+                                       -3.72340000000077,
+                                       1.16400000000067,
+                                       8.05340000000069,
+                                       0.135399999999208,
+                                       2.15940000000046,
+                                       0.766999999999825,
+                                       1.0594000000001,
+                                       3.15500000000065,
+                                       -0.287399999999252,
+                                       2.37219999999979,
+                                       -2.86620000000039,
+                                       -1.63199999999961,
+                                       -2.22979999999916,
+                                       -0.15519999999924,
+                                       -1.46039999999994,
+                                       -0.262199999999211,
+                                       -2.34460000000036,
+                                       -2.8078000000005,
+                                       -3.22179999999935,
+                                       -5.60159999999996,
+                                       -8.42200000000048,
+                                       -9.43740000000071,
+                                       0.161799999999857,
+                                       -10.4755999999998,
+                                       -10.0823999999993};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p12{2953.0,
+                                       2900.4782,
+                                       2848.3568,
+                                       2796.3666,
+                                       2745.324,
+                                       2694.9598,
+                                       2644.648,
+                                       2595.539,
+                                       2546.1474,
+                                       2498.2576,
+                                       2450.8376,
+                                       2403.6076,
+                                       2357.451,
+                                       2311.38,
+                                       2266.4104,
+                                       2221.5638,
+                                       2176.9676,
+                                       2134.193,
+                                       2090.838,
+                                       2048.8548,
+                                       2007.018,
+                                       1966.1742,
+                                       1925.4482,
+                                       1885.1294,
+                                       1846.4776,
+                                       1807.4044,
+                                       1768.8724,
+                                       1731.3732,
+                                       1693.4304,
+                                       1657.5326,
+                                       1621.949,
+                                       1586.5532,
+                                       1551.7256,
+                                       1517.6182,
+                                       1483.5186,
+                                       1450.4528,
+                                       1417.865,
+                                       1385.7164,
+                                       1352.6828,
+                                       1322.6708,
+                                       1291.8312,
+                                       1260.9036,
+                                       1231.476,
+                                       1201.8652,
+                                       1173.6718,
+                                       1145.757,
+                                       1119.2072,
+                                       1092.2828,
+                                       1065.0434,
+                                       1038.6264,
+                                       1014.3192,
+                                       988.5746,
+                                       965.0816,
+                                       940.1176,
+                                       917.9796,
+                                       894.5576,
+                                       871.1858,
+                                       849.9144,
+                                       827.1142,
+                                       805.0818,
+                                       783.9664,
+                                       763.9096,
+                                       742.0816,
+                                       724.3962,
+                                       706.3454,
+                                       688.018,
+                                       667.4214,
+                                       650.3106,
+                                       633.0686,
+                                       613.8094,
+                                       597.818,
+                                       581.4248,
+                                       563.834,
+                                       547.363,
+                                       531.5066,
+                                       520.455400000001,
+                                       505.583199999999,
+                                       488.366,
+                                       476.480799999999,
+                                       459.7682,
+                                       450.0522,
+                                       434.328799999999,
+                                       423.952799999999,
+                                       408.727000000001,
+                                       399.079400000001,
+                                       387.252200000001,
+                                       373.987999999999,
+                                       360.852000000001,
+                                       351.6394,
+                                       339.642,
+                                       330.902400000001,
+                                       322.661599999999,
+                                       311.662200000001,
+                                       301.3254,
+                                       291.7484,
+                                       279.939200000001,
+                                       276.7508,
+                                       263.215200000001,
+                                       254.811400000001,
+                                       245.5494,
+                                       242.306399999999,
+                                       234.8734,
+                                       223.787200000001,
+                                       217.7156,
+                                       212.0196,
+                                       200.793,
+                                       195.9748,
+                                       189.0702,
+                                       182.449199999999,
+                                       177.2772,
+                                       170.2336,
+                                       164.741,
+                                       158.613600000001,
+                                       155.311,
+                                       147.5964,
+                                       142.837,
+                                       137.3724,
+                                       132.0162,
+                                       130.0424,
+                                       121.9804,
+                                       120.451800000001,
+                                       114.8968,
+                                       111.585999999999,
+                                       105.933199999999,
+                                       101.705,
+                                       98.5141999999996,
+                                       95.0488000000005,
+                                       89.7880000000005,
+                                       91.4750000000004,
+                                       83.7764000000006,
+                                       80.9698000000008,
+                                       72.8574000000008,
+                                       73.1615999999995,
+                                       67.5838000000003,
+                                       62.6263999999992,
+                                       63.2638000000006,
+                                       66.0977999999996,
+                                       52.0843999999997,
+                                       58.9956000000002,
+                                       47.0912000000008,
+                                       46.4956000000002,
+                                       48.4383999999991,
+                                       47.1082000000006,
+                                       43.2392,
+                                       37.2759999999998,
+                                       40.0283999999992,
+                                       35.1864000000005,
+                                       35.8595999999998,
+                                       32.0998,
+                                       28.027,
+                                       23.6694000000007,
+                                       33.8266000000003,
+                                       26.3736000000008,
+                                       27.2008000000005,
+                                       21.3245999999999,
+                                       26.4115999999995,
+                                       23.4521999999997,
+                                       19.5013999999992,
+                                       19.8513999999996,
+                                       10.7492000000002,
+                                       18.6424000000006,
+                                       13.1265999999996,
+                                       18.2436000000016,
+                                       6.71860000000015,
+                                       3.39459999999963,
+                                       6.33759999999893,
+                                       7.76719999999841,
+                                       0.813999999998487,
+                                       3.82819999999992,
+                                       0.826199999999517,
+                                       8.07440000000133,
+                                       -1.59080000000176,
+                                       5.01780000000144,
+                                       0.455399999998917,
+                                       -0.24199999999837,
+                                       0.174800000000687,
+                                       -9.07640000000174,
+                                       -4.20160000000033,
+                                       -3.77520000000004,
+                                       -4.75179999999818,
+                                       -5.3724000000002,
+                                       -8.90680000000066,
+                                       -6.10239999999976,
+                                       -5.74120000000039,
+                                       -9.95339999999851,
+                                       -3.86339999999836,
+                                       -13.7304000000004,
+                                       -16.2710000000006,
+                                       -7.51359999999841,
+                                       -3.30679999999847,
+                                       -13.1339999999982,
+                                       -10.0551999999989,
+                                       -6.72019999999975,
+                                       -8.59660000000076,
+                                       -10.9307999999983,
+                                       -1.8775999999998,
+                                       -4.82259999999951,
+                                       -13.7788,
+                                       -21.6470000000008,
+                                       -10.6735999999983,
+                                       -15.7799999999988};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p13{5907.5052,         5802.2672,
+                                       5697.347,          5593.5794,
+                                       5491.2622,         5390.5514,
+                                       5290.3376,         5191.6952,
+                                       5093.5988,         4997.3552,
+                                       4902.5972,         4808.3082,
+                                       4715.5646,         4624.109,
+                                       4533.8216,         4444.4344,
+                                       4356.3802,         4269.2962,
+                                       4183.3784,         4098.292,
+                                       4014.79,           3932.4574,
+                                       3850.6036,         3771.2712,
+                                       3691.7708,         3615.099,
+                                       3538.1858,         3463.4746,
+                                       3388.8496,         3315.6794,
+                                       3244.5448,         3173.7516,
+                                       3103.3106,         3033.6094,
+                                       2966.5642,         2900.794,
+                                       2833.7256,         2769.81,
+                                       2707.3196,         2644.0778,
+                                       2583.9916,         2523.4662,
+                                       2464.124,          2406.073,
+                                       2347.0362,         2292.1006,
+                                       2238.1716,         2182.7514,
+                                       2128.4884,         2077.1314,
+                                       2025.037,          1975.3756,
+                                       1928.933,          1879.311,
+                                       1831.0006,         1783.2144,
+                                       1738.3096,         1694.5144,
+                                       1649.024,          1606.847,
+                                       1564.7528,         1525.3168,
+                                       1482.5372,         1443.9668,
+                                       1406.5074,         1365.867,
+                                       1329.2186,         1295.4186,
+                                       1257.9716,         1225.339,
+                                       1193.2972,         1156.3578,
+                                       1125.8686,         1091.187,
+                                       1061.4094,         1029.4188,
+                                       1000.9126,         972.3272,
+                                       944.004199999999,  915.7592,
+                                       889.965,           862.834200000001,
+                                       840.4254,          812.598399999999,
+                                       785.924200000001,  763.050999999999,
+                                       741.793799999999,  721.466,
+                                       699.040799999999,  677.997200000002,
+                                       649.866999999998,  634.911800000002,
+                                       609.8694,          591.981599999999,
+                                       570.2922,          557.129199999999,
+                                       538.3858,          521.872599999999,
+                                       502.951400000002,  495.776399999999,
+                                       475.171399999999,  459.751,
+                                       439.995200000001,  426.708999999999,
+                                       413.7016,          402.3868,
+                                       387.262599999998,  372.0524,
+                                       357.050999999999,  342.5098,
+                                       334.849200000001,  322.529399999999,
+                                       311.613799999999,  295.848000000002,
+                                       289.273000000001,  274.093000000001,
+                                       263.329600000001,  251.389599999999,
+                                       245.7392,          231.9614,
+                                       229.7952,          217.155200000001,
+                                       208.9588,          199.016599999999,
+                                       190.839199999999,  180.6976,
+                                       176.272799999999,  166.976999999999,
+                                       162.5252,          151.196400000001,
+                                       149.386999999999,  133.981199999998,
+                                       130.0586,          130.164000000001,
+                                       122.053400000001,  110.7428,
+                                       108.1276,          106.232400000001,
+                                       100.381600000001,  98.7668000000012,
+                                       86.6440000000002,  79.9768000000004,
+                                       82.4722000000002,  68.7026000000005,
+                                       70.1186000000016,  71.9948000000004,
+                                       58.998599999999,   59.0492000000013,
+                                       56.9818000000014,  47.5338000000011,
+                                       42.9928,           51.1591999999982,
+                                       37.2740000000013,  42.7220000000016,
+                                       31.3734000000004,  26.8090000000011,
+                                       25.8934000000008,  26.5286000000015,
+                                       29.5442000000003,  19.3503999999994,
+                                       26.0760000000009,  17.9527999999991,
+                                       14.8419999999969,  10.4683999999979,
+                                       8.65899999999965,  9.86720000000059,
+                                       4.34139999999752,  -0.907800000000861,
+                                       -3.32080000000133, -0.936199999996461,
+                                       -11.9916000000012, -8.87000000000262,
+                                       -6.33099999999831, -11.3366000000024,
+                                       -15.9207999999999, -9.34659999999712,
+                                       -15.5034000000014, -19.2097999999969,
+                                       -15.357799999998,  -28.2235999999975,
+                                       -30.6898000000001, -19.3271999999997,
+                                       -25.6083999999973, -24.409599999999,
+                                       -13.6385999999984, -33.4473999999973,
+                                       -32.6949999999997, -28.9063999999998,
+                                       -31.7483999999968, -32.2935999999972,
+                                       -35.8329999999987, -47.620600000002,
+                                       -39.0855999999985, -33.1434000000008,
+                                       -46.1371999999974, -37.5892000000022,
+                                       -46.8164000000033, -47.3142000000007,
+                                       -60.2914000000019, -37.7575999999972};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p14{
+  11816.475,         11605.0046,        11395.3792,        11188.7504,        10984.1814,
+  10782.0086,        10582.0072,        10384.503,         10189.178,         9996.2738,
+  9806.0344,         9617.9798,         9431.394,          9248.7784,         9067.6894,
+  8889.6824,         8712.9134,         8538.8624,         8368.4944,         8197.7956,
+  8031.8916,         7866.6316,         7703.733,          7544.5726,         7386.204,
+  7230.666,          7077.8516,         6926.7886,         6778.6902,         6631.9632,
+  6487.304,          6346.7486,         6206.4408,         6070.202,          5935.2576,
+  5799.924,          5671.0324,         5541.9788,         5414.6112,         5290.0274,
+  5166.723,          5047.6906,         4929.162,          4815.1406,         4699.127,
+  4588.5606,         4477.7394,         4369.4014,         4264.2728,         4155.9224,
+  4055.581,          3955.505,          3856.9618,         3761.3828,         3666.9702,
+  3575.7764,         3482.4132,         3395.0186,         3305.8852,         3221.415,
+  3138.6024,         3056.296,          2970.4494,         2896.1526,         2816.8008,
+  2740.2156,         2670.497,          2594.1458,         2527.111,          2460.8168,
+  2387.5114,         2322.9498,         2260.6752,         2194.2686,         2133.7792,
+  2074.767,          2015.204,          1959.4226,         1898.6502,         1850.006,
+  1792.849,          1741.4838,         1687.9778,         1638.1322,         1589.3266,
+  1543.1394,         1496.8266,         1447.8516,         1402.7354,         1361.9606,
+  1327.0692,         1285.4106,         1241.8112,         1201.6726,         1161.973,
+  1130.261,          1094.2036,         1048.2036,         1020.6436,         990.901400000002,
+  961.199800000002,  924.769800000002,  899.526400000002,  872.346400000002,  834.375,
+  810.432000000001,  780.659800000001,  756.013800000001,  733.479399999997,  707.923999999999,
+  673.858,           652.222399999999,  636.572399999997,  615.738599999997,  586.696400000001,
+  564.147199999999,  541.679600000003,  523.943599999999,  505.714599999999,  475.729599999999,
+  461.779600000002,  449.750800000002,  439.020799999998,  412.7886,          400.245600000002,
+  383.188199999997,  362.079599999997,  357.533799999997,  334.319000000003,  327.553399999997,
+  308.559399999998,  291.270199999999,  279.351999999999,  271.791400000002,  252.576999999997,
+  247.482400000001,  236.174800000001,  218.774599999997,  220.155200000001,  208.794399999999,
+  201.223599999998,  182.995600000002,  185.5268,          164.547400000003,  176.5962,
+  150.689599999998,  157.8004,          138.378799999999,  134.021200000003,  117.614399999999,
+  108.194000000003,  97.0696000000025,  89.6042000000016,  95.6030000000028,  84.7810000000027,
+  72.635000000002,   77.3482000000004,  59.4907999999996,  55.5875999999989,  50.7346000000034,
+  61.3916000000027,  50.9149999999936,  39.0384000000049,  58.9395999999979,  29.633600000001,
+  28.2032000000036,  26.0078000000067,  17.0387999999948,  9.22000000000116,  13.8387999999977,
+  8.07240000000456,  14.1549999999988,  15.3570000000036,  3.42660000000615,  6.24820000000182,
+  -2.96940000000177, -8.79940000000352, -5.97860000000219, -14.4048000000039, -3.4143999999942,
+  -13.0148000000045, -11.6977999999945, -25.7878000000055, -22.3185999999987, -24.409599999999,
+  -31.9756000000052, -18.9722000000038, -22.8678000000073, -30.8972000000067, -32.3715999999986,
+  -22.3907999999938, -43.6720000000059, -35.9038,          -39.7492000000057, -54.1641999999993,
+  -45.2749999999942, -42.2989999999991, -44.1089999999967, -64.3564000000042, -49.9551999999967,
+  -42.6116000000038};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p15{
+  23634.0036,         23210.8034,        22792.4744,        22379.1524,
+  21969.7928,         21565.326,         21165.3532,        20770.2806,
+  20379.9892,         19994.7098,        19613.318,         19236.799,
+  18865.4382,         18498.8244,        18136.5138,        17778.8668,
+  17426.2344,         17079.32,          16734.778,         16397.2418,
+  16063.3324,         15734.0232,        15409.731,         15088.728,
+  14772.9896,         14464.1402,        14157.5588,        13855.5958,
+  13559.3296,         13264.9096,        12978.326,         12692.0826,
+  12413.8816,         12137.3192,        11870.2326,        11602.5554,
+  11340.3142,         11079.613,         10829.5908,        10583.5466,
+  10334.0344,         10095.5072,        9859.694,          9625.2822,
+  9395.7862,          9174.0586,         8957.3164,         8738.064,
+  8524.155,           8313.7396,         8116.9168,         7913.542,
+  7718.4778,          7521.65,           7335.5596,         7154.2906,
+  6968.7396,          6786.3996,         6613.236,          6437.406,
+  6270.6598,          6107.7958,         5945.7174,         5787.6784,
+  5635.5784,          5482.308,          5337.9784,         5190.0864,
+  5045.9158,          4919.1386,         4771.817,          4645.7742,
+  4518.4774,          4385.5454,         4262.6622,         4142.74679999999,
+  4015.5318,          3897.9276,         3790.7764,         3685.13800000001,
+  3573.6274,          3467.9706,         3368.61079999999,  3271.5202,
+  3170.3848,          3076.4656,         2982.38400000001,  2888.4664,
+  2806.4868,          2711.9564,         2634.1434,         2551.3204,
+  2469.7662,          2396.61139999999,  2318.9902,         2243.8658,
+  2171.9246,          2105.01360000001,  2028.8536,         1960.9952,
+  1901.4096,          1841.86079999999,  1777.54700000001,  1714.5802,
+  1654.65059999999,   1596.311,          1546.2016,         1492.3296,
+  1433.8974,          1383.84600000001,  1339.4152,         1293.5518,
+  1245.8686,          1193.50659999999,  1162.27959999999,  1107.19439999999,
+  1069.18060000001,   1035.09179999999,  999.679000000004,  957.679999999993,
+  925.300199999998,   888.099400000006,  848.638600000006,  818.156400000007,
+  796.748399999997,   752.139200000005,  725.271200000003,  692.216,
+  671.633600000001,   647.939799999993,  621.670599999998,  575.398799999995,
+  561.226599999995,   532.237999999998,  521.787599999996,  483.095799999996,
+  467.049599999998,   465.286399999997,  415.548599999995,  401.047399999996,
+  380.607999999993,   377.362599999993,  347.258799999996,  338.371599999999,
+  310.096999999994,   301.409199999995,  276.280799999993,  265.586800000005,
+  258.994399999996,   223.915999999997,  215.925399999993,  213.503800000006,
+  191.045400000003,   166.718200000003,  166.259000000005,  162.941200000001,
+  148.829400000002,   141.645999999993,  123.535399999993,  122.329800000007,
+  89.473399999988,    80.1962000000058,  77.5457999999926,  59.1056000000099,
+  83.3509999999951,   52.2906000000075,  36.3979999999865,  40.6558000000077,
+  42.0003999999899,   19.6630000000005,  19.7153999999864,  -8.38539999999921,
+  -0.692799999989802, 0.854800000000978, 3.23219999999856,  -3.89040000000386,
+  -5.25880000001052,  -24.9052000000083, -22.6837999999989, -26.4286000000138,
+  -34.997000000003,   -37.0216000000073, -43.430400000012,  -58.2390000000014,
+  -68.8034000000043,  -56.9245999999985, -57.8583999999973, -77.3097999999882,
+  -73.2793999999994,  -81.0738000000129, -87.4530000000086, -65.0254000000132,
+  -57.296399999992,   -96.2746000000043, -103.25,           -96.081600000005,
+  -91.5542000000132,  -102.465200000006, -107.688599999994, -101.458000000013,
+  -109.715800000005};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p16{
+  47270.0,           46423.3584,        45585.7074,        44757.152,         43938.8416,
+  43130.9514,        42330.03,          41540.407,         40759.6348,        39988.206,
+  39226.5144,        38473.2096,        37729.795,         36997.268,         36272.6448,
+  35558.665,         34853.0248,        34157.4472,        33470.5204,        32793.5742,
+  32127.0194,        31469.4182,        30817.6136,        30178.6968,        29546.8908,
+  28922.8544,        28312.271,         27707.0924,        27114.0326,        26526.692,
+  25948.6336,        25383.7826,        24823.5998,        24272.2974,        23732.2572,
+  23201.4976,        22674.2796,        22163.6336,        21656.515,         21161.7362,
+  20669.9368,        20189.4424,        19717.3358,        19256.3744,        18795.9638,
+  18352.197,         17908.5738,        17474.391,         17052.918,         16637.2236,
+  16228.4602,        15823.3474,        15428.6974,        15043.0284,        14667.6278,
+  14297.4588,        13935.2882,        13578.5402,        13234.6032,        12882.1578,
+  12548.0728,        12219.231,         11898.0072,        11587.2626,        11279.9072,
+  10973.5048,        10678.5186,        10392.4876,        10105.2556,        9825.766,
+  9562.5444,         9294.2222,         9038.2352,         8784.848,          8533.2644,
+  8301.7776,         8058.30859999999,  7822.94579999999,  7599.11319999999,  7366.90779999999,
+  7161.217,          6957.53080000001,  6736.212,          6548.21220000001,  6343.06839999999,
+  6156.28719999999,  5975.15419999999,  5791.75719999999,  5621.32019999999,  5451.66,
+  5287.61040000001,  5118.09479999999,  4957.288,          4798.4246,         4662.17559999999,
+  4512.05900000001,  4364.68539999999,  4220.77720000001,  4082.67259999999,  3957.19519999999,
+  3842.15779999999,  3699.3328,         3583.01180000001,  3473.8964,         3338.66639999999,
+  3233.55559999999,  3117.799,          3008.111,          2909.69140000001,  2814.86499999999,
+  2719.46119999999,  2624.742,          2532.46979999999,  2444.7886,         2370.1868,
+  2272.45259999999,  2196.19260000001,  2117.90419999999,  2023.2972,         1969.76819999999,
+  1885.58979999999,  1833.2824,         1733.91200000001,  1682.54920000001,  1604.57980000001,
+  1556.11240000001,  1491.3064,         1421.71960000001,  1371.22899999999,  1322.1324,
+  1264.7892,         1196.23920000001,  1143.8474,         1088.67240000001,  1073.60380000001,
+  1023.11660000001,  959.036400000012,  927.433199999999,  906.792799999996,  853.433599999989,
+  841.873800000001,  791.1054,          756.899999999994,  704.343200000003,  672.495599999995,
+  622.790399999998,  611.254799999995,  567.283200000005,  519.406599999988,  519.188400000014,
+  495.312800000014,  451.350799999986,  443.973399999988,  431.882199999993,  392.027000000002,
+  380.924200000009,  345.128999999986,  298.901400000002,  287.771999999997,  272.625,
+  247.253000000026,  222.490600000019,  223.590000000026,  196.407599999977,  176.425999999978,
+  134.725199999986,  132.4804,          110.445599999977,  86.7939999999944,  56.7038000000175,
+  64.915399999998,   38.3726000000024,  37.1606000000029,  46.170999999973,   49.1716000000015,
+  15.3362000000197,  6.71639999997569,  -34.8185999999987, -39.4476000000141, 12.6830000000191,
+  -12.3331999999937, -50.6565999999875, -59.9538000000175, -65.1054000000004, -70.7576000000117,
+  -106.325200000021, -126.852200000023, -110.227599999984, -132.885999999999, -113.897200000007,
+  -142.713800000027, -151.145399999979, -150.799200000009, -177.756200000003, -156.036399999983,
+  -182.735199999996, -177.259399999981, -198.663600000029, -174.577600000019, -193.84580000001};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p17{
+  94541.0,           92848.811,         91174.019,         89517.558,         87879.9705,
+  86262.7565,        84663.5125,        83083.7435,        81521.7865,        79977.272,
+  78455.9465,        76950.219,         75465.432,         73994.152,         72546.71,
+  71115.2345,        69705.6765,        68314.937,         66944.2705,        65591.255,
+  64252.9485,        62938.016,         61636.8225,        60355.592,         59092.789,
+  57850.568,         56624.518,         55417.343,         54231.1415,        53067.387,
+  51903.526,         50774.649,         49657.6415,        48561.05,          47475.7575,
+  46410.159,         45364.852,         44327.053,         43318.4005,        42325.6165,
+  41348.4595,        40383.6265,        39436.77,          38509.502,         37594.035,
+  36695.939,         35818.6895,        34955.691,         34115.8095,        33293.949,
+  32465.0775,        31657.6715,        30877.2585,        30093.78,          29351.3695,
+  28594.1365,        27872.115,         27168.7465,        26477.076,         25774.541,
+  25106.5375,        24452.5135,        23815.5125,        23174.0655,        22555.2685,
+  21960.2065,        21376.3555,        20785.1925,        20211.517,         19657.0725,
+  19141.6865,        18579.737,         18081.3955,        17578.995,         17073.44,
+  16608.335,         16119.911,         15651.266,         15194.583,         14749.0495,
+  14343.4835,        13925.639,         13504.509,         13099.3885,        12691.2855,
+  12328.018,         11969.0345,        11596.5145,        11245.6355,        10917.6575,
+  10580.9785,        10277.8605,        9926.58100000001,  9605.538,          9300.42950000003,
+  8989.97850000003,  8728.73249999998,  8448.3235,         8175.31050000002,  7898.98700000002,
+  7629.79100000003,  7413.76199999999,  7149.92300000001,  6921.12650000001,  6677.1545,
+  6443.28000000003,  6278.23450000002,  6014.20049999998,  5791.20299999998,  5605.78450000001,
+  5438.48800000001,  5234.2255,         5059.6825,         4887.43349999998,  4682.935,
+  4496.31099999999,  4322.52250000002,  4191.42499999999,  4021.24200000003,  3900.64799999999,
+  3762.84250000003,  3609.98050000001,  3502.29599999997,  3363.84250000003,  3206.54849999998,
+  3079.70000000001,  2971.42300000001,  2867.80349999998,  2727.08100000001,  2630.74900000001,
+  2496.6165,         2440.902,          2356.19150000002,  2235.58199999999,  2120.54149999999,
+  2012.25449999998,  1933.35600000003,  1820.93099999998,  1761.54800000001,  1663.09350000002,
+  1578.84600000002,  1509.48149999999,  1427.3345,         1379.56150000001,  1306.68099999998,
+  1212.63449999999,  1084.17300000001,  1124.16450000001,  1060.69949999999,  1007.48849999998,
+  941.194499999983,  879.880500000028,  836.007500000007,  782.802000000025,  748.385499999975,
+  647.991500000004,  626.730500000005,  570.776000000013,  484.000500000024,  513.98550000001,
+  418.985499999952,  386.996999999974,  370.026500000036,  355.496999999974,  356.731499999994,
+  255.92200000002,   259.094000000041,  205.434499999974,  165.374500000034,  197.347500000033,
+  95.718499999959,   67.6165000000037,  54.6970000000438,  31.7395000000251,  -15.8784999999916,
+  8.42500000004657,  -26.3754999999655, -118.425500000012, -66.6629999999423, -42.9745000000112,
+  -107.364999999991, -189.839000000036, -162.611499999999, -164.964999999967, -189.079999999958,
+  -223.931499999948, -235.329999999958, -269.639500000048, -249.087999999989, -206.475499999942,
+  -283.04449999996,  -290.667000000016, -304.561499999953, -336.784499999951, -380.386500000022,
+  -283.280499999993, -364.533000000054, -389.059499999974, -364.454000000027, -415.748000000021,
+  -417.155000000028};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p18{189083.0,
+                                       185696.913,
+                                       182348.774,
+                                       179035.946,
+                                       175762.762,
+                                       172526.444,
+                                       169329.754,
+                                       166166.099,
+                                       163043.269,
+                                       159958.91,
+                                       156907.912,
+                                       153906.845,
+                                       150924.199,
+                                       147996.568,
+                                       145093.457,
+                                       142239.233,
+                                       139421.475,
+                                       136632.27,
+                                       133889.588,
+                                       131174.2,
+                                       128511.619,
+                                       125868.621,
+                                       123265.385,
+                                       120721.061,
+                                       118181.769,
+                                       115709.456,
+                                       113252.446,
+                                       110840.198,
+                                       108465.099,
+                                       106126.164,
+                                       103823.469,
+                                       101556.618,
+                                       99308.004,
+                                       97124.508,
+                                       94937.803,
+                                       92833.731,
+                                       90745.061,
+                                       88677.627,
+                                       86617.47,
+                                       84650.442,
+                                       82697.833,
+                                       80769.132,
+                                       78879.629,
+                                       77014.432,
+                                       75215.626,
+                                       73384.587,
+                                       71652.482,
+                                       69895.93,
+                                       68209.301,
+                                       66553.669,
+                                       64921.981,
+                                       63310.323,
+                                       61742.115,
+                                       60205.018,
+                                       58698.658,
+                                       57190.657,
+                                       55760.865,
+                                       54331.169,
+                                       52908.167,
+                                       51550.273,
+                                       50225.254,
+                                       48922.421,
+                                       47614.533,
+                                       46362.049,
+                                       45098.569,
+                                       43926.083,
+                                       42736.03,
+                                       41593.473,
+                                       40425.26,
+                                       39316.237,
+                                       38243.651,
+                                       37170.617,
+                                       36114.609,
+                                       35084.19,
+                                       34117.233,
+                                       33206.509,
+                                       32231.505,
+                                       31318.728,
+                                       30403.404,
+                                       29540.0550000001,
+                                       28679.236,
+                                       27825.862,
+                                       26965.216,
+                                       26179.148,
+                                       25462.08,
+                                       24645.952,
+                                       23922.523,
+                                       23198.144,
+                                       22529.128,
+                                       21762.4179999999,
+                                       21134.779,
+                                       20459.117,
+                                       19840.818,
+                                       19187.04,
+                                       18636.3689999999,
+                                       17982.831,
+                                       17439.7389999999,
+                                       16874.547,
+                                       16358.2169999999,
+                                       15835.684,
+                                       15352.914,
+                                       14823.681,
+                                       14329.313,
+                                       13816.897,
+                                       13342.874,
+                                       12880.882,
+                                       12491.648,
+                                       12021.254,
+                                       11625.392,
+                                       11293.7610000001,
+                                       10813.697,
+                                       10456.209,
+                                       10099.074,
+                                       9755.39000000001,
+                                       9393.18500000006,
+                                       9047.57900000003,
+                                       8657.98499999999,
+                                       8395.85900000005,
+                                       8033.0,
+                                       7736.95900000003,
+                                       7430.59699999995,
+                                       7258.47699999996,
+                                       6924.58200000005,
+                                       6691.29399999999,
+                                       6357.92500000005,
+                                       6202.05700000003,
+                                       5921.19700000004,
+                                       5628.28399999999,
+                                       5404.96799999999,
+                                       5226.71100000001,
+                                       4990.75600000005,
+                                       4799.77399999998,
+                                       4622.93099999998,
+                                       4472.478,
+                                       4171.78700000001,
+                                       3957.46299999999,
+                                       3868.95200000005,
+                                       3691.14300000004,
+                                       3474.63100000005,
+                                       3341.67200000002,
+                                       3109.14000000001,
+                                       3071.97400000005,
+                                       2796.40399999998,
+                                       2756.17799999996,
+                                       2611.46999999997,
+                                       2471.93000000005,
+                                       2382.26399999997,
+                                       2209.22400000005,
+                                       2142.28399999999,
+                                       2013.96100000001,
+                                       1911.18999999994,
+                                       1818.27099999995,
+                                       1668.47900000005,
+                                       1519.65800000005,
+                                       1469.67599999998,
+                                       1367.13800000004,
+                                       1248.52899999998,
+                                       1181.23600000003,
+                                       1022.71900000004,
+                                       1088.20700000005,
+                                       959.03600000008,
+                                       876.095999999903,
+                                       791.183999999892,
+                                       703.337000000058,
+                                       731.949999999953,
+                                       586.86400000006,
+                                       526.024999999907,
+                                       323.004999999888,
+                                       320.448000000091,
+                                       340.672999999952,
+                                       309.638999999966,
+                                       216.601999999955,
+                                       102.922999999952,
+                                       19.2399999999907,
+                                       -0.114000000059605,
+                                       -32.6240000000689,
+                                       -89.3179999999702,
+                                       -153.497999999905,
+                                       -64.2970000000205,
+                                       -143.695999999996,
+                                       -259.497999999905,
+                                       -253.017999999924,
+                                       -213.948000000091,
+                                       -397.590000000084,
+                                       -434.006000000052,
+                                       -403.475000000093,
+                                       -297.958000000101,
+                                       -404.317000000039,
+                                       -528.898999999976,
+                                       -506.621000000043,
+                                       -513.205000000075,
+                                       -479.351000000024,
+                                       -596.139999999898,
+                                       -527.016999999993,
+                                       -664.681000000099,
+                                       -680.306000000099,
+                                       -704.050000000047,
+                                       -850.486000000034,
+                                       -757.43200000003,
+                                       -713.308999999892};
+
+// Meta array holding each bias interpolation table's .data(); entry i is for Precision=i+4 (4..18)
+__device__ static cuda::std::array constexpr bias_data{bias_data_p4.data(),
+                                                       bias_data_p5.data(),
+                                                       bias_data_p6.data(),
+                                                       bias_data_p7.data(),
+                                                       bias_data_p8.data(),
+                                                       bias_data_p9.data(),
+                                                       bias_data_p10.data(),
+                                                       bias_data_p11.data(),
+                                                       bias_data_p12.data(),
+                                                       bias_data_p13.data(),
+                                                       bias_data_p14.data(),
+                                                       bias_data_p15.data(),
+                                                       bias_data_p16.data(),
+                                                       bias_data_p17.data(),
+                                                       bias_data_p18.data()};
+
+}  // namespace cuco::hyperloglog_ns::detail
\ No newline at end of file
diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh
new file mode 100644
index 000000000..16c7b46d3
--- /dev/null
+++ b/include/cuco/distinct_count_estimator.cuh
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuco/cuda_stream_ref.hpp>
+#include <cuco/detail/hyperloglog/hyperloglog.cuh>
+#include <cuco/distinct_count_estimator_ref.cuh>
+#include <cuco/hash_functions.cuh>
+#include <cuco/utility/allocator.hpp>
+#include <cuco/utility/cuda_thread_scope.cuh>
+
+#include <cstddef>
+#include <iterator>
+#include <memory>
+
+namespace cuco {
+template <class T,
+          int32_t Precision        = 11,
+          cuda::thread_scope Scope = cuda::thread_scope_device,
+          class Hash               = cuco::xxhash_64<T>,
+          class Allocator          = cuco::cuda_allocator<std::byte>>
+class distinct_count_estimator {
+  using impl_type = detail::hyperloglog<T, Precision, Scope, Hash, Allocator>;
+
+ public:
+  static constexpr auto thread_scope = impl_type::thread_scope;  ///< CUDA thread scope
+  static constexpr auto precision    = impl_type::precision;
+
+  using allocator_type = typename impl_type::allocator_type;  ///< Allocator type
+  using storage_type   = typename impl_type::storage_type;
+
+  template <cuda::thread_scope NewScope = thread_scope>
+  using ref_type = cuco::distinct_count_estimator_ref<T, Precision, NewScope, Hash>;
+
+  // TODO enable CTAD
+  constexpr distinct_count_estimator(cuco::cuda_thread_scope<Scope> scope = {},
+                                     Hash const& hash                     = {},
+                                     Allocator const& alloc               = {},
+                                     cuco::cuda_stream_ref stream         = {});
+
+  distinct_count_estimator(distinct_count_estimator const&)            = delete;
+  distinct_count_estimator& operator=(distinct_count_estimator const&) = delete;
+  distinct_count_estimator(distinct_count_estimator&&)                 = default;
+  distinct_count_estimator& operator=(distinct_count_estimator&&)      = default;
+  ~distinct_count_estimator()                                          = default;
+
+  void clear_async(cuco::cuda_stream_ref stream = {}) noexcept;
+
+  void clear(cuco::cuda_stream_ref stream = {});
+
+  template <class InputIt>
+  void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {}) noexcept;
+
+  template <class InputIt>
+  void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {});
+
+  template <cuda::thread_scope OtherScope, class OtherAllocator>
+  void merge_async(
+    distinct_count_estimator<T, Precision, OtherScope, Hash, OtherAllocator> const& other,
+    cuco::cuda_stream_ref stream = {}) noexcept;
+
+  template <cuda::thread_scope OtherScope, class OtherAllocator>
+  void merge(distinct_count_estimator<T, Precision, OtherScope, Hash, OtherAllocator> const& other,
+             cuco::cuda_stream_ref stream = {});
+
+  template <cuda::thread_scope OtherScope>
+  void merge_async(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream = {}) noexcept;
+
+  template <cuda::thread_scope OtherScope>
+  void merge(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream = {});
+
+  [[nodiscard]] std::size_t estimate(cuco::cuda_stream_ref stream = {}) const;
+
+  [[nodiscard]] ref_type<> ref() const noexcept;
+
+ private:
+  std::unique_ptr<impl_type> impl_;
+};
+}  // namespace cuco
+
+#include <cuco/detail/distinct_count_estimator/distinct_count_estimator.inl>
\ No newline at end of file
diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh
new file mode 100644
index 000000000..5787e3f47
--- /dev/null
+++ b/include/cuco/distinct_count_estimator_ref.cuh
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuco/detail/hyperloglog/hyperloglog_ref.cuh>
+#include <cuco/hash_functions.cuh>
+#include <cuco/utility/cuda_thread_scope.cuh>
+
+#include <cooperative_groups.h>
+
+namespace cuco {
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+class distinct_count_estimator_ref {
+  using impl_type = detail::hyperloglog_ref<T, Precision, Scope, Hash>;
+
+ public:
+  static constexpr auto thread_scope = impl_type::thread_scope;  ///< CUDA thread scope
+  static constexpr auto precision    = impl_type::precision;
+
+  using storage_type = typename impl_type::storage_type;
+  template <cuda::thread_scope NewScope>
+  using with_scope = distinct_count_estimator_ref<T, Precision, NewScope, Hash>;
+
+  // TODO let storage_type be inferred?
+  __host__ __device__ constexpr distinct_count_estimator_ref(
+    storage_type& storage,
+    cuco::cuda_thread_scope<Scope> scope = {},
+    Hash const& hash                     = {}) noexcept;
+
+  template <class CG>
+  __device__ void clear(CG const& group) noexcept;
+
+  __device__ void add(T const& item) noexcept;
+
+  template <class CG, cuda::thread_scope OtherScope>
+  __device__ void merge(
+    CG const& group,
+    distinct_count_estimator_ref<T, Precision, OtherScope, Hash> const& other) noexcept;
+
+  [[nodiscard]] __device__ std::size_t estimate(
+    cooperative_groups::thread_block const& group) const noexcept;
+
+ private:
+  impl_type impl_;
+};
+}  // namespace cuco
+
+#include <cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl>
\ No newline at end of file

From 6718560ffab53e5e9dac7d20f7bd31a4f7b1dcff Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 24 Jan 2024 01:00:46 +0000
Subject: [PATCH 02/78] Code style

---
 include/cuco/detail/hyperloglog/hyperloglog.cuh | 6 +++---
 include/cuco/detail/hyperloglog/storage.cuh     | 3 ++-
 include/cuco/distinct_count_estimator.cuh       | 6 +++---
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index bd3871261..3d0dd6f29 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -55,11 +55,11 @@ class hyperloglog {
     this->clear_async(stream);  // TODO async or sync?
   }
 
-  hyperloglog(hyperloglog const&)            = delete;
+  hyperloglog(hyperloglog const&) = delete;
   hyperloglog& operator=(hyperloglog const&) = delete;
   hyperloglog(hyperloglog&&)                 = default;
-  hyperloglog& operator=(hyperloglog&&)      = default;
-  ~hyperloglog()                             = default;
+  hyperloglog& operator=(hyperloglog&&) = default;
+  ~hyperloglog()                        = default;
 
   void clear_async(cuco::cuda_stream_ref stream) noexcept
   {
diff --git a/include/cuco/detail/hyperloglog/storage.cuh b/include/cuco/detail/hyperloglog/storage.cuh
index 195bdbe1c..effdc076a 100644
--- a/include/cuco/detail/hyperloglog/storage.cuh
+++ b/include/cuco/detail/hyperloglog/storage.cuh
@@ -20,5 +20,6 @@
 namespace cuco::detail {
 template <int32_t Precision>
 struct alignas(sizeof(int) * 4) hyperloglog_storage
-  : public cuda::std::array<int, 1ull << Precision> {};
+  : public cuda::std::array<int, 1ull << Precision> {
+};
 }  // namespace cuco::detail
diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh
index 16c7b46d3..1d5dde49d 100644
--- a/include/cuco/distinct_count_estimator.cuh
+++ b/include/cuco/distinct_count_estimator.cuh
@@ -51,11 +51,11 @@ class distinct_count_estimator {
                                      Allocator const& alloc               = {},
                                      cuco::cuda_stream_ref stream         = {});
 
-  distinct_count_estimator(distinct_count_estimator const&)            = delete;
+  distinct_count_estimator(distinct_count_estimator const&) = delete;
   distinct_count_estimator& operator=(distinct_count_estimator const&) = delete;
   distinct_count_estimator(distinct_count_estimator&&)                 = default;
-  distinct_count_estimator& operator=(distinct_count_estimator&&)      = default;
-  ~distinct_count_estimator()                                          = default;
+  distinct_count_estimator& operator=(distinct_count_estimator&&) = default;
+  ~distinct_count_estimator()                                     = default;
 
   void clear_async(cuco::cuda_stream_ref stream = {}) noexcept;
 

From c59744e40a8b8654d87eb30fcb4ba9f4e99aed01 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 24 Jan 2024 01:02:30 +0000
Subject: [PATCH 03/78] Resolve merge conflicts

---
 include/cuco/detail/hyperloglog/kernels.cuh | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh
index 70064abcc..e84f49e40 100644
--- a/include/cuco/detail/hyperloglog/kernels.cuh
+++ b/include/cuco/detail/hyperloglog/kernels.cuh
@@ -23,16 +23,17 @@
 #include <cooperative_groups.h>
 
 namespace cuco::hyperloglog_ns::detail {
+CUCO_SUPPRESS_KERNEL_WARNINGS
 
 template <class RefType>
-__global__ void clear(RefType ref)
+CUCO_KERNEL void clear(RefType ref)
 {
   auto const block = cooperative_groups::this_thread_block();
   if (block.group_index().x == 0) { ref.clear(block); }
 }
 
 template <class InputIt, class RefType>
-__global__ void add_shmem(InputIt first, cuco::detail::index_type n, RefType ref)
+CUCO_KERNEL void add_shmem(InputIt first, cuco::detail::index_type n, RefType ref)
 {
   using local_ref_type = typename RefType::with_scope<cuda::thread_scope_block>;
 
@@ -56,7 +57,7 @@ __global__ void add_shmem(InputIt first, cuco::detail::index_type n, RefType ref
 }
 
 template <class OtherRefType, class RefType>
-__global__ void merge(OtherRefType other_ref, RefType ref)
+CUCO_KERNEL void merge(OtherRefType other_ref, RefType ref)
 {
   auto const block = cooperative_groups::this_thread_block();
   if (block.group_index().x == 0) { ref.merge(block, other_ref); }
@@ -64,7 +65,7 @@ __global__ void merge(OtherRefType other_ref, RefType ref)
 
 // TODO this kernel currently isn't being used
 template <class RefType>
-__global__ void estimate(std::size_t* cardinality, RefType ref)
+CUCO_KERNEL void estimate(std::size_t* cardinality, RefType ref)
 {
   auto const block = cooperative_groups::this_thread_block();
   if (block.group_index().x == 0) {

From b7533a0eeb309392740890f885ebf3d63e356b8c Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 24 Jan 2024 22:16:24 +0000
Subject: [PATCH 04/78] Initialize shmem atomics through placement new

---
 include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index ba9333f95..ce66036eb 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -144,10 +144,9 @@ class hyperloglog_ref {
     __shared__ cuda::atomic<int, cuda::thread_scope_block> block_zeroes;
     __shared__ std::size_t estimate;
 
-    // TODO is this needed?
     if (group.thread_rank() == 0) {
-      block_sum.store(0, cuda::std::memory_order_relaxed);
-      block_zeroes.store(0, cuda::std::memory_order_relaxed);
+      new (&block_sum) decltype(block_sum){0};
+      new (&block_zeroes) decltype(block_zeroes){0};
     }
     group.sync();
 

From f4bdac282ef415f672297db6353e510b4cf7d853 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 24 Jan 2024 22:20:09 +0000
Subject: [PATCH 05/78] Improve naming

---
 include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index ce66036eb..11fad856b 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -150,9 +150,6 @@ class hyperloglog_ref {
     }
     group.sync();
 
-    // a warp
-    auto const tile = cooperative_groups::tiled_partition<32>(group);
-
     fp_type thread_sum = 0;
     int thread_zeroes  = 0;
     for (int i = group.thread_rank(); i < this->storage_.size(); i += group.size()) {
@@ -161,11 +158,12 @@ class hyperloglog_ref {
       thread_zeroes += reg == 0;
     }
 
-    // CG reduce Z and V
+    // warp reduce Z and V
+    auto const warp = cooperative_groups::tiled_partition<32>(group);
     cooperative_groups::reduce_update_async(
-      tile, block_sum, thread_sum, cooperative_groups::plus<fp_type>());
+      warp, block_sum, thread_sum, cooperative_groups::plus<fp_type>());
     cooperative_groups::reduce_update_async(
-      tile, block_zeroes, thread_zeroes, cooperative_groups::plus<int>());
+      warp, block_zeroes, thread_zeroes, cooperative_groups::plus<int>());
     group.sync();
 
     if (group.thread_rank() == 0) {

From cea2afb36f1942f925501041463adda95348ae0f Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 24 Jan 2024 23:25:31 +0000
Subject: [PATCH 06/78] Move some functionality to storage class

---
 .../cuco/detail/hyperloglog/hyperloglog.cuh   |  14 +--
 .../detail/hyperloglog/hyperloglog_ref.cuh    |  75 +------------
 include/cuco/detail/hyperloglog/storage.cuh   | 102 +++++++++++++++++-
 3 files changed, 111 insertions(+), 80 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index 3d0dd6f29..1b9e0be15 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -35,14 +35,14 @@ class hyperloglog {
   static constexpr auto thread_scope = Scope;  ///< CUDA thread scope
   static constexpr auto precision    = Precision;
 
+  template <cuda::thread_scope NewScope = thread_scope>
+  using ref_type = hyperloglog_ref<T, Precision, NewScope, Hash>;
+
   using allocator_type = Allocator;  ///< Allocator type
-  using storage_type   = detail::hyperloglog_storage<precision>;
+  using storage_type   = typename ref_type<>::storage_type;
   using storage_allocator_type =
     typename std::allocator_traits<Allocator>::template rebind_alloc<storage_type>;
 
-  template <cuda::thread_scope NewScope = thread_scope>
-  using ref_type = hyperloglog_ref<T, Precision, NewScope, Hash>;
-
   constexpr hyperloglog(cuco::cuda_thread_scope<Scope>,
                         Hash const& hash,
                         Allocator const& alloc,
@@ -55,11 +55,11 @@ class hyperloglog {
     this->clear_async(stream);  // TODO async or sync?
   }
 
-  hyperloglog(hyperloglog const&) = delete;
+  hyperloglog(hyperloglog const&)            = delete;
   hyperloglog& operator=(hyperloglog const&) = delete;
   hyperloglog(hyperloglog&&)                 = default;
-  hyperloglog& operator=(hyperloglog&&) = default;
-  ~hyperloglog()                        = default;
+  hyperloglog& operator=(hyperloglog&&)      = default;
+  ~hyperloglog()                             = default;
 
   void clear_async(cuco::cuda_stream_ref stream) noexcept
   {
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index 11fad856b..5994748c5 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -35,7 +35,7 @@ class hyperloglog_ref {
   static constexpr auto thread_scope = Scope;  ///< CUDA thread scope
   static constexpr auto precision    = Precision;
 
-  using storage_type = hyperloglog_storage<Precision>;
+  using storage_type = hyperloglog_dense_registers<Precision>;
   template <cuda::thread_scope NewScope>
   using with_scope = hyperloglog_ref<T, Precision, NewScope, Hash>;
 
@@ -49,17 +49,7 @@ class hyperloglog_ref {
   template <class CG>
   __device__ void clear(CG const& group) noexcept
   {
-    for (int i = group.thread_rank(); i < this->storage_.size(); i += group.size()) {
-      this->storage_[i] = 0;
-    }
-
-    // TODO remove test code
-    // int4 constexpr empty{0, 0, 0, 0};
-    // auto vec4 = reinterpret_cast<int4*>(this->storage_.data());
-    // // #pragma unroll 2
-    // for (int i = group.thread_rank(); i < (this->storage_.size() / 4); i += group.size()) {
-    //   vec4[i] = empty;
-    // }
+    this->storage_.clear(group);
   }
 
   __device__ void add(T const& item) noexcept
@@ -70,71 +60,14 @@ class hyperloglog_ref {
     auto const reg               = h & register_mask;
     auto const zeroes            = cuda::std::countl_zero(h | register_mask) + 1;  // __clz
 
-    if constexpr (Scope == cuda::thread_scope_thread) {
-      this->storage_[reg] = max(this->storage_[reg], zeroes);
-    } else if constexpr (Scope == cuda::thread_scope_block) {
-      atomicMax_block(&(this->storage_[reg]), zeroes);
-    } else if constexpr (Scope == cuda::thread_scope_device) {
-      atomicMax(&(this->storage_[reg]), zeroes);
-    } else if constexpr (Scope == cuda::thread_scope_system) {
-      atomicMax_system(&(this->storage_[reg]), zeroes);
-    } else {
-      static_assert(cuco::dependent_false<decltype(Scope)>, "Unsupported thread scope");
-    }
+    this->storage_.update_max<thread_scope>(reg, zeroes);
   }
 
   template <class CG, cuda::thread_scope OtherScope>
   __device__ void merge(CG const& group,
                         hyperloglog_ref<T, Precision, OtherScope, Hash> const& other) noexcept
   {
-    for (int i = group.thread_rank(); i < this->storage_.size(); i += group.size()) {
-      if constexpr (Scope == cuda::thread_scope_thread) {
-        this->storage_[i] = max(this->storage_[i], other.storage_[i]);
-      } else if constexpr (Scope == cuda::thread_scope_block) {
-        atomicMax_block(this->storage_.data() + i, other.storage_[i]);
-      } else if constexpr (Scope == cuda::thread_scope_device) {
-        atomicMax(this->storage_.data() + i, other.storage_[i]);
-      } else if constexpr (Scope == cuda::thread_scope_system) {
-        atomicMax_system(this->storage_.data() + i, other.storage_[i]);
-      } else {
-        static_assert(cuco::dependent_false<decltype(Scope)>, "Unsupported thread scope");
-      }
-    }
-
-    // TODO remove test code
-    /*
-    auto vec4 = reinterpret_cast<int4 const*>(other.storage_.data());
-    // #pragma unroll 2
-    for (int i = group.thread_rank(); i < (this->storage_.size() / 4); i += group.size()) {
-      auto const items = vec4[i];
-      if constexpr (Scope == cuda::thread_scope_thread) {
-        auto max_vec4  = reinterpret_cast<int4*>(this->storage_.data());
-        auto max_items = max_vec4[i];
-        max_items.x    = max(max_items.x, items.x);
-        max_items.y    = max(max_items.y, items.y);
-        max_items.z    = max(max_items.z, items.z);
-        max_items.w    = max(max_items.w, items.w);
-        max_vec4[i]    = max_items;
-      } else if constexpr (Scope == cuda::thread_scope_block) {
-        atomicMax_block(this->storage_.data() + (i * 4 + 0), items.x);
-        atomicMax_block(this->storage_.data() + (i * 4 + 1), items.y);
-        atomicMax_block(this->storage_.data() + (i * 4 + 2), items.z);
-        atomicMax_block(this->storage_.data() + (i * 4 + 3), items.w);
-      } else if constexpr (Scope == cuda::thread_scope_device) {
-        atomicMax(this->storage_.data() + (i * 4 + 0), items.x);
-        atomicMax(this->storage_.data() + (i * 4 + 1), items.y);
-        atomicMax(this->storage_.data() + (i * 4 + 2), items.z);
-        atomicMax(this->storage_.data() + (i * 4 + 3), items.w);
-      } else if constexpr (Scope == cuda::thread_scope_system) {
-        atomicMax_system(this->storage_.data() + (i * 4 + 0), items.x);
-        atomicMax_system(this->storage_.data() + (i * 4 + 1), items.y);
-        atomicMax_system(this->storage_.data() + (i * 4 + 2), items.z);
-        atomicMax_system(this->storage_.data() + (i * 4 + 3), items.w);
-      } else {
-        static_assert(cuco::dependent_false<decltype(Scope)>, "Unsupported thread scope");
-      }
-    }
-    */
+    this->storage_.merge<thread_scope>(group, other.storage_);
   }
 
   [[nodiscard]] __device__ std::size_t estimate(
diff --git a/include/cuco/detail/hyperloglog/storage.cuh b/include/cuco/detail/hyperloglog/storage.cuh
index effdc076a..a1117fdfd 100644
--- a/include/cuco/detail/hyperloglog/storage.cuh
+++ b/include/cuco/detail/hyperloglog/storage.cuh
@@ -15,11 +15,109 @@
  */
 #pragma once
 
+#include <cuco/utility/cuda_thread_scope.cuh>
+#include <cuco/utility/traits.hpp>
+
+#include <cstddef>
 #include <cuda/std/array>
 
 namespace cuco::detail {
+
 template <int32_t Precision>
-struct alignas(sizeof(int) * 4) hyperloglog_storage
-  : public cuda::std::array<int, 1ull << Precision> {
+class hyperloglog_dense_registers {
+ public:
+  template <class CG>
+  __device__ void constexpr clear(CG const& group) noexcept
+  {
+    for (int i = group.thread_rank(); i < this->registers_.size(); i += group.size()) {
+      this->registers_[i] = 0;
+    }
+
+    // TODO remove test code
+    // int4 constexpr empty{0, 0, 0, 0};
+    // auto vec4 = reinterpret_cast<int4*>(this->storage_.data());
+    // // #pragma unroll 2
+    // for (int i = group.thread_rank(); i < (this->storage_.size() / 4); i += group.size()) {
+    //   vec4[i] = empty;
+    // }
+  }
+
+  __host__ __device__ constexpr int& operator[](std::size_t i) noexcept
+  {
+    return this->registers_[i];
+  }
+
+  __host__ __device__ constexpr int operator[](std::size_t i) const noexcept
+  {
+    return this->registers_[i];
+  }
+
+  __host__ __device__ constexpr std::size_t size() const noexcept
+  {
+    return this->registers_.size();
+  }
+
+  template <cuda::thread_scope Scope>
+  __device__ constexpr void update_max(std::size_t i, int value) noexcept
+  {
+    if constexpr (Scope == cuda::thread_scope_thread) {
+      this->registers_[i] = max(this->registers_[i], value);
+    } else if constexpr (Scope == cuda::thread_scope_block) {
+      atomicMax_block(&(this->registers_[i]), value);
+    } else if constexpr (Scope == cuda::thread_scope_device) {
+      atomicMax(&(this->registers_[i]), value);
+    } else if constexpr (Scope == cuda::thread_scope_system) {
+      atomicMax_system(&(this->registers_[i]), value);
+    } else {
+      static_assert(cuco::dependent_false<decltype(Scope)>, "Unsupported thread scope");
+    }
+  }
+
+  template <cuda::thread_scope Scope, class CG>
+  __device__ void constexpr merge(CG const& group,
+                                  hyperloglog_dense_registers const& other) noexcept
+  {
+    for (int i = group.thread_rank(); i < this->registers_.size(); i += group.size()) {
+      this->update_max<Scope>(i, other.registers_[i]);
+    }
+
+    // TODO remove test code
+    /*
+    auto vec4 = reinterpret_cast<int4 const*>(other.storage_.data());
+    // #pragma unroll 2
+    for (int i = group.thread_rank(); i < (this->storage_.size() / 4); i += group.size()) {
+      auto const items = vec4[i];
+      if constexpr (Scope == cuda::thread_scope_thread) {
+        auto max_vec4  = reinterpret_cast<int4*>(this->storage_.data());
+        auto max_items = max_vec4[i];
+        max_items.x    = max(max_items.x, items.x);
+        max_items.y    = max(max_items.y, items.y);
+        max_items.z    = max(max_items.z, items.z);
+        max_items.w    = max(max_items.w, items.w);
+        max_vec4[i]    = max_items;
+      } else if constexpr (Scope == cuda::thread_scope_block) {
+        atomicMax_block(this->storage_.data() + (i * 4 + 0), items.x);
+        atomicMax_block(this->storage_.data() + (i * 4 + 1), items.y);
+        atomicMax_block(this->storage_.data() + (i * 4 + 2), items.z);
+        atomicMax_block(this->storage_.data() + (i * 4 + 3), items.w);
+      } else if constexpr (Scope == cuda::thread_scope_device) {
+        atomicMax(this->storage_.data() + (i * 4 + 0), items.x);
+        atomicMax(this->storage_.data() + (i * 4 + 1), items.y);
+        atomicMax(this->storage_.data() + (i * 4 + 2), items.z);
+        atomicMax(this->storage_.data() + (i * 4 + 3), items.w);
+      } else if constexpr (Scope == cuda::thread_scope_system) {
+        atomicMax_system(this->storage_.data() + (i * 4 + 0), items.x);
+        atomicMax_system(this->storage_.data() + (i * 4 + 1), items.y);
+        atomicMax_system(this->storage_.data() + (i * 4 + 2), items.z);
+        atomicMax_system(this->storage_.data() + (i * 4 + 3), items.w);
+      } else {
+        static_assert(cuco::dependent_false<decltype(Scope)>, "Unsupported thread scope");
+      }
+    }
+    */
+  }
+
+ private:
+  alignas(sizeof(int) * 4) cuda::std::array<int, 1ull << Precision> registers_;
 };
 }  // namespace cuco::detail

From 0f0bd3fb6a8625a275268b2222ed20c65a6b264e Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 25 Jan 2024 17:01:04 +0000
Subject: [PATCH 07/78] Add inline docs for public APIs

---
 include/cuco/detail/hyperloglog/finalizer.cuh |  19 +-
 .../cuco/detail/hyperloglog/hyperloglog.cuh   | 179 +++++++++++++++---
 .../detail/hyperloglog/hyperloglog_ref.cuh    |  73 ++++++-
 include/cuco/detail/hyperloglog/kernels.cuh   |   2 +-
 include/cuco/detail/hyperloglog/storage.cuh   |  56 +++++-
 include/cuco/distinct_count_estimator.cuh     | 143 +++++++++++++-
 include/cuco/distinct_count_estimator_ref.cuh |  60 +++++-
 7 files changed, 484 insertions(+), 48 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/finalizer.cuh b/include/cuco/detail/hyperloglog/finalizer.cuh
index 9f5c9a20d..3aca44fdf 100644
--- a/include/cuco/detail/hyperloglog/finalizer.cuh
+++ b/include/cuco/detail/hyperloglog/finalizer.cuh
@@ -20,12 +20,29 @@
 #include <cuda/std/cmath>
 
 namespace cuco::hyperloglog_ns::detail {
+
+/**
+ * @brief Estimate correction algorithm based on HyperLogLog++.
+ *
+ * @note Variable names correspond to the definitions given in the HLL++ paper:
+ * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf
+ *
+ * @tparam Precision Tuning parameter to trade accuracy for runtime/memory footprint
+ */
 template <int32_t Precision>
 class finalizer {
   // this minimum number of registers is required by HLL++
   static_assert(Precision >= 4, "Precision must be greater or equal to 4");
 
  public:
+  /**
+   * @brief Compute the bias-corrected cardinality estimate.
+   *
+   * @param z Geometric mean of registers
+   * @param v Number of 0 registers
+   *
+   * @return Bias-corrected cardinality estimate
+   */
   __host__ __device__ static double constexpr finalize(double z, int v) noexcept
   {
     auto e = alpha_mm() / z;
@@ -50,7 +67,7 @@ class finalizer {
   }
 
  private:
-  static auto constexpr m = (1 << Precision);
+  static auto constexpr m = (1 << Precision);  ///< Number of registers
 
   __host__ __device__ static double constexpr alpha_mm() noexcept
   {
diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index 1b9e0be15..3bb032105 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -29,20 +29,47 @@
 #include <memory>
 
 namespace cuco::detail {
+/**
+ * @brief A GPU-accelerated utility for approximating the number of distinct items in a multiset.
+ *
+ * @note This class implements the HyperLogLog/HyperLogLog++ algorithm:
+ * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf.
+ * @note The `Precision` parameter can be used to trade runtime/memory footprint for better
+ * accuracy. A higher value corresponds to a more accurate result, however, setting the precision
+ * too high will result in diminishing returns.
+ *
+ * @tparam T Type of items to count
+ * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy
+ * @tparam Scope The scope in which operations will be performed by individual threads
+ * @tparam Hash Hash function used to hash items
+ * @tparam Allocator Type of allocator used for device storage
+ */
 template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
 class hyperloglog {
  public:
-  static constexpr auto thread_scope = Scope;  ///< CUDA thread scope
-  static constexpr auto precision    = Precision;
+  static constexpr auto thread_scope = Scope;      ///< CUDA thread scope
+  static constexpr auto precision    = Precision;  ///< Precision
 
   template <cuda::thread_scope NewScope = thread_scope>
-  using ref_type = hyperloglog_ref<T, Precision, NewScope, Hash>;
-
-  using allocator_type = Allocator;  ///< Allocator type
-  using storage_type   = typename ref_type<>::storage_type;
-  using storage_allocator_type =
-    typename std::allocator_traits<Allocator>::template rebind_alloc<storage_type>;
-
+  using ref_type = hyperloglog_ref<T, Precision, NewScope, Hash>;  ///< Non-owning reference
+                                                                   ///< type
+
+  using allocator_type         = Allocator;                          ///< Allocator type
+  using storage_type           = typename ref_type<>::storage_type;  ///< Storage type
+  using storage_allocator_type = typename std::allocator_traits<Allocator>::template rebind_alloc<
+    storage_type>;  ///< Storage allocator type
+
+  /**
+   * @brief Constructs a `hyperloglog` host object.
+   *
+   * @note This function synchronizes the given stream.
+   *
+   * @param hash The hash function used to hash items
+   * @param alloc Allocator used for allocating device storage
+   * @param stream CUDA stream used to initialize the object
+   */
+  // Doxygen cannot document unnamed parameter for scope, see
+  // https://github.com/doxygen/doxygen/issues/6926
   constexpr hyperloglog(cuco::cuda_thread_scope<Scope>,
                         Hash const& hash,
                         Allocator const& alloc,
@@ -55,24 +82,56 @@ class hyperloglog {
     this->clear_async(stream);  // TODO async or sync?
   }
 
-  hyperloglog(hyperloglog const&)            = delete;
-  hyperloglog& operator=(hyperloglog const&) = delete;
-  hyperloglog(hyperloglog&&)                 = default;
-  hyperloglog& operator=(hyperloglog&&)      = default;
-  ~hyperloglog()                             = default;
+  ~hyperloglog() = default;
 
+  hyperloglog(hyperloglog const&) = delete;
+  hyperloglog& operator=(hyperloglog const&) = delete;
+  hyperloglog(hyperloglog&&)                 = default;  ///< Move constructor
+
+  // TODO this is somehow required to pass the Doxygen check.
+  /**
+   * @brief Move-assignment operator.
+   *
+   * @return Reference to `*this`
+   */
+  hyperloglog& operator=(hyperloglog&&) = default;
+
+  /**
+   * @brief Asynchronously resets the estimator, i.e., clears the current count estimate.
+   *
+   * @param stream CUDA stream this operation is executed in
+   */
   void clear_async(cuco::cuda_stream_ref stream) noexcept
   {
     auto constexpr block_size = 1024;
     cuco::hyperloglog_ns::detail::clear<<<1, block_size, 0, stream>>>(this->ref());
   }
 
+  /**
+   * @brief Resets the estimator, i.e., clears the current count estimate.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `clear_async`.
+   *
+   * @param stream CUDA stream this operation is executed in
+   */
   void clear(cuco::cuda_stream_ref stream)
   {
     this->clear_async(stream);
     stream.synchronize();
   }
 
+  /**
+   * @brief Asynchronously adds items to be counted to the estimator.
+   *
+   * @tparam InputIt Device accessible random access input iterator where
+   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
+   * T></tt> is `true`
+   *
+   * @param first Beginning of the sequence of items
+   * @param last End of the sequence of items
+   * @param stream CUDA stream this operation is executed in
+   */
   template <class InputIt>
   void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream) noexcept
   {
@@ -83,7 +142,11 @@ class hyperloglog {
 
     int grid_size  = 0;
     int block_size = 0;
-    // TODO check cuda error?
+
+    // We make use of the occupancy calculator here to get the minimum number of blocks which still
+    // saturate the GPU. This reduces the atomic contention on the final register array during the
+    // merge phase.
+    // TODO check cuda error or will it sync the stream??
     cudaOccupancyMaxPotentialBlockSize(
       &grid_size, &block_size, &cuco::hyperloglog_ns::detail::add_shmem<InputIt, ref_type<>>);
 
@@ -91,6 +154,20 @@ class hyperloglog {
       first, num_items, this->ref());
   }
 
+  /**
+   * @brief Adds items to be counted to the estimator.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `add_async`.
+   *
+   * @tparam InputIt Device accessible random access input iterator where
+   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
+   * T></tt> is `true`
+   *
+   * @param first Beginning of the sequence of items
+   * @param last End of the sequence of items
+   * @param stream CUDA stream this operation is executed in
+   */
   template <class InputIt>
   void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream)
   {
@@ -98,35 +175,84 @@ class hyperloglog {
     stream.synchronize();
   }
 
+  /**
+   * @brief Asynchronously merges the result of `other` estimator into `*this` estimator.
+   *
+   * @tparam OtherScope Thread scope of `other` estimator
+   * @tparam OtherAllocator Allocator type of `other` estimator
+   *
+   * @param other Other estimator to be merged into `*this`
+   * @param stream CUDA stream this operation is executed in
+   */
   template <cuda::thread_scope OtherScope, class OtherAllocator>
   void merge_async(hyperloglog<T, Precision, OtherScope, Hash, OtherAllocator> const& other,
-                   cuco::cuda_stream_ref stream = {}) noexcept
+                   cuco::cuda_stream_ref stream) noexcept
   {
     this->merge_async(other.ref(), stream);
   }
 
+  /**
+   * @brief Merges the result of `other` estimator into `*this` estimator.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `merge_async`.
+   *
+   * @tparam OtherScope Thread scope of `other` estimator
+   * @tparam OtherAllocator Allocator type of `other` estimator
+   *
+   * @param other Other estimator to be merged into `*this`
+   * @param stream CUDA stream this operation is executed in
+   */
   template <cuda::thread_scope OtherScope, class OtherAllocator>
   void merge(hyperloglog<T, Precision, OtherScope, Hash, OtherAllocator> const& other,
-             cuco::cuda_stream_ref stream = {})
+             cuco::cuda_stream_ref stream)
   {
     this->merge_async(other, stream);
     stream.synchronize();
   }
 
+  /**
+   * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator.
+   *
+   * @tparam OtherScope Thread scope of `other` estimator
+   *
+   * @param other Other estimator reference to be merged into `*this`
+   * @param stream CUDA stream this operation is executed in
+   */
   template <cuda::thread_scope OtherScope>
-  void merge_async(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream = {}) noexcept
+  void merge_async(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream) noexcept
   {
     auto constexpr block_size = 1024;
     cuco::hyperloglog_ns::detail::merge<<<1, block_size, 0, stream>>>(other, this->ref());
   }
 
+  /**
+   * @brief Merges the result of `other` estimator reference into `*this` estimator.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `merge_async`.
+   *
+   * @tparam OtherScope Thread scope of `other` estimator
+   *
+   * @param other Other estimator reference to be merged into `*this`
+   * @param stream CUDA stream this operation is executed in
+   */
   template <cuda::thread_scope OtherScope>
-  void merge(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream = {})
+  void merge(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream)
   {
     this->merge_async(other, stream);
     stream.synchronize();
   }
 
+  /**
+   * @brief Compute the estimated distinct items count.
+   *
+   * @note This function synchronizes the given stream.
+   *
+   * @param stream CUDA stream this operation is executed in
+   *
+   * @return Approximate distinct items count
+   */
   [[nodiscard]] std::size_t estimate(cuco::cuda_stream_ref stream) const
   {
     // TODO remove test code
@@ -167,6 +293,11 @@ class hyperloglog {
     return cuco::hyperloglog_ns::detail::finalizer<Precision>::finalize(sum, zeroes);
   }
 
+  /**
+   * @brief Get device ref.
+   *
+   * @return Device ref object of the current `hyperloglog` host object
+   */
   [[nodiscard]] ref_type<> ref() const noexcept
   {
     return ref_type<>{*(this->storage_.get()), {}, this->hash_};
@@ -185,11 +316,13 @@ class hyperloglog {
     storage_allocator_type& allocator;
   };
 
-  Hash hash_;
-  storage_allocator_type storage_allocator_;
-  storage_deleter storage_deleter_;
-  std::unique_ptr<storage_type, storage_deleter> storage_;
+  Hash hash_;                                               ///< Hash function used to hash items
+  storage_allocator_type storage_allocator_;                ///< Storage allocator
+  storage_deleter storage_deleter_;                         ///< Storage deleter
+  std::unique_ptr<storage_type, storage_deleter> storage_;  ///< Storage
 
+  // Needs to be friends with other instantiations of this class template to have access to their
+  // storage
   template <class T_, int32_t Precision_, cuda::thread_scope Scope_, class Hash_, class Allocator_>
   friend class hyperloglog;
 };
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index 5994748c5..c6073a265 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -28,30 +28,66 @@
 #include <cooperative_groups/reduce.h>
 
 namespace cuco::detail {
+/**
+ * @brief A GPU-accelerated utility for approximating the number of distinct items in a multiset.
+ *
+ * @note This class implements the HyperLogLog/HyperLogLog++ algorithm:
+ * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf.
+ * @note The `Precision` parameter can be used to trade runtime/memory footprint for better
+ * accuracy. A higher value corresponds to a more accurate result, however, setting the precision
+ * too high will result in diminishing returns.
+ *
+ * @tparam T Type of items to count
+ * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy
+ * @tparam Scope The scope in which operations will be performed by individual threads
+ * @tparam Hash Hash function used to hash items
+ */
 template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
 class hyperloglog_ref {
  public:
-  using fp_type                      = float;
-  static constexpr auto thread_scope = Scope;  ///< CUDA thread scope
-  static constexpr auto precision    = Precision;
+  using fp_type                      = float;      ///< Floating point type used for reduction
+  static constexpr auto thread_scope = Scope;      ///< CUDA thread scope
+  static constexpr auto precision    = Precision;  ///< Precision
 
-  using storage_type = hyperloglog_dense_registers<Precision>;
-  template <cuda::thread_scope NewScope>
-  using with_scope = hyperloglog_ref<T, Precision, NewScope, Hash>;
+  using storage_type = hyperloglog_dense_registers<Precision>;  ///< Storage type
 
+  template <cuda::thread_scope NewScope>
+  using with_scope = hyperloglog_ref<T, Precision, NewScope, Hash>;  ///< Ref type with different
+                                                                     ///< thread scope
+
+  /**
+   * @brief Constructs a non-owning `hyperloglog_ref` object.
+   *
+   * @param storage Reference to storage object of type `storage_type`
+   * @param hash The hash function used to hash items
+   */
+  // Doxygen cannot document unnamed parameter for scope, see
+  // https://github.com/doxygen/doxygen/issues/6926
   __host__ __device__ constexpr hyperloglog_ref(storage_type& storage,
-                                                cuco::cuda_thread_scope<Scope> = {},
-                                                Hash const& hash               = {}) noexcept
+                                                cuco::cuda_thread_scope<Scope>,
+                                                Hash const& hash) noexcept
     : hash_{hash}, storage_{storage}
   {
   }
 
+  /**
+   * @brief Resets the estimator, i.e., clears the current count estimate.
+   *
+   * @tparam CG CUDA Cooperative Group type
+   *
+   * @param group CUDA Cooperative group this operation is executed in
+   */
   template <class CG>
   __device__ void clear(CG const& group) noexcept
   {
     this->storage_.clear(group);
   }
 
+  /**
+   * @brief Adds an item to the estimator.
+   *
+   * @param item The item to be counted
+   */
   __device__ void add(T const& item) noexcept
   {
     // static_assert NumBuckets is not too big
@@ -63,6 +99,15 @@ class hyperloglog_ref {
     this->storage_.update_max<thread_scope>(reg, zeroes);
   }
 
+  /**
+   * @brief Merges the result of `other` estimator reference into `*this` estimator reference.
+   *
+   * @tparam CG CUDA Cooperative Group type
+   * @tparam OtherScope Thread scope of `other` estimator
+   *
+   * @param group CUDA Cooperative group this operation is executed in
+   * @param other Other estimator reference to be merged into `*this`
+   */
   template <class CG, cuda::thread_scope OtherScope>
   __device__ void merge(CG const& group,
                         hyperloglog_ref<T, Precision, OtherScope, Hash> const& other) noexcept
@@ -70,6 +115,13 @@ class hyperloglog_ref {
     this->storage_.merge<thread_scope>(group, other.storage_);
   }
 
+  /**
+   * @brief Compute the estimated distinct items count.
+   *
+   * @param group CUDA thread block group this operation is executed in
+   *
+   * @return Approximate distinct items count
+   */
   [[nodiscard]] __device__ std::size_t estimate(
     cooperative_groups::thread_block const& group) const noexcept
   {
@@ -110,8 +162,9 @@ class hyperloglog_ref {
   }
 
  private:
-  Hash hash_;
-  storage_type& storage_;  // TODO is a reference the right choice here??
+  Hash hash_;  ///< Hash function used to hash items
+  // TODO is a reference the right choice here??
+  storage_type& storage_;  ///< Reference to storage object
 
   template <class T_, int32_t Precision_, cuda::thread_scope Scope_, class Hash_>
   friend class hyperloglog_ref;
diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh
index e84f49e40..c7b9bc018 100644
--- a/include/cuco/detail/hyperloglog/kernels.cuh
+++ b/include/cuco/detail/hyperloglog/kernels.cuh
@@ -43,7 +43,7 @@ CUCO_KERNEL void add_shmem(InputIt first, cuco::detail::index_type n, RefType re
   auto idx               = cuco::detail::global_thread_id();
   auto const block       = cooperative_groups::this_thread_block();
 
-  local_ref_type local_ref(local_storage);
+  local_ref_type local_ref(local_storage, {}, {});
   local_ref.clear(block);
   block.sync();
 
diff --git a/include/cuco/detail/hyperloglog/storage.cuh b/include/cuco/detail/hyperloglog/storage.cuh
index a1117fdfd..fe0a4ff7a 100644
--- a/include/cuco/detail/hyperloglog/storage.cuh
+++ b/include/cuco/detail/hyperloglog/storage.cuh
@@ -23,9 +23,25 @@
 
 namespace cuco::detail {
 
+/**
+ * @brief Storage class for `hyperloglog` and `hyperloglog_ref`.
+ *
+ * @note This class implements the dense storage layout from the HyperLogLog++ paper, but uses
+ * 4 bytes per register instead of only 6 bits. This is required since we need to update registers
+ * atomically.
+ *
+ * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy
+ */
 template <int32_t Precision>
 class hyperloglog_dense_registers {
  public:
+  /**
+   * @brief Clears the storage.
+   *
+   * @tparam CG CUDA Cooperative Group type
+   *
+   * @param group CUDA Cooperative group this operation is executed in
+   */
   template <class CG>
   __device__ void constexpr clear(CG const& group) noexcept
   {
@@ -42,21 +58,49 @@ class hyperloglog_dense_registers {
     // }
   }
 
+  /**
+   * @brief Returns a reference to the element at specified location `i`. No bounds checking is
+   * performed.
+   *
+   * @param i Position of the element to return
+   *
+   * @return Reference to the requested element
+   */
   __host__ __device__ constexpr int& operator[](std::size_t i) noexcept
   {
     return this->registers_[i];
   }
 
+  /**
+   * @brief Returns the element at specified location `i`. No bounds checking is performed.
+   *
+   * @param i Position of the element to return
+   *
+   * @return Requested element
+   */
   __host__ __device__ constexpr int operator[](std::size_t i) const noexcept
   {
     return this->registers_[i];
   }
 
+  /**
+   * @brief Returns the number of elements in the container.
+   *
+   * @return The number of elements in the container
+   */
   __host__ __device__ constexpr std::size_t size() const noexcept
   {
     return this->registers_.size();
   }
 
+  /**
+   * @brief Atomically updates the register at position `i` with `max(reg[i], value)`.
+   *
+   * @tparam Scope CUDA thread scope
+   *
+   * @param i Register index
+   * @param value New value
+   */
   template <cuda::thread_scope Scope>
   __device__ constexpr void update_max(std::size_t i, int value) noexcept
   {
@@ -73,6 +117,15 @@ class hyperloglog_dense_registers {
     }
   }
 
+  /**
+   * @brief Combines the contents of `other` storage into `*this` storage.
+   *
+   * @tparam Scope CUDA thread scope
+   * @tparam CG CUDA Cooperative Group type
+   *
+   * @param group CUDA Cooperative group this operation is executed in
+   * @param other Other storage
+   */
   template <cuda::thread_scope Scope, class CG>
   __device__ void constexpr merge(CG const& group,
                                   hyperloglog_dense_registers const& other) noexcept
@@ -118,6 +171,7 @@ class hyperloglog_dense_registers {
   }
 
  private:
-  alignas(sizeof(int) * 4) cuda::std::array<int, 1ull << Precision> registers_;
+  alignas(sizeof(int) *
+          4) cuda::std::array<int, 1ull << Precision> registers_;  ///< Register array storage
 };
 }  // namespace cuco::detail
diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh
index 1d5dde49d..5b18d7f40 100644
--- a/include/cuco/distinct_count_estimator.cuh
+++ b/include/cuco/distinct_count_estimator.cuh
@@ -27,6 +27,21 @@
 #include <memory>
 
 namespace cuco {
+/**
+ * @brief A GPU-accelerated utility for approximating the number of distinct items in a multiset.
+ *
+ * @note This implementation is based on the HyperLogLog++ algorithm:
+ * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf.
+ * @note The `Precision` parameter can be used to trade runtime/memory footprint for better
+ * accuracy. A higher value corresponds to a more accurate result, however, setting the precision
+ * too high will result in diminishing returns.
+ *
+ * @tparam T Type of items to count
+ * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy
+ * @tparam Scope The scope in which operations will be performed by individual threads
+ * @tparam Hash Hash function used to hash items
+ * @tparam Allocator Type of allocator used for device storage
+ */
 template <class T,
           int32_t Precision        = 11,
           cuda::thread_scope Scope = cuda::thread_scope_device,
@@ -37,57 +52,169 @@ class distinct_count_estimator {
 
  public:
   static constexpr auto thread_scope = impl_type::thread_scope;  ///< CUDA thread scope
-  static constexpr auto precision    = impl_type::precision;
-
-  using allocator_type = typename impl_type::allocator_type;  ///< Allocator type
-  using storage_type   = typename impl_type::storage_type;
+  static constexpr auto precision    = impl_type::precision;     ///< Precision
 
   template <cuda::thread_scope NewScope = thread_scope>
-  using ref_type = cuco::distinct_count_estimator_ref<T, Precision, NewScope, Hash>;
+  using ref_type =
+    cuco::distinct_count_estimator_ref<T, Precision, NewScope, Hash>;  ///< Non-owning reference
+                                                                       ///< type
+
+  using allocator_type = typename impl_type::allocator_type;  ///< Allocator type
+  using storage_type   = typename impl_type::storage_type;    ///< Storage type
 
   // TODO enable CTAD
+  /**
+   * @brief Constructs a `distinct_count_estimator` host object.
+   *
+   * @note This function synchronizes the given stream.
+   *
+   * @param scope The scope in which operations will be performed
+   * @param hash The hash function used to hash items
+   * @param alloc Allocator used for allocating device storage
+   * @param stream CUDA stream used to initialize the object
+   */
   constexpr distinct_count_estimator(cuco::cuda_thread_scope<Scope> scope = {},
                                      Hash const& hash                     = {},
                                      Allocator const& alloc               = {},
                                      cuco::cuda_stream_ref stream         = {});
 
+  ~distinct_count_estimator() = default;
+
   distinct_count_estimator(distinct_count_estimator const&) = delete;
   distinct_count_estimator& operator=(distinct_count_estimator const&) = delete;
-  distinct_count_estimator(distinct_count_estimator&&)                 = default;
+  distinct_count_estimator(distinct_count_estimator&&) = default;  ///< Move constructor
+
+  // TODO this is somehow required to pass the Doxygen check.
+  /**
+   * @brief Move-assignment operator.
+   *
+   * @return Reference to `*this`
+   */
   distinct_count_estimator& operator=(distinct_count_estimator&&) = default;
-  ~distinct_count_estimator()                                     = default;
 
+  /**
+   * @brief Asynchronously resets the estimator, i.e., clears the current count estimate.
+   *
+   * @param stream CUDA stream this operation is executed in
+   */
   void clear_async(cuco::cuda_stream_ref stream = {}) noexcept;
 
+  /**
+   * @brief Resets the estimator, i.e., clears the current count estimate.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `clear_async`.
+   *
+   * @param stream CUDA stream this operation is executed in
+   */
   void clear(cuco::cuda_stream_ref stream = {});
 
+  /**
+   * @brief Asynchronously adds items to be counted to the estimator.
+   *
+   * @tparam InputIt Device accessible random access input iterator where
+   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
+   * T></tt> is `true`
+   *
+   * @param first Beginning of the sequence of items
+   * @param last End of the sequence of items
+   * @param stream CUDA stream this operation is executed in
+   */
   template <class InputIt>
   void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {}) noexcept;
 
+  /**
+   * @brief Adds items to be counted to the estimator.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `add_async`.
+   *
+   * @tparam InputIt Device accessible random access input iterator where
+   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
+   * T></tt> is `true`
+   *
+   * @param first Beginning of the sequence of items
+   * @param last End of the sequence of items
+   * @param stream CUDA stream this operation is executed in
+   */
   template <class InputIt>
   void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {});
 
+  /**
+   * @brief Asynchronously merges the result of `other` estimator into `*this` estimator.
+   *
+   * @tparam OtherScope Thread scope of `other` estimator
+   * @tparam OtherAllocator Allocator type of `other` estimator
+   *
+   * @param other Other estimator to be merged into `*this`
+   * @param stream CUDA stream this operation is executed in
+   */
   template <cuda::thread_scope OtherScope, class OtherAllocator>
   void merge_async(
     distinct_count_estimator<T, Precision, OtherScope, Hash, OtherAllocator> const& other,
     cuco::cuda_stream_ref stream = {}) noexcept;
 
+  /**
+   * @brief Merges the result of `other` estimator into `*this` estimator.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `merge_async`.
+   *
+   * @tparam OtherScope Thread scope of `other` estimator
+   * @tparam OtherAllocator Allocator type of `other` estimator
+   *
+   * @param other Other estimator to be merged into `*this`
+   * @param stream CUDA stream this operation is executed in
+   */
   template <cuda::thread_scope OtherScope, class OtherAllocator>
   void merge(distinct_count_estimator<T, Precision, OtherScope, Hash, OtherAllocator> const& other,
              cuco::cuda_stream_ref stream = {});
 
+  /**
+   * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator.
+   *
+   * @tparam OtherScope Thread scope of `other` estimator
+   *
+   * @param other Other estimator reference to be merged into `*this`
+   * @param stream CUDA stream this operation is executed in
+   */
   template <cuda::thread_scope OtherScope>
   void merge_async(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream = {}) noexcept;
 
+  /**
+   * @brief Merges the result of `other` estimator reference into `*this` estimator.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `merge_async`.
+   *
+   * @tparam OtherScope Thread scope of `other` estimator
+   *
+   * @param other Other estimator reference to be merged into `*this`
+   * @param stream CUDA stream this operation is executed in
+   */
   template <cuda::thread_scope OtherScope>
   void merge(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream = {});
 
+  /**
+   * @brief Compute the estimated distinct items count.
+   *
+   * @note This function synchronizes the given stream.
+   *
+   * @param stream CUDA stream this operation is executed in
+   *
+   * @return Approximate distinct items count
+   */
   [[nodiscard]] std::size_t estimate(cuco::cuda_stream_ref stream = {}) const;
 
+  /**
+   * @brief Get device ref.
+   *
+   * @return Device ref object of the current `distinct_count_estimator` host object
+   */
   [[nodiscard]] ref_type<> ref() const noexcept;
 
  private:
-  std::unique_ptr<impl_type> impl_;
+  std::unique_ptr<impl_type> impl_;  ///< Implementation object
 };
 }  // namespace cuco
 
diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh
index 5787e3f47..a42671812 100644
--- a/include/cuco/distinct_count_estimator_ref.cuh
+++ b/include/cuco/distinct_count_estimator_ref.cuh
@@ -22,39 +22,91 @@
 #include <cooperative_groups.h>
 
 namespace cuco {
+/**
+ * @brief A GPU-accelerated utility for approximating the number of distinct items in a multiset.
+ *
+ * @note This implementation is based on the HyperLogLog++ algorithm:
+ * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf.
+ * @note The `Precision` parameter can be used to trade runtime/memory footprint for better
+ * accuracy. A higher value corresponds to a more accurate result, however, setting the precision
+ * too high will result in diminishing returns.
+ *
+ * @tparam T Type of items to count
+ * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy
+ * @tparam Scope The scope in which operations will be performed by individual threads
+ * @tparam Hash Hash function used to hash items
+ */
 template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
 class distinct_count_estimator_ref {
   using impl_type = detail::hyperloglog_ref<T, Precision, Scope, Hash>;
 
  public:
   static constexpr auto thread_scope = impl_type::thread_scope;  ///< CUDA thread scope
-  static constexpr auto precision    = impl_type::precision;
+  static constexpr auto precision    = impl_type::precision;     ///< Precision
+
+  using storage_type = typename impl_type::storage_type;  ///< Storage type
 
-  using storage_type = typename impl_type::storage_type;
   template <cuda::thread_scope NewScope>
-  using with_scope = distinct_count_estimator_ref<T, Precision, NewScope, Hash>;
+  using with_scope =
+    distinct_count_estimator_ref<T, Precision, NewScope, Hash>;  ///< Ref type with different thread
+                                                                 ///< scope
 
   // TODO let storage_type be inferred?
+  /**
+   * @brief Constructs a non-owning `distinct_count_estimator_ref` object.
+   *
+   * @param storage Reference to storage object of type `storage_type`
+   * @param scope The scope in which operations will be performed
+   * @param hash The hash function used to hash items
+   */
   __host__ __device__ constexpr distinct_count_estimator_ref(
     storage_type& storage,
     cuco::cuda_thread_scope<Scope> scope = {},
     Hash const& hash                     = {}) noexcept;
 
+  /**
+   * @brief Resets the estimator, i.e., clears the current count estimate.
+   *
+   * @tparam CG CUDA Cooperative Group type
+   *
+   * @param group CUDA Cooperative group this operation is executed in
+   */
   template <class CG>
   __device__ void clear(CG const& group) noexcept;
 
+  /**
+   * @brief Adds an item to the estimator.
+   *
+   * @param item The item to be counted
+   */
   __device__ void add(T const& item) noexcept;
 
+  /**
+   * @brief Merges the result of `other` estimator reference into `*this` estimator reference.
+   *
+   * @tparam CG CUDA Cooperative Group type
+   * @tparam OtherScope Thread scope of `other` estimator
+   *
+   * @param group CUDA Cooperative group this operation is executed in
+   * @param other Other estimator reference to be merged into `*this`
+   */
   template <class CG, cuda::thread_scope OtherScope>
   __device__ void merge(
     CG const& group,
     distinct_count_estimator_ref<T, Precision, OtherScope, Hash> const& other) noexcept;
 
+  /**
+   * @brief Compute the estimated distinct items count.
+   *
+   * @param group CUDA thread block group this operation is executed in
+   *
+   * @return Approximate distinct items count
+   */
   [[nodiscard]] __device__ std::size_t estimate(
     cooperative_groups::thread_block const& group) const noexcept;
 
  private:
-  impl_type impl_;
+  impl_type impl_;  ///< Implementation object
 };
 }  // namespace cuco
 

From 1c780c25b63f1fec2a789982c55dc00f8e45c95c Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 25 Jan 2024 22:06:38 +0000
Subject: [PATCH 08/78] Add benchmark

---
 benchmarks/CMakeLists.txt                     |   5 +
 benchmarks/distinct_count_estimator_bench.cu  | 135 ++++++++++++++++++
 benchmarks/utils.hpp                          |   2 +
 .../cuco/detail/hyperloglog/hyperloglog.cuh   |   1 +
 .../detail/hyperloglog/hyperloglog_ref.cuh    |   1 +
 include/cuco/distinct_count_estimator.cuh     |   1 +
 include/cuco/distinct_count_estimator_ref.cuh |   1 +
 7 files changed, 146 insertions(+)
 create mode 100644 benchmarks/distinct_count_estimator_bench.cu

diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 6b03cb98c..da57a1055 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -84,3 +84,8 @@ ConfigureBench(DYNAMIC_MAP_BENCH
 # - hash function benchmarks ----------------------------------------------------------------------
 ConfigureBench(HASH_BENCH
   hash_bench.cu)
+
+###################################################################################################
+# - distinct_count_estimator benchmarks -----------------------------------------------------------
+ConfigureBench(DISTINCT_COUNT_ESTIMATOR_BENCH
+  distinct_count_estimator_bench.cu)
diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu
new file mode 100644
index 000000000..c52025c6c
--- /dev/null
+++ b/benchmarks/distinct_count_estimator_bench.cu
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <defaults.hpp>
+#include <utils.hpp>
+
+#include <cuco/distinct_count_estimator.cuh>
+#include <cuco/static_set.cuh>
+#include <cuco/utility/key_generator.cuh>
+
+#include <nvbench/nvbench.cuh>
+
+#include <thrust/device_vector.h>
+
+#include <cstddef>
+
+using namespace cuco::benchmark;
+using namespace cuco::utility;
+
+template <typename T, typename InputIt>
+[[nodiscard]] std::size_t exact_distinct_count(InputIt first, InputIt last)
+{
+  // TODO don't use detail ns in user land
+  auto const num_items = cuco::detail::distance(first, last);
+  if (num_items == 0) { return 0; }
+
+  auto set = cuco::static_set{num_items, cuco::empty_key<T>{-1}};
+  set.insert(first, last);
+  return set.size();
+}
+
+/**
+ * @brief A benchmark evaluating `cuco::distinct_count_estimator` end-to-end performance
+ */
+template <typename Estimator, typename Dist>
+void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list<Estimator, Dist>)
+{
+  using T = typename Estimator::value_type;
+
+  auto const num_items = state.get_int64_or_default("NumInputs", 1ull << 30);
+
+  thrust::device_vector<T> items(num_items);
+
+  key_generator gen;
+  gen.generate(dist_from_state<Dist>(state), items.begin(), items.end());
+
+  state.add_element_count(num_items);
+  state.add_global_memory_reads<T>(num_items, "InputSize");
+
+  Estimator estimator;
+  estimator.add(items.begin(), items.end());
+
+  double estimated_cardinality  = estimator.estimate();
+  double const true_cardinality = exact_distinct_count<T>(items.begin(), items.end());
+  auto const relative_error     = abs(true_cardinality - estimated_cardinality) / true_cardinality;
+
+  auto& summ = state.add_summary("RelativeError");
+  summ.set_string("hint", "RelErr");
+  summ.set_string("short_name", "RelativeError");
+  summ.set_string("description", "Relative approximation error.");
+  summ.set_float64("value", relative_error);
+
+  estimator.clear();
+  state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
+             [&](nvbench::launch& launch, auto& timer) {
+               estimator.clear_async({launch.get_stream()});
+
+               timer.start();
+               estimator.add_async(items.begin(), items.end(), {launch.get_stream()});
+               estimated_cardinality = estimator.estimate({launch.get_stream()});
+               timer.stop();
+             });
+}
+
+/**
+ * @brief A benchmark evaluating `cuco::distinct_count_estimator::add_async` performance
+ */
+template <typename Estimator, typename Dist>
+void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list<Estimator, Dist>)
+{
+  using T = typename Estimator::value_type;
+
+  auto const num_items = state.get_int64_or_default("NumInputs", 1ull << 30);
+
+  thrust::device_vector<T> items(num_items);
+
+  key_generator gen;
+  gen.generate(dist_from_state<Dist>(state), items.begin(), items.end());
+
+  state.add_element_count(num_items);
+  state.add_global_memory_reads<T>(num_items, "InputSize");
+
+  Estimator estimator;
+  state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
+    estimator.clear_async({launch.get_stream()});
+
+    timer.start();
+    estimator.add_async(items.begin(), items.end(), {launch.get_stream()});
+    timer.stop();
+  });
+}
+
+using ESTIMATOR_RANGE = nvbench::type_list<cuco::distinct_count_estimator<nvbench::int32_t, 8>,
+                                           cuco::distinct_count_estimator<nvbench::int32_t, 9>,
+                                           cuco::distinct_count_estimator<nvbench::int32_t, 10>,
+                                           cuco::distinct_count_estimator<nvbench::int32_t, 11>,
+                                           cuco::distinct_count_estimator<nvbench::int32_t, 12>,
+                                           cuco::distinct_count_estimator<nvbench::int32_t, 13>,
+                                           cuco::distinct_count_estimator<nvbench::int64_t, 11>,
+                                           cuco::distinct_count_estimator<nvbench::int64_t, 12>>;
+
+NVBENCH_BENCH_TYPES(distinct_count_estimator_e2e,
+                    NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list<distribution::unique>))
+  .set_name("distinct_count_estimator")
+  .set_type_axes_names({"Estimator", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE);
+
+NVBENCH_BENCH_TYPES(distinct_count_estimator_add,
+                    NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list<distribution::unique>))
+  .set_name("distinct_count_estimator::add")
+  .set_type_axes_names({"Estimator", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE);
\ No newline at end of file
diff --git a/benchmarks/utils.hpp b/benchmarks/utils.hpp
index 392cafe06..97ca4988f 100644
--- a/benchmarks/utils.hpp
+++ b/benchmarks/utils.hpp
@@ -21,6 +21,8 @@
 
 #include <nvbench/nvbench.cuh>
 
+#include <cuda/std/atomic>  // thread_scope
+
 namespace cuco::benchmark {
 
 template <typename Dist>
diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index 3bb032105..c969be259 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -55,6 +55,7 @@ class hyperloglog {
                                                                    ///< type
 
   using allocator_type         = Allocator;                          ///< Allocator type
+  using value_type             = typename ref_type<>::value_type;    ///< Type of items to count
   using storage_type           = typename ref_type<>::storage_type;  ///< Storage type
   using storage_allocator_type = typename std::allocator_traits<Allocator>::template rebind_alloc<
     storage_type>;  ///< Storage allocator type
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index c6073a265..2de123946 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -49,6 +49,7 @@ class hyperloglog_ref {
   static constexpr auto thread_scope = Scope;      ///< CUDA thread scope
   static constexpr auto precision    = Precision;  ///< Precision
 
+  using value_type   = T;                                       ///< Type of items to count
   using storage_type = hyperloglog_dense_registers<Precision>;  ///< Storage type
 
   template <cuda::thread_scope NewScope>
diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh
index 5b18d7f40..16b943ac9 100644
--- a/include/cuco/distinct_count_estimator.cuh
+++ b/include/cuco/distinct_count_estimator.cuh
@@ -59,6 +59,7 @@ class distinct_count_estimator {
     cuco::distinct_count_estimator_ref<T, Precision, NewScope, Hash>;  ///< Non-owning reference
                                                                        ///< type
 
+  using value_type     = typename impl_type::value_type;      ///< Type of items to count
   using allocator_type = typename impl_type::allocator_type;  ///< Allocator type
   using storage_type   = typename impl_type::storage_type;    ///< Storage type
 
diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh
index a42671812..d32b6c4e0 100644
--- a/include/cuco/distinct_count_estimator_ref.cuh
+++ b/include/cuco/distinct_count_estimator_ref.cuh
@@ -44,6 +44,7 @@ class distinct_count_estimator_ref {
   static constexpr auto thread_scope = impl_type::thread_scope;  ///< CUDA thread scope
   static constexpr auto precision    = impl_type::precision;     ///< Precision
 
+  using value_type   = typename impl_type::value_type;    ///< Type of items to count
   using storage_type = typename impl_type::storage_type;  ///< Storage type
 
   template <cuda::thread_scope NewScope>

From b478e010ebfc0d3a4875f003324b3a1d74036dc8 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Tue, 30 Jan 2024 12:08:52 +0000
Subject: [PATCH 09/78] Remove scope ctor parameter for now

---
 .../distinct_count_estimator.inl                      |  7 ++-----
 .../distinct_count_estimator_ref.inl                  |  6 ++----
 include/cuco/detail/hyperloglog/hyperloglog.cuh       | 11 +++--------
 include/cuco/detail/hyperloglog/hyperloglog_ref.cuh   |  6 +-----
 include/cuco/detail/hyperloglog/kernels.cuh           |  2 +-
 include/cuco/distinct_count_estimator.cuh             | 10 ++++------
 include/cuco/distinct_count_estimator_ref.cuh         |  7 ++-----
 7 files changed, 15 insertions(+), 34 deletions(-)

diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
index 7013bc956..413d7ee7b 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
@@ -18,11 +18,8 @@ namespace cuco {
 
 template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
 constexpr distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::distinct_count_estimator(
-  cuco::cuda_thread_scope<Scope> scope,
-  Hash const& hash,
-  Allocator const& alloc,
-  cuco::cuda_stream_ref stream)
-  : impl_{std::make_unique<impl_type>(scope, hash, alloc, stream)}
+  Hash const& hash, Allocator const& alloc, cuco::cuda_stream_ref stream)
+  : impl_{std::make_unique<impl_type>(hash, alloc, stream)}
 {
 }
 
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
index 1359033d0..26fc9bd99 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
@@ -18,10 +18,8 @@ namespace cuco {
 
 template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
 __host__ __device__ constexpr distinct_count_estimator_ref<T, Precision, Scope, Hash>::
-  distinct_count_estimator_ref(storage_type& storage,
-                               cuco::cuda_thread_scope<Scope> scope,
-                               Hash const& hash) noexcept
-  : impl_{storage, scope, hash}
+  distinct_count_estimator_ref(storage_type& storage, Hash const& hash) noexcept
+  : impl_{storage, hash}
 {
 }
 
diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index c969be259..af303a921 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -69,12 +69,7 @@ class hyperloglog {
    * @param alloc Allocator used for allocating device storage
    * @param stream CUDA stream used to initialize the object
    */
-  // Doxygen cannot document unnamed parameter for scope, see
-  // https://github.com/doxygen/doxygen/issues/6926
-  constexpr hyperloglog(cuco::cuda_thread_scope<Scope>,
-                        Hash const& hash,
-                        Allocator const& alloc,
-                        cuco::cuda_stream_ref stream)
+  constexpr hyperloglog(Hash const& hash, Allocator const& alloc, cuco::cuda_stream_ref stream)
     : hash_{hash},
       storage_allocator_{alloc},
       storage_deleter_{storage_allocator_},
@@ -85,7 +80,7 @@ class hyperloglog {
 
   ~hyperloglog() = default;
 
-  hyperloglog(hyperloglog const&) = delete;
+  hyperloglog(hyperloglog const&)            = delete;
   hyperloglog& operator=(hyperloglog const&) = delete;
   hyperloglog(hyperloglog&&)                 = default;  ///< Move constructor
 
@@ -301,7 +296,7 @@ class hyperloglog {
    */
   [[nodiscard]] ref_type<> ref() const noexcept
   {
-    return ref_type<>{*(this->storage_.get()), {}, this->hash_};
+    return ref_type<>{*(this->storage_.get()), this->hash_};
   }
 
  private:
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index 2de123946..e41f47ef6 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -62,11 +62,7 @@ class hyperloglog_ref {
    * @param storage Reference to storage object of type `storage_type`
    * @param hash The hash function used to hash items
    */
-  // Doxygen cannot document unnamed parameter for scope, see
-  // https://github.com/doxygen/doxygen/issues/6926
-  __host__ __device__ constexpr hyperloglog_ref(storage_type& storage,
-                                                cuco::cuda_thread_scope<Scope>,
-                                                Hash const& hash) noexcept
+  __host__ __device__ constexpr hyperloglog_ref(storage_type& storage, Hash const& hash) noexcept
     : hash_{hash}, storage_{storage}
   {
   }
diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh
index c7b9bc018..fd3a2a877 100644
--- a/include/cuco/detail/hyperloglog/kernels.cuh
+++ b/include/cuco/detail/hyperloglog/kernels.cuh
@@ -43,7 +43,7 @@ CUCO_KERNEL void add_shmem(InputIt first, cuco::detail::index_type n, RefType re
   auto idx               = cuco::detail::global_thread_id();
   auto const block       = cooperative_groups::this_thread_block();
 
-  local_ref_type local_ref(local_storage, {}, {});
+  local_ref_type local_ref(local_storage, {});
   local_ref.clear(block);
   block.sync();
 
diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh
index 16b943ac9..5a9a16c85 100644
--- a/include/cuco/distinct_count_estimator.cuh
+++ b/include/cuco/distinct_count_estimator.cuh
@@ -69,19 +69,17 @@ class distinct_count_estimator {
    *
    * @note This function synchronizes the given stream.
    *
-   * @param scope The scope in which operations will be performed
    * @param hash The hash function used to hash items
    * @param alloc Allocator used for allocating device storage
    * @param stream CUDA stream used to initialize the object
    */
-  constexpr distinct_count_estimator(cuco::cuda_thread_scope<Scope> scope = {},
-                                     Hash const& hash                     = {},
-                                     Allocator const& alloc               = {},
-                                     cuco::cuda_stream_ref stream         = {});
+  constexpr distinct_count_estimator(Hash const& hash             = {},
+                                     Allocator const& alloc       = {},
+                                     cuco::cuda_stream_ref stream = {});
 
   ~distinct_count_estimator() = default;
 
-  distinct_count_estimator(distinct_count_estimator const&) = delete;
+  distinct_count_estimator(distinct_count_estimator const&)            = delete;
   distinct_count_estimator& operator=(distinct_count_estimator const&) = delete;
   distinct_count_estimator(distinct_count_estimator&&) = default;  ///< Move constructor
 
diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh
index d32b6c4e0..256183082 100644
--- a/include/cuco/distinct_count_estimator_ref.cuh
+++ b/include/cuco/distinct_count_estimator_ref.cuh
@@ -57,13 +57,10 @@ class distinct_count_estimator_ref {
    * @brief Constructs a non-owning `distinct_count_estimator_ref` object.
    *
    * @param storage Reference to storage object of type `storage_type`
-   * @param scope The scope in which operations will be performed
    * @param hash The hash function used to hash items
    */
-  __host__ __device__ constexpr distinct_count_estimator_ref(
-    storage_type& storage,
-    cuco::cuda_thread_scope<Scope> scope = {},
-    Hash const& hash                     = {}) noexcept;
+  __host__ __device__ constexpr distinct_count_estimator_ref(storage_type& storage,
+                                                             Hash const& hash = {}) noexcept;
 
   /**
    * @brief Resets the estimator, i.e., clears the current count estimate.

From e3d401a7970dc6851ac45ce9017eca64e3cc586f Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Tue, 30 Jan 2024 19:24:46 +0000
Subject: [PATCH 10/78] Update benchmark

---
 benchmarks/distinct_count_estimator_bench.cu | 42 ++++++++++++--------
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu
index c52025c6c..7ceb305b4 100644
--- a/benchmarks/distinct_count_estimator_bench.cu
+++ b/benchmarks/distinct_count_estimator_bench.cu
@@ -24,21 +24,26 @@
 #include <nvbench/nvbench.cuh>
 
 #include <thrust/device_vector.h>
+#include <thrust/iterator/transform_iterator.h>
 
 #include <cstddef>
+#include <cuda/functional>
 
 using namespace cuco::benchmark;
 using namespace cuco::utility;
 
-template <typename T, typename InputIt>
-[[nodiscard]] std::size_t exact_distinct_count(InputIt first, InputIt last)
+template <typename InputIt>
+[[nodiscard]] std::size_t exact_distinct_count(InputIt first, std::size_t n)
 {
-  // TODO don't use detail ns in user land
-  auto const num_items = cuco::detail::distance(first, last);
-  if (num_items == 0) { return 0; }
+  // TODO static_set currently only supports types up-to 8-bytes in size.
+  // Casting is valid since the keys generated are representable in int64_t.
+  using T = std::int64_t;
 
-  auto set = cuco::static_set{num_items, cuco::empty_key<T>{-1}};
-  set.insert(first, last);
+  auto cast_iter = thrust::make_transform_iterator(
+    first, cuda::proclaim_return_type<T>([] __device__(auto i) { return static_cast<T>(i); }));
+
+  auto set = cuco::static_set{n, 0.8, cuco::empty_key<T>{-1}};
+  set.insert(cast_iter, cast_iter + n);
   return set.size();
 }
 
@@ -50,7 +55,7 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list<Esti
 {
   using T = typename Estimator::value_type;
 
-  auto const num_items = state.get_int64_or_default("NumInputs", 1ull << 30);
+  auto const num_items = state.get_int64("NumInputs");
 
   thrust::device_vector<T> items(num_items);
 
@@ -64,7 +69,7 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list<Esti
   estimator.add(items.begin(), items.end());
 
   double estimated_cardinality  = estimator.estimate();
-  double const true_cardinality = exact_distinct_count<T>(items.begin(), items.end());
+  double const true_cardinality = exact_distinct_count(items.begin(), num_items);
   auto const relative_error     = abs(true_cardinality - estimated_cardinality) / true_cardinality;
 
   auto& summ = state.add_summary("RelativeError");
@@ -93,7 +98,7 @@ void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list<Esti
 {
   using T = typename Estimator::value_type;
 
-  auto const num_items = state.get_int64_or_default("NumInputs", 1ull << 30);
+  auto const num_items = state.get_int64("NumInputs");
 
   thrust::device_vector<T> items(num_items);
 
@@ -113,23 +118,26 @@ void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list<Esti
   });
 }
 
-using ESTIMATOR_RANGE = nvbench::type_list<cuco::distinct_count_estimator<nvbench::int32_t, 8>,
-                                           cuco::distinct_count_estimator<nvbench::int32_t, 9>,
-                                           cuco::distinct_count_estimator<nvbench::int32_t, 10>,
+using ESTIMATOR_RANGE = nvbench::type_list<cuco::distinct_count_estimator<nvbench::int32_t, 10>,
                                            cuco::distinct_count_estimator<nvbench::int32_t, 11>,
                                            cuco::distinct_count_estimator<nvbench::int32_t, 12>,
-                                           cuco::distinct_count_estimator<nvbench::int32_t, 13>,
+                                           cuco::distinct_count_estimator<nvbench::int64_t, 10>,
                                            cuco::distinct_count_estimator<nvbench::int64_t, 11>,
-                                           cuco::distinct_count_estimator<nvbench::int64_t, 12>>;
+                                           cuco::distinct_count_estimator<nvbench::int64_t, 12>,
+                                           cuco::distinct_count_estimator<__int128_t, 10>,
+                                           cuco::distinct_count_estimator<__int128_t, 11>,
+                                           cuco::distinct_count_estimator<__int128_t, 12>>;
 
 NVBENCH_BENCH_TYPES(distinct_count_estimator_e2e,
                     NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list<distribution::unique>))
-  .set_name("distinct_count_estimator")
+  .set_name("distinct_count_estimator_e2e")
   .set_type_axes_names({"Estimator", "Distribution"})
+  .add_int64_power_of_two_axis("NumInputs", {28, 29, 30})
   .set_max_noise(defaults::MAX_NOISE);
 
 NVBENCH_BENCH_TYPES(distinct_count_estimator_add,
                     NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list<distribution::unique>))
-  .set_name("distinct_count_estimator::add")
+  .set_name("distinct_count_estimator::add_async")
   .set_type_axes_names({"Estimator", "Distribution"})
+  .add_int64_power_of_two_axis("NumInputs", {28, 29, 30})
   .set_max_noise(defaults::MAX_NOISE);
\ No newline at end of file

From 56520a604eca7dda1e64a4d107a62b0310ffc676 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 31 Jan 2024 18:08:12 +0000
Subject: [PATCH 11/78] Select cg reduce impl based on nvcc version

---
 include/cuco/detail/__config                   |  9 +++++++++
 .../detail/hyperloglog/hyperloglog_ref.cuh     | 18 ++++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/include/cuco/detail/__config b/include/cuco/detail/__config
index c083fec86..fd3b6fce4 100644
--- a/include/cuco/detail/__config
+++ b/include/cuco/detail/__config
@@ -49,4 +49,13 @@
 
 #if defined(__SIZEOF_INT128__)
 #define CUCO_HAS_INT128
+#endif
+
+#if (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 8)
+#define CUCO_HAS_CG_EXPERIMENTAL_REDUCE_UPDATE_ASYNC
+#define _CG_ABI_EXPERIMENTAL
+#endif
+
+#if (__CUDACC_VER_MAJOR__ >= 12)
+#define CUCO_HAS_CG_REDUCE_UPDATE_ASYNC
 #endif
\ No newline at end of file
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index e41f47ef6..d6f362c5f 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -15,6 +15,7 @@
  */
 #pragma once
 
+#include <cuco/detail/__config>
 #include <cuco/detail/hyperloglog/finalizer.cuh>
 #include <cuco/detail/hyperloglog/storage.cuh>
 #include <cuco/hash_functions.cuh>
@@ -142,10 +143,27 @@ class hyperloglog_ref {
 
     // warp reduce Z and V
     auto const warp = cooperative_groups::tiled_partition<32>(group);
+#if defined(CUCO_HAS_CG_REDUCE_UPDATE_ASYNC)
     cooperative_groups::reduce_update_async(
       warp, block_sum, thread_sum, cooperative_groups::plus<fp_type>());
     cooperative_groups::reduce_update_async(
       warp, block_zeroes, thread_zeroes, cooperative_groups::plus<int>());
+#elif defined(CUCO_HAS_CG_EXPERIMENTAL_REDUCE_UPDATE_ASYNC)
+    cooperative_groups::experimental::reduce_update_async(
+      warp, block_sum, thread_sum, cooperative_groups::plus<fp_type>());
+    cooperative_groups::experimental::reduce_update_async(
+      warp, block_zeroes, thread_zeroes, cooperative_groups::plus<int>());
+#else
+    auto const warp_sum =
+      cooperative_groups::reduce(warp, thread_sum, cooperative_groups::plus<fp_type>());
+    auto const warp_zeroes =
+      cooperative_groups::reduce(warp, thread_zeroes, cooperative_groups::plus<int>());
+    // TODO warp sync needed?
+    if (warp.thread_rank() == 0) {
+      block_sum.fetch_add(warp_sum, cuda::std::memory_order_relaxed);
+      block_zeroes.fetch_add(warp_zeroes, cuda::std::memory_order_relaxed);
+    }
+#endif
     group.sync();
 
     if (group.thread_rank() == 0) {

From 367377228d5ce682e4c253ecbc6cd3bf44d1c9a4 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 31 Jan 2024 22:33:20 +0000
Subject: [PATCH 12/78] Re-format tuning header

---
 include/cuco/detail/hyperloglog/tuning.cuh | 2573 +-------------------
 1 file changed, 35 insertions(+), 2538 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/tuning.cuh b/include/cuco/detail/hyperloglog/tuning.cuh
index f49e43e24..4d4a69067 100644
--- a/include/cuco/detail/hyperloglog/tuning.cuh
+++ b/include/cuco/detail/hyperloglog/tuning.cuh
@@ -26,2552 +26,49 @@ namespace cuco::hyperloglog_ns::detail {
 #define CUCO_HLL_TUNING_ARR_DECL __device__ static cuda::std::array constexpr
 #endif
 
-CUCO_HLL_TUNING_ARR_DECL thresholds{10.0,
-                                    20.0,
-                                    40.0,
-                                    80.0,
-                                    220.0,
-                                    400.0,
-                                    900.0,
-                                    1800.0,
-                                    3100.0,
-                                    6500.0,
-                                    15500.0,
-                                    20000.0,
-                                    50000.0,
-                                    120000.0,
-                                    350000.0};
+// clang-format off
+CUCO_HLL_TUNING_ARR_DECL thresholds{10.0, 20.0, 40.0, 80.0, 220.0, 400.0, 900.0, 1800.0, 3100.0, 6500.0, 15500.0, 20000.0, 50000.0, 120000.0, 350000.0};
 
 // HLL++ uses an interpolation method over the raw estimated cardinality to select the optimal bias.
 // Parameters/interpolation points taken from
 // https://docs.google.com/document/d/1gyjfMHy43U9OWBXxfaeG-3MjGzejW1dlpyMwEYAAWEI/mobilebasic
-CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p4{
-  11.0,    11.717,  12.207,  12.7896, 13.2882, 13.8204, 14.3772, 14.9342, 15.5202, 16.161,
-  16.7722, 17.4636, 18.0396, 18.6766, 19.3566, 20.0454, 20.7936, 21.4856, 22.2666, 22.9946,
-  23.766,  24.4692, 25.3638, 26.0764, 26.7864, 27.7602, 28.4814, 29.433,  30.2926, 31.0664,
-  31.9996, 32.7956, 33.5366, 34.5894, 35.5738, 36.2698, 37.3682, 38.0544, 39.2342, 40.0108,
-  40.7966, 41.9298, 42.8704, 43.6358, 44.5194, 45.773,  46.6772, 47.6174, 48.4888, 49.3304,
-  50.2506, 51.4996, 52.3824, 53.3078, 54.3984, 55.5838, 56.6618, 57.2174, 58.3514, 59.0802,
-  60.1482, 61.0376, 62.3598, 62.8078, 63.9744, 64.914,  65.781,  67.1806, 68.0594, 68.8446,
-  69.7928, 70.8248, 71.8324, 72.8598, 73.6246, 74.7014, 75.393,  76.6708, 77.2394};
-CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p5{
-  23.0,     23.1194,  23.8208,  24.2318,  24.77,    25.2436,  25.7774,  26.2848,  26.8224,
-  27.3742,  27.9336,  28.503,   29.0494,  29.6292,  30.2124,  30.798,   31.367,   31.9728,
-  32.5944,  33.217,   33.8438,  34.3696,  35.0956,  35.7044,  36.324,   37.0668,  37.6698,
-  38.3644,  39.049,   39.6918,  40.4146,  41.082,   41.687,   42.5398,  43.2462,  43.857,
-  44.6606,  45.4168,  46.1248,  46.9222,  47.6804,  48.447,   49.3454,  49.9594,  50.7636,
-  51.5776,  52.331,   53.19,    53.9676,  54.7564,  55.5314,  56.4442,  57.3708,  57.9774,
-  58.9624,  59.8796,  60.755,   61.472,   62.2076,  63.1024,  63.8908,  64.7338,  65.7728,
-  66.629,   67.413,   68.3266,  69.1524,  70.2642,  71.1806,  72.0566,  72.9192,  73.7598,
-  74.3516,  75.5802,  76.4386,  77.4916,  78.1524,  79.1892,  79.8414,  80.8798,  81.8376,
-  82.4698,  83.7656,  84.331,   85.5914,  86.6012,  87.7016,  88.5582,  89.3394,  90.3544,
-  91.4912,  92.308,   93.3552,  93.9746,  95.2052,  95.727,   97.1322,  98.3944,  98.7588,
-  100.242,  101.1914, 102.2538, 102.8776, 103.6292, 105.1932, 105.9152, 107.0868, 107.6728,
-  108.7144, 110.3114, 110.8716, 111.245,  112.7908, 113.7064, 114.636,  115.7464, 116.1788,
-  117.7464, 118.4896, 119.6166, 120.5082, 121.7798, 122.9028, 123.4426, 124.8854, 125.705,
-  126.4652, 128.3464, 128.3462, 130.0398, 131.0342, 131.0042, 132.4766, 133.511,  134.7252,
-  135.425,  136.5172, 138.0572, 138.6694, 139.3712, 140.8598, 141.4594, 142.554,  143.4006,
-  144.7374, 146.1634, 146.8994, 147.605,  147.9304, 149.1636, 150.2468, 151.5876, 152.2096,
-  153.7032, 154.7146, 155.807,  156.9228, 157.0372, 158.5852};
-CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p6{
-  46.0,     46.1902,  47.271,   47.8358,  48.8142,  49.2854,  50.317,   51.354,   51.8924,
-  52.9436,  53.4596,  54.5262,  55.6248,  56.1574,  57.2822,  57.837,   58.9636,  60.074,
-  60.7042,  61.7976,  62.4772,  63.6564,  64.7942,  65.5004,  66.686,   67.291,   68.5672,
-  69.8556,  70.4982,  71.8204,  72.4252,  73.7744,  75.0786,  75.8344,  77.0294,  77.8098,
-  79.0794,  80.5732,  81.1878,  82.5648,  83.2902,  84.6784,  85.3352,  86.8946,  88.3712,
-  89.0852,  90.499,   91.2686,  92.6844,  94.2234,  94.9732,  96.3356,  97.2286,  98.7262,
-  100.3284, 101.1048, 102.5962, 103.3562, 105.1272, 106.4184, 107.4974, 109.0822, 109.856,
-  111.48,   113.2834, 114.0208, 115.637,  116.5174, 118.0576, 119.7476, 120.427,  122.1326,
-  123.2372, 125.2788, 126.6776, 127.7926, 129.1952, 129.9564, 131.6454, 133.87,   134.5428,
-  136.2,    137.0294, 138.6278, 139.6782, 141.792,  143.3516, 144.2832, 146.0394, 147.0748,
-  148.4912, 150.849,  151.696,  153.5404, 154.073,  156.3714, 157.7216, 158.7328, 160.4208,
-  161.4184, 163.9424, 165.2772, 166.411,  168.1308, 168.769,  170.9258, 172.6828, 173.7502,
-  175.706,  176.3886, 179.0186, 180.4518, 181.927,  183.4172, 184.4114, 186.033,  188.5124,
-  189.5564, 191.6008, 192.4172, 193.8044, 194.997,  197.4548, 198.8948, 200.2346, 202.3086,
-  203.1548, 204.8842, 206.6508, 206.6772, 209.7254, 210.4752, 212.7228, 214.6614, 215.1676,
-  217.793,  218.0006, 219.9052, 221.66,   223.5588, 225.1636, 225.6882, 227.7126, 229.4502,
-  231.1978, 232.9756, 233.1654, 236.727,  238.1974, 237.7474, 241.1346, 242.3048, 244.1948,
-  245.3134, 246.879,  249.1204, 249.853,  252.6792, 253.857,  254.4486, 257.2362, 257.9534,
-  260.0286, 260.5632, 262.663,  264.723,  265.7566, 267.2566, 267.1624, 270.62,   272.8216,
-  273.2166, 275.2056, 276.2202, 278.3726, 280.3344, 281.9284, 283.9728, 284.1924, 286.4872,
-  287.587,  289.807,  291.1206, 292.769,  294.8708, 296.665,  297.1182, 299.4012, 300.6352,
-  302.1354, 304.1756, 306.1606, 307.3462, 308.5214, 309.4134, 310.8352, 313.9684, 315.837,
-  316.7796, 318.9858};
-CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p7{
-  92.0,     93.4934,  94.9758,  96.4574,  97.9718,  99.4954,  101.5302, 103.0756, 104.6374,
-  106.1782, 107.7888, 109.9522, 111.592,  113.2532, 114.9086, 116.5938, 118.9474, 120.6796,
-  122.4394, 124.2176, 125.9768, 128.4214, 130.2528, 132.0102, 133.8658, 135.7278, 138.3044,
-  140.1316, 142.093,  144.0032, 145.9092, 148.6306, 150.5294, 152.5756, 154.6508, 156.662,
-  159.552,  161.3724, 163.617,  165.5754, 167.7872, 169.8444, 172.7988, 174.8606, 177.2118,
-  179.3566, 181.4476, 184.5882, 186.6816, 189.0824, 191.0258, 193.6048, 196.4436, 198.7274,
-  200.957,  203.147,  205.4364, 208.7592, 211.3386, 213.781,  215.8028, 218.656,  221.6544,
-  223.996,  226.4718, 229.1544, 231.6098, 234.5956, 237.0616, 239.5758, 242.4878, 244.5244,
-  248.2146, 250.724,  252.8722, 255.5198, 258.0414, 261.941,  264.9048, 266.87,   269.4304,
-  272.028,  274.4708, 278.37,   281.0624, 283.4668, 286.5532, 289.4352, 293.2564, 295.2744,
-  298.2118, 300.7472, 304.1456, 307.2928, 309.7504, 312.5528, 315.979,  318.2102, 322.1834,
-  324.3494, 327.325,  330.6614, 332.903,  337.2544, 339.9042, 343.215,  345.2864, 348.0814,
-  352.6764, 355.301,  357.139,  360.658,  363.1732, 366.5902, 369.9538, 373.0828, 375.922,
-  378.9902, 382.7328, 386.4538, 388.1136, 391.2234, 394.0878, 396.708,  401.1556, 404.1852,
-  406.6372, 409.6822, 412.7796, 416.6078, 418.4916, 422.131,  424.5376, 428.1988, 432.211,
-  434.4502, 438.5282, 440.912,  444.0448, 447.7432, 450.8524, 453.7988, 456.7858, 458.8868,
-  463.9886, 466.5064, 468.9124, 472.6616, 475.4682, 478.582,  481.304,  485.2738, 488.6894,
-  490.329,  496.106,  497.6908, 501.1374, 504.5322, 506.8848, 510.3324, 513.4512, 516.179,
-  520.4412, 522.6066, 526.167,  528.7794, 533.379,  536.067,  538.46,   542.9116, 545.692,
-  547.9546, 552.493,  555.2722, 557.335,  562.449,  564.2014, 569.0738, 571.0974, 574.8564,
-  578.2996, 581.409,  583.9704, 585.8098, 589.6528, 594.5998, 595.958,  600.068,  603.3278,
-  608.2016, 609.9632, 612.864,  615.43,   620.7794, 621.272,  625.8644, 629.206,  633.219,
-  634.5154, 638.6102};
-CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p8{
-  184.2152,  187.2454,  190.2096,  193.6652,  196.6312,  199.6822,  203.249,   206.3296,  210.0038,
-  213.2074,  216.4612,  220.27,    223.5178,  227.4412,  230.8032,  234.1634,  238.1688,  241.6074,
-  245.6946,  249.2664,  252.8228,  257.0432,  260.6824,  264.9464,  268.6268,  272.2626,  276.8376,
-  280.4034,  284.8956,  288.8522,  292.7638,  297.3552,  301.3556,  305.7526,  309.9292,  313.8954,
-  318.8198,  322.7668,  327.298,   331.6688,  335.9466,  340.9746,  345.1672,  349.3474,  354.3028,
-  358.8912,  364.114,   368.4646,  372.9744,  378.4092,  382.6022,  387.843,   392.5684,  397.1652,
-  402.5426,  407.4152,  412.5388,  417.3592,  422.1366,  427.486,   432.3918,  437.5076,  442.509,
-  447.3834,  453.3498,  458.0668,  463.7346,  469.1228,  473.4528,  479.7,     484.644,   491.0518,
-  495.5774,  500.9068,  506.432,   512.1666,  517.434,   522.6644,  527.4894,  533.6312,  538.3804,
-  544.292,   550.5496,  556.0234,  562.8206,  566.6146,  572.4188,  579.117,   583.6762,  590.6576,
-  595.7864,  601.509,   607.5334,  612.9204,  619.772,   624.2924,  630.8654,  636.1836,  642.745,
-  649.1316,  655.0386,  660.0136,  666.6342,  671.6196,  678.1866,  684.4282,  689.3324,  695.4794,
-  702.5038,  708.129,   713.528,   720.3204,  726.463,   732.7928,  739.123,   744.7418,  751.2192,
-  756.5102,  762.6066,  769.0184,  775.2224,  781.4014,  787.7618,  794.1436,  798.6506,  805.6378,
-  811.766,   819.7514,  824.5776,  828.7322,  837.8048,  843.6302,  849.9336,  854.4798,  861.3388,
-  867.9894,  873.8196,  880.3136,  886.2308,  892.4588,  899.0816,  905.4076,  912.0064,  917.3878,
-  923.619,   929.998,   937.3482,  943.9506,  947.991,   955.1144,  962.203,   968.8222,  975.7324,
-  981.7826,  988.7666,  994.2648,  1000.3128, 1007.4082, 1013.7536, 1020.3376, 1026.7156, 1031.7478,
-  1037.4292, 1045.393,  1051.2278, 1058.3434, 1062.8726, 1071.884,  1076.806,  1082.9176, 1089.1678,
-  1095.5032, 1102.525,  1107.2264, 1115.315,  1120.93,   1127.252,  1134.1496, 1139.0408, 1147.5448,
-  1153.3296, 1158.1974, 1166.5262, 1174.3328, 1175.657,  1184.4222, 1190.9172, 1197.1292, 1204.4606,
-  1210.4578, 1218.8728, 1225.3336, 1226.6592, 1236.5768, 1241.363,  1249.4074, 1254.6566, 1260.8014,
-  1266.5454, 1274.5192};
-CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p9{
-  369.0,     374.8294,  381.2452,  387.6698,  394.1464,  400.2024,  406.8782,  413.6598,  420.462,
-  427.2826,  433.7102,  440.7416,  447.9366,  455.1046,  462.285,   469.0668,  476.306,   483.8448,
-  491.301,   498.9886,  506.2422,  513.8138,  521.7074,  529.7428,  537.8402,  545.1664,  553.3534,
-  561.594,   569.6886,  577.7876,  585.65,    594.228,   602.8036,  611.1666,  620.0818,  628.0824,
-  637.2574,  646.302,   655.1644,  664.0056,  672.3802,  681.7192,  690.5234,  700.2084,  708.831,
-  718.485,   728.1112,  737.4764,  746.76,    756.3368,  766.5538,  775.5058,  785.2646,  795.5902,
-  804.3818,  814.8998,  824.9532,  835.2062,  845.2798,  854.4728,  864.9582,  875.3292,  886.171,
-  896.781,   906.5716,  916.7048,  927.5322,  937.875,   949.3972,  958.3464,  969.7274,  980.2834,
-  992.1444,  1003.4264, 1013.0166, 1024.018,  1035.0438, 1046.34,   1057.6856, 1068.9836, 1079.0312,
-  1091.677,  1102.3188, 1113.4846, 1124.4424, 1135.739,  1147.1488, 1158.9202, 1169.406,  1181.5342,
-  1193.2834, 1203.8954, 1216.3286, 1226.2146, 1239.6684, 1251.9946, 1262.123,  1275.4338, 1285.7378,
-  1296.076,  1308.9692, 1320.4964, 1333.0998, 1343.9864, 1357.7754, 1368.3208, 1380.4838, 1392.7388,
-  1406.0758, 1416.9098, 1428.9728, 1440.9228, 1453.9292, 1462.617,  1476.05,   1490.2996, 1500.6128,
-  1513.7392, 1524.5174, 1536.6322, 1548.2584, 1562.3766, 1572.423,  1587.1232, 1596.5164, 1610.5938,
-  1622.5972, 1633.1222, 1647.7674, 1658.5044, 1671.57,   1683.7044, 1695.4142, 1708.7102, 1720.6094,
-  1732.6522, 1747.841,  1756.4072, 1769.9786, 1782.3276, 1797.5216, 1808.3186, 1819.0694, 1834.354,
-  1844.575,  1856.2808, 1871.1288, 1880.7852, 1893.9622, 1906.3418, 1920.6548, 1932.9302, 1945.8584,
-  1955.473,  1968.8248, 1980.6446, 1995.9598, 2008.349,  2019.8556, 2033.0334, 2044.0206, 2059.3956,
-  2069.9174, 2082.6084, 2093.7036, 2106.6108, 2118.9124, 2132.301,  2144.7628, 2159.8422, 2171.0212,
-  2183.101,  2193.5112, 2208.052,  2221.3194, 2233.3282, 2247.295,  2257.7222, 2273.342,  2286.5638,
-  2299.6786, 2310.8114, 2322.3312, 2335.516,  2349.874,  2363.5968, 2373.865,  2387.1918, 2401.8328,
-  2414.8496, 2424.544,  2436.7592, 2447.1682, 2464.1958, 2474.3438, 2489.0006, 2497.4526, 2513.6586,
-  2527.19,   2540.7028, 2553.768};
-CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p10{
-  738.1256,  750.4234,  763.1064,  775.4732,  788.4636,  801.0644,  814.488,   827.9654,  841.0832,
-  854.7864,  868.1992,  882.2176,  896.5228,  910.1716,  924.7752,  938.899,   953.6126,  968.6492,
-  982.9474,  998.5214,  1013.1064, 1028.6364, 1044.2468, 1059.4588, 1075.3832, 1091.0584, 1106.8606,
-  1123.3868, 1139.5062, 1156.1862, 1172.463,  1189.339,  1206.1936, 1223.1292, 1240.1854, 1257.2908,
-  1275.3324, 1292.8518, 1310.5204, 1328.4854, 1345.9318, 1364.552,  1381.4658, 1400.4256, 1419.849,
-  1438.152,  1456.8956, 1474.8792, 1494.118,  1513.62,   1532.5132, 1551.9322, 1570.7726, 1590.6086,
-  1610.5332, 1630.5918, 1650.4294, 1669.7662, 1690.4106, 1710.7338, 1730.9012, 1750.4486, 1770.1556,
-  1791.6338, 1812.7312, 1833.6264, 1853.9526, 1874.8742, 1896.8326, 1918.1966, 1939.5594, 1961.07,
-  1983.037,  2003.1804, 2026.071,  2047.4884, 2070.0848, 2091.2944, 2114.333,  2135.9626, 2158.2902,
-  2181.0814, 2202.0334, 2224.4832, 2246.39,   2269.7202, 2292.1714, 2314.2358, 2338.9346, 2360.891,
-  2384.0264, 2408.3834, 2430.1544, 2454.8684, 2476.9896, 2501.4368, 2522.8702, 2548.0408, 2570.6738,
-  2593.5208, 2617.0158, 2640.2302, 2664.0962, 2687.4986, 2714.2588, 2735.3914, 2759.6244, 2781.8378,
-  2808.0072, 2830.6516, 2856.2454, 2877.2136, 2903.4546, 2926.785,  2951.2294, 2976.468,  3000.867,
-  3023.6508, 3049.91,   3073.5984, 3098.162,  3121.5564, 3146.2328, 3170.9484, 3195.5902, 3221.3346,
-  3242.7032, 3271.6112, 3296.5546, 3317.7376, 3345.072,  3369.9518, 3394.326,  3418.1818, 3444.6926,
-  3469.086,  3494.2754, 3517.8698, 3544.248,  3565.3768, 3588.7234, 3616.979,  3643.7504, 3668.6812,
-  3695.72,   3719.7392, 3742.6224, 3770.4456, 3795.6602, 3819.9058, 3844.002,  3869.517,  3895.6824,
-  3920.8622, 3947.1364, 3973.985,  3995.4772, 4021.62,   4046.628,  4074.65,   4096.2256, 4121.831,
-  4146.6406, 4173.276,  4195.0744, 4223.9696, 4251.3708, 4272.9966, 4300.8046, 4326.302,  4353.1248,
-  4374.312,  4403.0322, 4426.819,  4450.0598, 4478.5206, 4504.8116, 4528.8928, 4553.9584, 4578.8712,
-  4603.8384, 4632.3872, 4655.5128, 4675.821,  4704.6222, 4731.9862, 4755.4174, 4781.2628, 4804.332,
-  4832.3048, 4862.8752, 4883.4148, 4906.9544, 4935.3516, 4954.3532, 4984.0248, 5011.217,  5035.3258,
-  5057.3672, 5084.1828};
-CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p11{
-  1477.0,    1501.6014, 1526.5802, 1551.7942, 1577.3042, 1603.2062,  1629.8402,  1656.2292,
-  1682.9462, 1709.9926, 1737.3026, 1765.4252, 1793.0578, 1821.6092,  1849.626,   1878.5568,
-  1908.527,  1937.5154, 1967.1874, 1997.3878, 2027.37,   2058.1972,  2089.5728,  2120.1012,
-  2151.9668, 2183.292,  2216.0772, 2247.8578, 2280.6562, 2313.041,   2345.714,   2380.3112,
-  2414.1806, 2447.9854, 2481.656,  2516.346,  2551.5154, 2586.8378,  2621.7448,  2656.6722,
-  2693.5722, 2729.1462, 2765.4124, 2802.8728, 2838.898,  2876.408,   2913.4926,  2951.4938,
-  2989.6776, 3026.282,  3065.7704, 3104.1012, 3143.7388, 3181.6876,  3221.1872,  3261.5048,
-  3300.0214, 3339.806,  3381.409,  3421.4144, 3461.4294, 3502.2286,  3544.651,   3586.6156,
-  3627.337,  3670.083,  3711.1538, 3753.5094, 3797.01,   3838.6686,  3882.1678,  3922.8116,
-  3967.9978, 4009.9204, 4054.3286, 4097.5706, 4140.6014, 4185.544,   4229.5976,  4274.583,
-  4316.9438, 4361.672,  4406.2786, 4451.8628, 4496.1834, 4543.505,   4589.1816,  4632.5188,
-  4678.2294, 4724.8908, 4769.0194, 4817.052,  4861.4588, 4910.1596,  4956.4344,  5002.5238,
-  5048.13,   5093.6374, 5142.8162, 5187.7894, 5237.3984, 5285.6078,  5331.0858,  5379.1036,
-  5428.6258, 5474.6018, 5522.7618, 5571.5822, 5618.59,   5667.9992,  5714.88,    5763.454,
-  5808.6982, 5860.3644, 5910.2914, 5953.571,  6005.9232, 6055.1914,  6104.5882,  6154.5702,
-  6199.7036, 6251.1764, 6298.7596, 6350.0302, 6398.061,  6448.4694,  6495.933,   6548.0474,
-  6597.7166, 6646.9416, 6695.9208, 6742.6328, 6793.5276, 6842.1934,  6894.2372,  6945.3864,
-  6996.9228, 7044.2372, 7094.1374, 7142.2272, 7192.2942, 7238.8338,  7288.9006,  7344.0908,
-  7394.8544, 7443.5176, 7490.4148, 7542.9314, 7595.6738, 7641.9878,  7694.3688,  7743.0448,
-  7797.522,  7845.53,   7899.594,  7950.3132, 7996.455,  8050.9442,  8092.9114,  8153.1374,
-  8197.4472, 8252.8278, 8301.8728, 8348.6776, 8401.4698, 8453.551,   8504.6598,  8553.8944,
-  8604.1276, 8657.6514, 8710.3062, 8758.908,  8807.8706, 8862.1702,  8910.4668,  8960.77,
-  9007.2766, 9063.164,  9121.0534, 9164.1354, 9218.1594, 9267.767,   9319.0594,  9372.155,
-  9419.7126, 9474.3722, 9520.1338, 9572.368,  9622.7702, 9675.8448,  9726.5396,  9778.7378,
-  9827.6554, 9878.1922, 9928.7782, 9978.3984, 10026.578, 10076.5626, 10137.1618, 10177.5244,
-  10229.9176};
-CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p12{
-  2954.0,     3003.4782,  3053.3568,  3104.3666,  3155.324,   3206.9598,  3259.648,   3312.539,
-  3366.1474,  3420.2576,  3474.8376,  3530.6076,  3586.451,   3643.38,    3700.4104,  3757.5638,
-  3815.9676,  3875.193,   3934.838,   3994.8548,  4055.018,   4117.1742,  4178.4482,  4241.1294,
-  4304.4776,  4367.4044,  4431.8724,  4496.3732,  4561.4304,  4627.5326,  4693.949,   4761.5532,
-  4828.7256,  4897.6182,  4965.5186,  5034.4528,  5104.865,   5174.7164,  5244.6828,  5316.6708,
-  5387.8312,  5459.9036,  5532.476,   5604.8652,  5679.6718,  5753.757,   5830.2072,  5905.2828,
-  5980.0434,  6056.6264,  6134.3192,  6211.5746,  6290.0816,  6367.1176,  6447.9796,  6526.5576,
-  6606.1858,  6686.9144,  6766.1142,  6847.0818,  6927.9664,  7010.9096,  7091.0816,  7175.3962,
-  7260.3454,  7344.018,   7426.4214,  7511.3106,  7596.0686,  7679.8094,  7765.818,   7852.4248,
-  7936.834,   8022.363,   8109.5066,  8200.4554,  8288.5832,  8373.366,   8463.4808,  8549.7682,
-  8642.0522,  8728.3288,  8820.9528,  8907.727,   9001.0794,  9091.2522,  9179.988,   9269.852,
-  9362.6394,  9453.642,   9546.9024,  9640.6616,  9732.6622,  9824.3254,  9917.7484,  10007.9392,
-  10106.7508, 10196.2152, 10289.8114, 10383.5494, 10482.3064, 10576.8734, 10668.7872, 10764.7156,
-  10862.0196, 10952.793,  11049.9748, 11146.0702, 11241.4492, 11339.2772, 11434.2336, 11530.741,
-  11627.6136, 11726.311,  11821.5964, 11918.837,  12015.3724, 12113.0162, 12213.0424, 12306.9804,
-  12408.4518, 12504.8968, 12604.586,  12700.9332, 12798.705,  12898.5142, 12997.0488, 13094.788,
-  13198.475,  13292.7764, 13392.9698, 13486.8574, 13590.1616, 13686.5838, 13783.6264, 13887.2638,
-  13992.0978, 14081.0844, 14189.9956, 14280.0912, 14382.4956, 14486.4384, 14588.1082, 14686.2392,
-  14782.276,  14888.0284, 14985.1864, 15088.8596, 15187.0998, 15285.027,  15383.6694, 15495.8266,
-  15591.3736, 15694.2008, 15790.3246, 15898.4116, 15997.4522, 16095.5014, 16198.8514, 16291.7492,
-  16402.6424, 16499.1266, 16606.2436, 16697.7186, 16796.3946, 16902.3376, 17005.7672, 17100.814,
-  17206.8282, 17305.8262, 17416.0744, 17508.4092, 17617.0178, 17715.4554, 17816.758,  17920.1748,
-  18012.9236, 18119.7984, 18223.2248, 18324.2482, 18426.6276, 18525.0932, 18629.8976, 18733.2588,
-  18831.0466, 18940.1366, 19032.2696, 19131.729,  19243.4864, 19349.6932, 19442.866,  19547.9448,
-  19653.2798, 19754.4034, 19854.0692, 19965.1224, 20065.1774, 20158.2212, 20253.353,  20366.3264,
-  20463.22};
-CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p13{
-  5908.5052,  6007.2672,  6107.347,   6208.5794,  6311.2622,  6414.5514,  6519.3376,  6625.6952,
-  6732.5988,  6841.3552,  6950.5972,  7061.3082,  7173.5646,  7287.109,   7401.8216,  7516.4344,
-  7633.3802,  7751.2962,  7870.3784,  7990.292,   8110.79,    8233.4574,  8356.6036,  8482.2712,
-  8607.7708,  8735.099,   8863.1858,  8993.4746,  9123.8496,  9255.6794,  9388.5448,  9522.7516,
-  9657.3106,  9792.6094,  9930.5642,  10068.794,  10206.7256, 10347.81,   10490.3196, 10632.0778,
-  10775.9916, 10920.4662, 11066.124,  11213.073,  11358.0362, 11508.1006, 11659.1716, 11808.7514,
-  11959.4884, 12112.1314, 12265.037,  12420.3756, 12578.933,  12734.311,  12890.0006, 13047.2144,
-  13207.3096, 13368.5144, 13528.024,  13689.847,  13852.7528, 14018.3168, 14180.5372, 14346.9668,
-  14513.5074, 14677.867,  14846.2186, 15017.4186, 15184.9716, 15356.339,  15529.2972, 15697.3578,
-  15871.8686, 16042.187,  16216.4094, 16389.4188, 16565.9126, 16742.3272, 16919.0042, 17094.7592,
-  17273.965,  17451.8342, 17634.4254, 17810.5984, 17988.9242, 18171.051,  18354.7938, 18539.466,
-  18721.0408, 18904.9972, 19081.867,  19271.9118, 19451.8694, 19637.9816, 19821.2922, 20013.1292,
-  20199.3858, 20387.8726, 20572.9514, 20770.7764, 20955.1714, 21144.751,  21329.9952, 21520.709,
-  21712.7016, 21906.3868, 22096.2626, 22286.0524, 22475.051,  22665.5098, 22862.8492, 23055.5294,
-  23249.6138, 23437.848,  23636.273,  23826.093,  24020.3296, 24213.3896, 24411.7392, 24602.9614,
-  24805.7952, 24998.1552, 25193.9588, 25389.0166, 25585.8392, 25780.6976, 25981.2728, 26175.977,
-  26376.5252, 26570.1964, 26773.387,  26962.9812, 27163.0586, 27368.164,  27565.0534, 27758.7428,
-  27961.1276, 28163.2324, 28362.3816, 28565.7668, 28758.644,  28956.9768, 29163.4722, 29354.7026,
-  29561.1186, 29767.9948, 29959.9986, 30164.0492, 30366.9818, 30562.5338, 30762.9928, 30976.1592,
-  31166.274,  31376.722,  31570.3734, 31770.809,  31974.8934, 32179.5286, 32387.5442, 32582.3504,
-  32794.076,  32989.9528, 33191.842,  33392.4684, 33595.659,  33801.8672, 34000.3414, 34200.0922,
-  34402.6792, 34610.0638, 34804.0084, 35011.13,   35218.669,  35418.6634, 35619.0792, 35830.6534,
-  36028.4966, 36229.7902, 36438.6422, 36630.7764, 36833.3102, 37048.6728, 37247.3916, 37453.5904,
-  37669.3614, 37854.5526, 38059.305,  38268.0936, 38470.2516, 38674.7064, 38876.167,  39068.3794,
-  39281.9144, 39492.8566, 39684.8628, 39898.4108, 40093.1836, 40297.6858, 40489.7086, 40717.2424};
-CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p14{
-  11817.475,  12015.0046, 12215.3792, 12417.7504, 12623.1814, 12830.0086, 13040.0072, 13252.503,
-  13466.178,  13683.2738, 13902.0344, 14123.9798, 14347.394,  14573.7784, 14802.6894, 15033.6824,
-  15266.9134, 15502.8624, 15741.4944, 15980.7956, 16223.8916, 16468.6316, 16715.733,  16965.5726,
-  17217.204,  17470.666,  17727.8516, 17986.7886, 18247.6902, 18510.9632, 18775.304,  19044.7486,
-  19314.4408, 19587.202,  19862.2576, 20135.924,  20417.0324, 20697.9788, 20979.6112, 21265.0274,
-  21550.723,  21841.6906, 22132.162,  22428.1406, 22722.127,  23020.5606, 23319.7394, 23620.4014,
-  23925.2728, 24226.9224, 24535.581,  24845.505,  25155.9618, 25470.3828, 25785.9702, 26103.7764,
-  26420.4132, 26742.0186, 27062.8852, 27388.415,  27714.6024, 28042.296,  28365.4494, 28701.1526,
-  29031.8008, 29364.2156, 29704.497,  30037.1458, 30380.111,  30723.8168, 31059.5114, 31404.9498,
-  31751.6752, 32095.2686, 32444.7792, 32794.767,  33145.204,  33498.4226, 33847.6502, 34209.006,
-  34560.849,  34919.4838, 35274.9778, 35635.1322, 35996.3266, 36359.1394, 36722.8266, 37082.8516,
-  37447.7354, 37815.9606, 38191.0692, 38559.4106, 38924.8112, 39294.6726, 39663.973,  40042.261,
-  40416.2036, 40779.2036, 41161.6436, 41540.9014, 41921.1998, 42294.7698, 42678.5264, 43061.3464,
-  43432.375,  43818.432,  44198.6598, 44583.0138, 44970.4794, 45353.924,  45729.858,  46118.2224,
-  46511.5724, 46900.7386, 47280.6964, 47668.1472, 48055.6796, 48446.9436, 48838.7146, 49217.7296,
-  49613.7796, 50010.7508, 50410.0208, 50793.7886, 51190.2456, 51583.1882, 51971.0796, 52376.5338,
-  52763.319,  53165.5534, 53556.5594, 53948.2702, 54346.352,  54748.7914, 55138.577,  55543.4824,
-  55941.1748, 56333.7746, 56745.1552, 57142.7944, 57545.2236, 57935.9956, 58348.5268, 58737.5474,
-  59158.5962, 59542.6896, 59958.8004, 60349.3788, 60755.0212, 61147.6144, 61548.194,  61946.0696,
-  62348.6042, 62763.603,  63162.781,  63560.635,  63974.3482, 64366.4908, 64771.5876, 65176.7346,
-  65597.3916, 65995.915,  66394.0384, 66822.9396, 67203.6336, 67612.2032, 68019.0078, 68420.0388,
-  68821.22,   69235.8388, 69640.0724, 70055.155,  70466.357,  70863.4266, 71276.2482, 71677.0306,
-  72080.2006, 72493.0214, 72893.5952, 73314.5856, 73714.9852, 74125.3022, 74521.2122, 74933.6814,
-  75341.5904, 75743.0244, 76166.0278, 76572.1322, 76973.1028, 77381.6284, 77800.6092, 78189.328,
-  78607.0962, 79012.2508, 79407.8358, 79825.725,  80238.701,  80646.891,  81035.6436, 81460.0448,
-  81876.3884};
-CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p15{
-  23635.0036,  24030.8034,  24431.4744,  24837.1524,  25246.7928,  25661.326,   26081.3532,
-  26505.2806,  26933.9892,  27367.7098,  27805.318,   28248.799,   28696.4382,  29148.8244,
-  29605.5138,  30066.8668,  30534.2344,  31006.32,    31480.778,   31962.2418,  32447.3324,
-  32938.0232,  33432.731,   33930.728,   34433.9896,  34944.1402,  35457.5588,  35974.5958,
-  36497.3296,  37021.9096,  37554.326,   38088.0826,  38628.8816,  39171.3192,  39723.2326,
-  40274.5554,  40832.3142,  41390.613,   41959.5908,  42532.5466,  43102.0344,  43683.5072,
-  44266.694,   44851.2822,  45440.7862,  46038.0586,  46640.3164,  47241.064,   47846.155,
-  48454.7396,  49076.9168,  49692.542,   50317.4778,  50939.65,    51572.5596,  52210.2906,
-  52843.7396,  53481.3996,  54127.236,   54770.406,   55422.6598,  56078.7958,  56736.7174,
-  57397.6784,  58064.5784,  58730.308,   59404.9784,  60077.0864,  60751.9158,  61444.1386,
-  62115.817,   62808.7742,  63501.4774,  64187.5454,  64883.6622,  65582.7468,  66274.5318,
-  66976.9276,  67688.7764,  68402.138,   69109.6274,  69822.9706,  70543.6108,  71265.5202,
-  71983.3848,  72708.4656,  73433.384,   74158.4664,  74896.4868,  75620.9564,  76362.1434,
-  77098.3204,  77835.7662,  78582.6114,  79323.9902,  80067.8658,  80814.9246,  81567.0136,
-  82310.8536,  83061.9952,  83821.4096,  84580.8608,  85335.547,   86092.5802,  86851.6506,
-  87612.311,   88381.2016,  89146.3296,  89907.8974,  90676.846,   91451.4152,  92224.5518,
-  92995.8686,  93763.5066,  94551.2796,  95315.1944,  96096.1806,  96881.0918,  97665.679,
-  98442.68,    99229.3002,  100011.0994, 100790.6386, 101580.1564, 102377.7484, 103152.1392,
-  103944.2712, 104730.216,  105528.6336, 106324.9398, 107117.6706, 107890.3988, 108695.2266,
-  109485.238,  110294.7876, 111075.0958, 111878.0496, 112695.2864, 113464.5486, 114270.0474,
-  115068.608,  115884.3626, 116673.2588, 117483.3716, 118275.097,  119085.4092, 119879.2808,
-  120687.5868, 121499.9944, 122284.916,  123095.9254, 123912.5038, 124709.0454, 125503.7182,
-  126323.259,  127138.9412, 127943.8294, 128755.646,  129556.5354, 130375.3298, 131161.4734,
-  131971.1962, 132787.5458, 133588.1056, 134431.351,  135220.2906, 136023.398,  136846.6558,
-  137667.0004, 138463.663,  139283.7154, 140074.6146, 140901.3072, 141721.8548, 142543.2322,
-  143356.1096, 144173.7412, 144973.0948, 145794.3162, 146609.5714, 147420.003,  148237.9784,
-  149050.5696, 149854.761,  150663.1966, 151494.0754, 152313.1416, 153112.6902, 153935.7206,
-  154746.9262, 155559.547,  156401.9746, 157228.7036, 158008.7254, 158820.75,   159646.9184,
-  160470.4458, 161279.5348, 162093.3114, 162918.542,  163729.2842};
-CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p16{
-  47271.0,     48062.3584,  48862.7074,  49673.152,   50492.8416,  51322.9514,  52161.03,
-  53009.407,   53867.6348,  54734.206,   55610.5144,  56496.2096,  57390.795,   58297.268,
-  59210.6448,  60134.665,   61068.0248,  62010.4472,  62962.5204,  63923.5742,  64895.0194,
-  65876.4182,  66862.6136,  67862.6968,  68868.8908,  69882.8544,  70911.271,   71944.0924,
-  72990.0326,  74040.692,   75100.6336,  76174.7826,  77252.5998,  78340.2974,  79438.2572,
-  80545.4976,  81657.2796,  82784.6336,  83915.515,   85059.7362,  86205.9368,  87364.4424,
-  88530.3358,  89707.3744,  90885.9638,  92080.197,   93275.5738,  94479.391,   95695.918,
-  96919.2236,  98148.4602,  99382.3474,  100625.6974, 101878.0284, 103141.6278, 104409.4588,
-  105686.2882, 106967.5402, 108261.6032, 109548.1578, 110852.0728, 112162.231,  113479.0072,
-  114806.2626, 116137.9072, 117469.5048, 118813.5186, 120165.4876, 121516.2556, 122875.766,
-  124250.5444, 125621.2222, 127003.2352, 128387.848,  129775.2644, 131181.7776, 132577.3086,
-  133979.9458, 135394.1132, 136800.9078, 138233.217,  139668.5308, 141085.212,  142535.2122,
-  143969.0684, 145420.2872, 146878.1542, 148332.7572, 149800.3202, 151269.66,   152743.6104,
-  154213.0948, 155690.288,  157169.4246, 158672.1756, 160160.059,  161650.6854, 163145.7772,
-  164645.6726, 166159.1952, 167682.1578, 169177.3328, 170700.0118, 172228.8964, 173732.6664,
-  175265.5556, 176787.799,  178317.111,  179856.6914, 181400.865,  182943.4612, 184486.742,
-  186033.4698, 187583.7886, 189148.1868, 190688.4526, 192250.1926, 193810.9042, 195354.2972,
-  196938.7682, 198493.5898, 200079.2824, 201618.912,  203205.5492, 204765.5798, 206356.1124,
-  207929.3064, 209498.7196, 211086.229,  212675.1324, 214256.7892, 215826.2392, 217412.8474,
-  218995.6724, 220618.6038, 222207.1166, 223781.0364, 225387.4332, 227005.7928, 228590.4336,
-  230217.8738, 231805.1054, 233408.9,    234995.3432, 236601.4956, 238190.7904, 239817.2548,
-  241411.2832, 243002.4066, 244640.1884, 246255.3128, 247849.3508, 249479.9734, 251106.8822,
-  252705.027,  254332.9242, 255935.129,  257526.9014, 259154.772,  260777.625,  262390.253,
-  264004.4906, 265643.59,   267255.4076, 268873.426,  270470.7252, 272106.4804, 273722.4456,
-  275337.794,  276945.7038, 278592.9154, 280204.3726, 281841.1606, 283489.171,  285130.1716,
-  286735.3362, 288364.7164, 289961.1814, 291595.5524, 293285.683,  294899.6668, 296499.3434,
-  298128.0462, 299761.8946, 301394.2424, 302997.6748, 304615.1478, 306269.7724, 307886.114,
-  309543.1028, 311153.2862, 312782.8546, 314421.2008, 316033.2438, 317692.9636, 319305.2648,
-  320948.7406, 322566.3364, 324228.4224, 325847.1542};
-CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p17{
-  94542.0,     96125.811,   97728.019,   99348.558,   100987.9705, 102646.7565, 104324.5125,
-  106021.7435, 107736.7865, 109469.272,  111223.9465, 112995.219,  114787.432,  116593.152,
-  118422.71,   120267.2345, 122134.6765, 124020.937,  125927.2705, 127851.255,  129788.9485,
-  131751.016,  133726.8225, 135722.592,  137736.789,  139770.568,  141821.518,  143891.343,
-  145982.1415, 148095.387,  150207.526,  152355.649,  154515.6415, 156696.05,   158887.7575,
-  161098.159,  163329.852,  165569.053,  167837.4005, 170121.6165, 172420.4595, 174732.6265,
-  177062.77,   179412.502,  181774.035,  184151.939,  186551.6895, 188965.691,  191402.8095,
-  193857.949,  196305.0775, 198774.6715, 201271.2585, 203764.78,   206299.3695, 208818.1365,
-  211373.115,  213946.7465, 216532.076,  219105.541,  221714.5375, 224337.5135, 226977.5125,
-  229613.0655, 232270.2685, 234952.2065, 237645.3555, 240331.1925, 243034.517,  245756.0725,
-  248517.6865, 251232.737,  254011.3955, 256785.995,  259556.44,   262368.335,  265156.911,
-  267965.266,  270785.583,  273616.0495, 276487.4835, 279346.639,  282202.509,  285074.3885,
-  287942.2855, 290856.018,  293774.0345, 296678.5145, 299603.6355, 302552.6575, 305492.9785,
-  308466.8605, 311392.581,  314347.538,  317319.4295, 320285.9785, 323301.7325, 326298.3235,
-  329301.3105, 332301.987,  335309.791,  338370.762,  341382.923,  344431.1265, 347464.1545,
-  350507.28,   353619.2345, 356631.2005, 359685.203,  362776.7845, 365886.488,  368958.2255,
-  372060.6825, 375165.4335, 378237.935,  381328.311,  384430.5225, 387576.425,  390683.242,
-  393839.648,  396977.8425, 400101.9805, 403271.296,  406409.8425, 409529.5485, 412678.7,
-  415847.423,  419020.8035, 422157.081,  425337.749,  428479.6165, 431700.902,  434893.1915,
-  438049.582,  441210.5415, 444379.2545, 447577.356,  450741.931,  453959.548,  457137.0935,
-  460329.846,  463537.4815, 466732.3345, 469960.5615, 473164.681,  476347.6345, 479496.173,
-  482813.1645, 486025.6995, 489249.4885, 492460.1945, 495675.8805, 498908.0075, 502131.802,
-  505374.3855, 508550.9915, 511806.7305, 515026.776,  518217.0005, 521523.9855, 524705.9855,
-  527950.997,  531210.0265, 534472.497,  537750.7315, 540926.922,  544207.094,  547429.4345,
-  550666.3745, 553975.3475, 557150.7185, 560399.6165, 563662.697,  566916.7395, 570146.1215,
-  573447.425,  576689.6245, 579874.5745, 583202.337,  586503.0255, 589715.635,  592910.161,
-  596214.3885, 599488.035,  602740.92,   605983.0685, 609248.67,   612491.3605, 615787.912,
-  619107.5245, 622307.9555, 625577.333,  628840.4385, 632085.2155, 635317.6135, 638691.7195,
-  641887.467,  645139.9405, 648441.546,  651666.252,  654941.845};
-CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p18{
-  189084.0,    192250.913,  195456.774,  198696.946,  201977.762,  205294.444,  208651.754,
-  212042.099,  215472.269,  218941.91,   222443.912,  225996.845,  229568.199,  233193.568,
-  236844.457,  240543.233,  244279.475,  248044.27,   251854.588,  255693.2,    259583.619,
-  263494.621,  267445.385,  271454.061,  275468.769,  279549.456,  283646.446,  287788.198,
-  291966.099,  296181.164,  300431.469,  304718.618,  309024.004,  313393.508,  317760.803,
-  322209.731,  326675.061,  331160.627,  335654.47,   340241.442,  344841.833,  349467.132,
-  354130.629,  358819.432,  363574.626,  368296.587,  373118.482,  377914.93,   382782.301,
-  387680.669,  392601.981,  397544.323,  402529.115,  407546.018,  412593.658,  417638.657,
-  422762.865,  427886.169,  433017.167,  438213.273,  443441.254,  448692.421,  453937.533,
-  459239.049,  464529.569,  469910.083,  475274.03,   480684.473,  486070.26,   491515.237,
-  496995.651,  502476.617,  507973.609,  513497.19,   519083.233,  524726.509,  530305.505,
-  535945.728,  541584.404,  547274.055,  552967.236,  558667.862,  564360.216,  570128.148,
-  575965.08,   581701.952,  587532.523,  593361.144,  599246.128,  605033.418,  610958.779,
-  616837.117,  622772.818,  628672.04,   634675.369,  640574.831,  646585.739,  652574.547,
-  658611.217,  664642.684,  670713.914,  676737.681,  682797.313,  688837.897,  694917.874,
-  701009.882,  707173.648,  713257.254,  719415.392,  725636.761,  731710.697,  737906.209,
-  744103.074,  750313.39,   756504.185,  762712.579,  768876.985,  775167.859,  781359.0,
-  787615.959,  793863.597,  800245.477,  806464.582,  812785.294,  819005.925,  825403.057,
-  831676.197,  837936.284,  844266.968,  850642.711,  856959.756,  863322.774,  869699.931,
-  876102.478,  882355.787,  888694.463,  895159.952,  901536.143,  907872.631,  914293.672,
-  920615.14,   927130.974,  933409.404,  939922.178,  946331.47,   952745.93,   959209.264,
-  965590.224,  972077.284,  978501.961,  984953.19,   991413.271,  997817.479,  1004222.658,
-  1010725.676, 1017177.138, 1023612.529, 1030098.236, 1036493.719, 1043112.207, 1049537.036,
-  1056008.096, 1062476.184, 1068942.337, 1075524.95,  1081932.864, 1088426.025, 1094776.005,
-  1101327.448, 1107901.673, 1114423.639, 1120884.602, 1127324.923, 1133794.24,  1140328.886,
-  1146849.376, 1153346.682, 1159836.502, 1166478.703, 1172953.304, 1179391.502, 1185950.982,
-  1192544.052, 1198913.41,  1205430.994, 1212015.525, 1218674.042, 1225121.683, 1231551.101,
-  1238126.379, 1244673.795, 1251260.649, 1257697.86,  1264320.983, 1270736.319, 1277274.694,
-  1283804.95,  1290211.514, 1296858.568, 1303455.691};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p4{11.0, 11.717, 12.207, 12.7896, 13.2882, 13.8204, 14.3772, 14.9342, 15.5202, 16.161, 16.7722, 17.4636, 18.0396, 18.6766, 19.3566, 20.0454, 20.7936, 21.4856, 22.2666, 22.9946, 23.766, 24.4692, 25.3638, 26.0764, 26.7864, 27.7602, 28.4814, 29.433, 30.2926, 31.0664, 31.9996, 32.7956, 33.5366, 34.5894, 35.5738, 36.2698, 37.3682, 38.0544, 39.2342, 40.0108, 40.7966, 41.9298, 42.8704, 43.6358, 44.5194, 45.773, 46.6772, 47.6174, 48.4888, 49.3304, 50.2506, 51.4996, 52.3824, 53.3078, 54.3984, 55.5838, 56.6618, 57.2174, 58.3514, 59.0802, 60.1482, 61.0376, 62.3598, 62.8078, 63.9744, 64.914, 65.781, 67.1806, 68.0594, 68.8446, 69.7928, 70.8248, 71.8324, 72.8598, 73.6246, 74.7014, 75.393, 76.6708, 77.2394};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p5{23.0, 23.1194, 23.8208, 24.2318, 24.77, 25.2436, 25.7774, 26.2848, 26.8224, 27.3742, 27.9336, 28.503, 29.0494, 29.6292, 30.2124, 30.798, 31.367, 31.9728, 32.5944, 33.217, 33.8438, 34.3696, 35.0956, 35.7044, 36.324, 37.0668, 37.6698, 38.3644, 39.049, 39.6918, 40.4146, 41.082, 41.687, 42.5398, 43.2462, 43.857, 44.6606, 45.4168, 46.1248, 46.9222, 47.6804, 48.447, 49.3454, 49.9594, 50.7636, 51.5776, 52.331, 53.19, 53.9676, 54.7564, 55.5314, 56.4442, 57.3708, 57.9774, 58.9624, 59.8796, 60.755, 61.472, 62.2076, 63.1024, 63.8908, 64.7338, 65.7728, 66.629, 67.413, 68.3266, 69.1524, 70.2642, 71.1806, 72.0566, 72.9192, 73.7598, 74.3516, 75.5802, 76.4386, 77.4916, 78.1524, 79.1892, 79.8414, 80.8798, 81.8376, 82.4698, 83.7656, 84.331, 85.5914, 86.6012, 87.7016, 88.5582, 89.3394, 90.3544, 91.4912, 92.308, 93.3552, 93.9746, 95.2052, 95.727, 97.1322, 98.3944, 98.7588, 100.242, 101.1914, 102.2538, 102.8776, 103.6292, 105.1932, 105.9152, 107.0868, 107.6728, 108.7144, 110.3114, 110.8716, 111.245, 112.7908, 113.7064, 114.636, 115.7464, 116.1788, 117.7464, 118.4896, 119.6166, 120.5082, 121.7798, 122.9028, 123.4426, 124.8854, 125.705, 126.4652, 128.3464, 128.3462, 130.0398, 131.0342, 131.0042, 132.4766, 133.511, 134.7252, 135.425, 136.5172, 138.0572, 138.6694, 139.3712, 140.8598, 141.4594, 142.554, 143.4006, 144.7374, 146.1634, 146.8994, 147.605, 147.9304, 149.1636, 150.2468, 151.5876, 152.2096, 153.7032, 154.7146, 155.807, 156.9228, 157.0372, 158.5852};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p6{46.0, 46.1902, 47.271, 47.8358, 48.8142, 49.2854, 50.317, 51.354, 51.8924, 52.9436, 53.4596, 54.5262, 55.6248, 56.1574, 57.2822, 57.837, 58.9636, 60.074, 60.7042, 61.7976, 62.4772, 63.6564, 64.7942, 65.5004, 66.686, 67.291, 68.5672, 69.8556, 70.4982, 71.8204, 72.4252, 73.7744, 75.0786, 75.8344, 77.0294, 77.8098, 79.0794, 80.5732, 81.1878, 82.5648, 83.2902, 84.6784, 85.3352, 86.8946, 88.3712, 89.0852, 90.499, 91.2686, 92.6844, 94.2234, 94.9732, 96.3356, 97.2286, 98.7262, 100.3284, 101.1048, 102.5962, 103.3562, 105.1272, 106.4184, 107.4974, 109.0822, 109.856, 111.48, 113.2834, 114.0208, 115.637, 116.5174, 118.0576, 119.7476, 120.427, 122.1326, 123.2372, 125.2788, 126.6776, 127.7926, 129.1952, 129.9564, 131.6454, 133.87, 134.5428, 136.2, 137.0294, 138.6278, 139.6782, 141.792, 143.3516, 144.2832, 146.0394, 147.0748, 148.4912, 150.849, 151.696, 153.5404, 154.073, 156.3714, 157.7216, 158.7328, 160.4208, 161.4184, 163.9424, 165.2772, 166.411, 168.1308, 168.769, 170.9258, 172.6828, 173.7502, 175.706, 176.3886, 179.0186, 180.4518, 181.927, 183.4172, 184.4114, 186.033, 188.5124, 189.5564, 191.6008, 192.4172, 193.8044, 194.997, 197.4548, 198.8948, 200.2346, 202.3086, 203.1548, 204.8842, 206.6508, 206.6772, 209.7254, 210.4752, 212.7228, 214.6614, 215.1676, 217.793, 218.0006, 219.9052, 221.66, 223.5588, 225.1636, 225.6882, 227.7126, 229.4502, 231.1978, 232.9756, 233.1654, 236.727, 238.1974, 237.7474, 241.1346, 242.3048, 244.1948, 245.3134, 246.879, 249.1204, 249.853, 252.6792, 253.857, 254.4486, 257.2362, 257.9534, 260.0286, 260.5632, 262.663, 264.723, 265.7566, 267.2566, 267.1624, 270.62, 272.8216, 273.2166, 275.2056, 276.2202, 278.3726, 280.3344, 281.9284, 283.9728, 284.1924, 286.4872, 287.587, 289.807, 291.1206, 292.769, 294.8708, 296.665, 297.1182, 299.4012, 300.6352, 302.1354, 304.1756, 306.1606, 307.3462, 308.5214, 309.4134, 310.8352, 313.9684, 315.837, 316.7796, 318.9858};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p7{92.0, 93.4934, 94.9758, 96.4574, 97.9718, 99.4954, 101.5302, 103.0756, 104.6374, 106.1782, 107.7888, 109.9522, 111.592, 113.2532, 114.9086, 116.5938, 118.9474, 120.6796, 122.4394, 124.2176, 125.9768, 128.4214, 130.2528, 132.0102, 133.8658, 135.7278, 138.3044, 140.1316, 142.093, 144.0032, 145.9092, 148.6306, 150.5294, 152.5756, 154.6508, 156.662, 159.552, 161.3724, 163.617, 165.5754, 167.7872, 169.8444, 172.7988, 174.8606, 177.2118, 179.3566, 181.4476, 184.5882, 186.6816, 189.0824, 191.0258, 193.6048, 196.4436, 198.7274, 200.957, 203.147, 205.4364, 208.7592, 211.3386, 213.781, 215.8028, 218.656, 221.6544, 223.996, 226.4718, 229.1544, 231.6098, 234.5956, 237.0616, 239.5758, 242.4878, 244.5244, 248.2146, 250.724, 252.8722, 255.5198, 258.0414, 261.941, 264.9048, 266.87, 269.4304, 272.028, 274.4708, 278.37, 281.0624, 283.4668, 286.5532, 289.4352, 293.2564, 295.2744, 298.2118, 300.7472, 304.1456, 307.2928, 309.7504, 312.5528, 315.979, 318.2102, 322.1834, 324.3494, 327.325, 330.6614, 332.903, 337.2544, 339.9042, 343.215, 345.2864, 348.0814, 352.6764, 355.301, 357.139, 360.658, 363.1732, 366.5902, 369.9538, 373.0828, 375.922, 378.9902, 382.7328, 386.4538, 388.1136, 391.2234, 394.0878, 396.708, 401.1556, 404.1852, 406.6372, 409.6822, 412.7796, 416.6078, 418.4916, 422.131, 424.5376, 428.1988, 432.211, 434.4502, 438.5282, 440.912, 444.0448, 447.7432, 450.8524, 453.7988, 456.7858, 458.8868, 463.9886, 466.5064, 468.9124, 472.6616, 475.4682, 478.582, 481.304, 485.2738, 488.6894, 490.329, 496.106, 497.6908, 501.1374, 504.5322, 506.8848, 510.3324, 513.4512, 516.179, 520.4412, 522.6066, 526.167, 528.7794, 533.379, 536.067, 538.46, 542.9116, 545.692, 547.9546, 552.493, 555.2722, 557.335, 562.449, 564.2014, 569.0738, 571.0974, 574.8564, 578.2996, 581.409, 583.9704, 585.8098, 589.6528, 594.5998, 595.958, 600.068, 603.3278, 608.2016, 609.9632, 612.864, 615.43, 620.7794, 621.272, 625.8644, 629.206, 633.219, 634.5154, 638.6102};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p8{184.2152, 187.2454, 190.2096, 193.6652, 196.6312, 199.6822, 203.249, 206.3296, 210.0038, 213.2074, 216.4612, 220.27, 223.5178, 227.4412, 230.8032, 234.1634, 238.1688, 241.6074, 245.6946, 249.2664, 252.8228, 257.0432, 260.6824, 264.9464, 268.6268, 272.2626, 276.8376, 280.4034, 284.8956, 288.8522, 292.7638, 297.3552, 301.3556, 305.7526, 309.9292, 313.8954, 318.8198, 322.7668, 327.298, 331.6688, 335.9466, 340.9746, 345.1672, 349.3474, 354.3028, 358.8912, 364.114, 368.4646, 372.9744, 378.4092, 382.6022, 387.843, 392.5684, 397.1652, 402.5426, 407.4152, 412.5388, 417.3592, 422.1366, 427.486, 432.3918, 437.5076, 442.509, 447.3834, 453.3498, 458.0668, 463.7346, 469.1228, 473.4528, 479.7, 484.644, 491.0518, 495.5774, 500.9068, 506.432, 512.1666, 517.434, 522.6644, 527.4894, 533.6312, 538.3804, 544.292, 550.5496, 556.0234, 562.8206, 566.6146, 572.4188, 579.117, 583.6762, 590.6576, 595.7864, 601.509, 607.5334, 612.9204, 619.772, 624.2924, 630.8654, 636.1836, 642.745, 649.1316, 655.0386, 660.0136, 666.6342, 671.6196, 678.1866, 684.4282, 689.3324, 695.4794, 702.5038, 708.129, 713.528, 720.3204, 726.463, 732.7928, 739.123, 744.7418, 751.2192, 756.5102, 762.6066, 769.0184, 775.2224, 781.4014, 787.7618, 794.1436, 798.6506, 805.6378, 811.766, 819.7514, 824.5776, 828.7322, 837.8048, 843.6302, 849.9336, 854.4798, 861.3388, 867.9894, 873.8196, 880.3136, 886.2308, 892.4588, 899.0816, 905.4076, 912.0064, 917.3878, 923.619, 929.998, 937.3482, 943.9506, 947.991, 955.1144, 962.203, 968.8222, 975.7324, 981.7826, 988.7666, 994.2648, 1000.3128, 1007.4082, 1013.7536, 1020.3376, 1026.7156, 1031.7478, 1037.4292, 1045.393, 1051.2278, 1058.3434, 1062.8726, 1071.884, 1076.806, 1082.9176, 1089.1678, 1095.5032, 1102.525, 1107.2264, 1115.315, 1120.93, 1127.252, 1134.1496, 1139.0408, 1147.5448, 1153.3296, 1158.1974, 1166.5262, 1174.3328, 1175.657, 1184.4222, 1190.9172, 1197.1292, 1204.4606, 1210.4578, 1218.8728, 1225.3336, 1226.6592, 1236.5768, 1241.363, 
1249.4074, 1254.6566, 1260.8014, 1266.5454, 1274.5192};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p9{369.0, 374.8294, 381.2452, 387.6698, 394.1464, 400.2024, 406.8782, 413.6598, 420.462, 427.2826, 433.7102, 440.7416, 447.9366, 455.1046, 462.285, 469.0668, 476.306, 483.8448, 491.301, 498.9886, 506.2422, 513.8138, 521.7074, 529.7428, 537.8402, 545.1664, 553.3534, 561.594, 569.6886, 577.7876, 585.65, 594.228, 602.8036, 611.1666, 620.0818, 628.0824, 637.2574, 646.302, 655.1644, 664.0056, 672.3802, 681.7192, 690.5234, 700.2084, 708.831, 718.485, 728.1112, 737.4764, 746.76, 756.3368, 766.5538, 775.5058, 785.2646, 795.5902, 804.3818, 814.8998, 824.9532, 835.2062, 845.2798, 854.4728, 864.9582, 875.3292, 886.171, 896.781, 906.5716, 916.7048, 927.5322, 937.875, 949.3972, 958.3464, 969.7274, 980.2834, 992.1444, 1003.4264, 1013.0166, 1024.018, 1035.0438, 1046.34, 1057.6856, 1068.9836, 1079.0312, 1091.677, 1102.3188, 1113.4846, 1124.4424, 1135.739, 1147.1488, 1158.9202, 1169.406, 1181.5342, 1193.2834, 1203.8954, 1216.3286, 1226.2146, 1239.6684, 1251.9946, 1262.123, 1275.4338, 1285.7378, 1296.076, 1308.9692, 1320.4964, 1333.0998, 1343.9864, 1357.7754, 1368.3208, 1380.4838, 1392.7388, 1406.0758, 1416.9098, 1428.9728, 1440.9228, 1453.9292, 1462.617, 1476.05, 1490.2996, 1500.6128, 1513.7392, 1524.5174, 1536.6322, 1548.2584, 1562.3766, 1572.423, 1587.1232, 1596.5164, 1610.5938, 1622.5972, 1633.1222, 1647.7674, 1658.5044, 1671.57, 1683.7044, 1695.4142, 1708.7102, 1720.6094, 1732.6522, 1747.841, 1756.4072, 1769.9786, 1782.3276, 1797.5216, 1808.3186, 1819.0694, 1834.354, 1844.575, 1856.2808, 1871.1288, 1880.7852, 1893.9622, 1906.3418, 1920.6548, 1932.9302, 1945.8584, 1955.473, 1968.8248, 1980.6446, 1995.9598, 2008.349, 2019.8556, 2033.0334, 2044.0206, 2059.3956, 2069.9174, 2082.6084, 2093.7036, 2106.6108, 2118.9124, 2132.301, 2144.7628, 2159.8422, 2171.0212, 2183.101, 2193.5112, 2208.052, 2221.3194, 2233.3282, 2247.295, 2257.7222, 2273.342, 2286.5638, 2299.6786, 2310.8114, 2322.3312, 2335.516, 2349.874, 2363.5968, 2373.865, 2387.1918, 
2401.8328, 2414.8496, 2424.544, 2436.7592, 2447.1682, 2464.1958, 2474.3438, 2489.0006, 2497.4526, 2513.6586, 2527.19, 2540.7028, 2553.768};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p10{738.1256, 750.4234, 763.1064, 775.4732, 788.4636, 801.0644, 814.488, 827.9654, 841.0832, 854.7864, 868.1992, 882.2176, 896.5228, 910.1716, 924.7752, 938.899, 953.6126, 968.6492, 982.9474, 998.5214, 1013.1064, 1028.6364, 1044.2468, 1059.4588, 1075.3832, 1091.0584, 1106.8606, 1123.3868, 1139.5062, 1156.1862, 1172.463, 1189.339, 1206.1936, 1223.1292, 1240.1854, 1257.2908, 1275.3324, 1292.8518, 1310.5204, 1328.4854, 1345.9318, 1364.552, 1381.4658, 1400.4256, 1419.849, 1438.152, 1456.8956, 1474.8792, 1494.118, 1513.62, 1532.5132, 1551.9322, 1570.7726, 1590.6086, 1610.5332, 1630.5918, 1650.4294, 1669.7662, 1690.4106, 1710.7338, 1730.9012, 1750.4486, 1770.1556, 1791.6338, 1812.7312, 1833.6264, 1853.9526, 1874.8742, 1896.8326, 1918.1966, 1939.5594, 1961.07, 1983.037, 2003.1804, 2026.071, 2047.4884, 2070.0848, 2091.2944, 2114.333, 2135.9626, 2158.2902, 2181.0814, 2202.0334, 2224.4832, 2246.39, 2269.7202, 2292.1714, 2314.2358, 2338.9346, 2360.891, 2384.0264, 2408.3834, 2430.1544, 2454.8684, 2476.9896, 2501.4368, 2522.8702, 2548.0408, 2570.6738, 2593.5208, 2617.0158, 2640.2302, 2664.0962, 2687.4986, 2714.2588, 2735.3914, 2759.6244, 2781.8378, 2808.0072, 2830.6516, 2856.2454, 2877.2136, 2903.4546, 2926.785, 2951.2294, 2976.468, 3000.867, 3023.6508, 3049.91, 3073.5984, 3098.162, 3121.5564, 3146.2328, 3170.9484, 3195.5902, 3221.3346, 3242.7032, 3271.6112, 3296.5546, 3317.7376, 3345.072, 3369.9518, 3394.326, 3418.1818, 3444.6926, 3469.086, 3494.2754, 3517.8698, 3544.248, 3565.3768, 3588.7234, 3616.979, 3643.7504, 3668.6812, 3695.72, 3719.7392, 3742.6224, 3770.4456, 3795.6602, 3819.9058, 3844.002, 3869.517, 3895.6824, 3920.8622, 3947.1364, 3973.985, 3995.4772, 4021.62, 4046.628, 4074.65, 4096.2256, 4121.831, 4146.6406, 4173.276, 4195.0744, 4223.9696, 4251.3708, 4272.9966, 4300.8046, 4326.302, 4353.1248, 4374.312, 4403.0322, 4426.819, 4450.0598, 4478.5206, 4504.8116, 4528.8928, 4553.9584, 4578.8712, 4603.8384, 4632.3872, 4655.5128, 
4675.821, 4704.6222, 4731.9862, 4755.4174, 4781.2628, 4804.332, 4832.3048, 4862.8752, 4883.4148, 4906.9544, 4935.3516, 4954.3532, 4984.0248, 5011.217, 5035.3258, 5057.3672, 5084.1828};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p11{1477.0, 1501.6014, 1526.5802, 1551.7942, 1577.3042, 1603.2062, 1629.8402, 1656.2292, 1682.9462, 1709.9926, 1737.3026, 1765.4252, 1793.0578, 1821.6092, 1849.626, 1878.5568, 1908.527, 1937.5154, 1967.1874, 1997.3878, 2027.37, 2058.1972, 2089.5728, 2120.1012, 2151.9668, 2183.292, 2216.0772, 2247.8578, 2280.6562, 2313.041, 2345.714, 2380.3112, 2414.1806, 2447.9854, 2481.656, 2516.346, 2551.5154, 2586.8378, 2621.7448, 2656.6722, 2693.5722, 2729.1462, 2765.4124, 2802.8728, 2838.898, 2876.408, 2913.4926, 2951.4938, 2989.6776, 3026.282, 3065.7704, 3104.1012, 3143.7388, 3181.6876, 3221.1872, 3261.5048, 3300.0214, 3339.806, 3381.409, 3421.4144, 3461.4294, 3502.2286, 3544.651, 3586.6156, 3627.337, 3670.083, 3711.1538, 3753.5094, 3797.01, 3838.6686, 3882.1678, 3922.8116, 3967.9978, 4009.9204, 4054.3286, 4097.5706, 4140.6014, 4185.544, 4229.5976, 4274.583, 4316.9438, 4361.672, 4406.2786, 4451.8628, 4496.1834, 4543.505, 4589.1816, 4632.5188, 4678.2294, 4724.8908, 4769.0194, 4817.052, 4861.4588, 4910.1596, 4956.4344, 5002.5238, 5048.13, 5093.6374, 5142.8162, 5187.7894, 5237.3984, 5285.6078, 5331.0858, 5379.1036, 5428.6258, 5474.6018, 5522.7618, 5571.5822, 5618.59, 5667.9992, 5714.88, 5763.454, 5808.6982, 5860.3644, 5910.2914, 5953.571, 6005.9232, 6055.1914, 6104.5882, 6154.5702, 6199.7036, 6251.1764, 6298.7596, 6350.0302, 6398.061, 6448.4694, 6495.933, 6548.0474, 6597.7166, 6646.9416, 6695.9208, 6742.6328, 6793.5276, 6842.1934, 6894.2372, 6945.3864, 6996.9228, 7044.2372, 7094.1374, 7142.2272, 7192.2942, 7238.8338, 7288.9006, 7344.0908, 7394.8544, 7443.5176, 7490.4148, 7542.9314, 7595.6738, 7641.9878, 7694.3688, 7743.0448, 7797.522, 7845.53, 7899.594, 7950.3132, 7996.455, 8050.9442, 8092.9114, 8153.1374, 8197.4472, 8252.8278, 8301.8728, 8348.6776, 8401.4698, 8453.551, 8504.6598, 8553.8944, 8604.1276, 8657.6514, 8710.3062, 8758.908, 8807.8706, 8862.1702, 8910.4668, 8960.77, 9007.2766, 9063.164, 9121.0534, 9164.1354, 9218.1594, 
9267.767, 9319.0594, 9372.155, 9419.7126, 9474.3722, 9520.1338, 9572.368, 9622.7702, 9675.8448, 9726.5396, 9778.7378, 9827.6554, 9878.1922, 9928.7782, 9978.3984, 10026.578, 10076.5626, 10137.1618, 10177.5244, 10229.9176};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p12{2954.0, 3003.4782, 3053.3568, 3104.3666, 3155.324, 3206.9598, 3259.648, 3312.539, 3366.1474, 3420.2576, 3474.8376, 3530.6076, 3586.451, 3643.38, 3700.4104, 3757.5638, 3815.9676, 3875.193, 3934.838, 3994.8548, 4055.018, 4117.1742, 4178.4482, 4241.1294, 4304.4776, 4367.4044, 4431.8724, 4496.3732, 4561.4304, 4627.5326, 4693.949, 4761.5532, 4828.7256, 4897.6182, 4965.5186, 5034.4528, 5104.865, 5174.7164, 5244.6828, 5316.6708, 5387.8312, 5459.9036, 5532.476, 5604.8652, 5679.6718, 5753.757, 5830.2072, 5905.2828, 5980.0434, 6056.6264, 6134.3192, 6211.5746, 6290.0816, 6367.1176, 6447.9796, 6526.5576, 6606.1858, 6686.9144, 6766.1142, 6847.0818, 6927.9664, 7010.9096, 7091.0816, 7175.3962, 7260.3454, 7344.018, 7426.4214, 7511.3106, 7596.0686, 7679.8094, 7765.818, 7852.4248, 7936.834, 8022.363, 8109.5066, 8200.4554, 8288.5832, 8373.366, 8463.4808, 8549.7682, 8642.0522, 8728.3288, 8820.9528, 8907.727, 9001.0794, 9091.2522, 9179.988, 9269.852, 9362.6394, 9453.642, 9546.9024, 9640.6616, 9732.6622, 9824.3254, 9917.7484, 10007.9392, 10106.7508, 10196.2152, 10289.8114, 10383.5494, 10482.3064, 10576.8734, 10668.7872, 10764.7156, 10862.0196, 10952.793, 11049.9748, 11146.0702, 11241.4492, 11339.2772, 11434.2336, 11530.741, 11627.6136, 11726.311, 11821.5964, 11918.837, 12015.3724, 12113.0162, 12213.0424, 12306.9804, 12408.4518, 12504.8968, 12604.586, 12700.9332, 12798.705, 12898.5142, 12997.0488, 13094.788, 13198.475, 13292.7764, 13392.9698, 13486.8574, 13590.1616, 13686.5838, 13783.6264, 13887.2638, 13992.0978, 14081.0844, 14189.9956, 14280.0912, 14382.4956, 14486.4384, 14588.1082, 14686.2392, 14782.276, 14888.0284, 14985.1864, 15088.8596, 15187.0998, 15285.027, 15383.6694, 15495.8266, 15591.3736, 15694.2008, 15790.3246, 15898.4116, 15997.4522, 16095.5014, 16198.8514, 16291.7492, 16402.6424, 16499.1266, 16606.2436, 16697.7186, 16796.3946, 16902.3376, 17005.7672, 17100.814, 17206.8282, 17305.8262, 17416.0744, 17508.4092, 17617.0178, 
17715.4554, 17816.758, 17920.1748, 18012.9236, 18119.7984, 18223.2248, 18324.2482, 18426.6276, 18525.0932, 18629.8976, 18733.2588, 18831.0466, 18940.1366, 19032.2696, 19131.729, 19243.4864, 19349.6932, 19442.866, 19547.9448, 19653.2798, 19754.4034, 19854.0692, 19965.1224, 20065.1774, 20158.2212, 20253.353, 20366.3264, 20463.22};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p13{5908.5052, 6007.2672, 6107.347, 6208.5794, 6311.2622, 6414.5514, 6519.3376, 6625.6952, 6732.5988, 6841.3552, 6950.5972, 7061.3082, 7173.5646, 7287.109, 7401.8216, 7516.4344, 7633.3802, 7751.2962, 7870.3784, 7990.292, 8110.79, 8233.4574, 8356.6036, 8482.2712, 8607.7708, 8735.099, 8863.1858, 8993.4746, 9123.8496, 9255.6794, 9388.5448, 9522.7516, 9657.3106, 9792.6094, 9930.5642, 10068.794, 10206.7256, 10347.81, 10490.3196, 10632.0778, 10775.9916, 10920.4662, 11066.124, 11213.073, 11358.0362, 11508.1006, 11659.1716, 11808.7514, 11959.4884, 12112.1314, 12265.037, 12420.3756, 12578.933, 12734.311, 12890.0006, 13047.2144, 13207.3096, 13368.5144, 13528.024, 13689.847, 13852.7528, 14018.3168, 14180.5372, 14346.9668, 14513.5074, 14677.867, 14846.2186, 15017.4186, 15184.9716, 15356.339, 15529.2972, 15697.3578, 15871.8686, 16042.187, 16216.4094, 16389.4188, 16565.9126, 16742.3272, 16919.0042, 17094.7592, 17273.965, 17451.8342, 17634.4254, 17810.5984, 17988.9242, 18171.051, 18354.7938, 18539.466, 18721.0408, 18904.9972, 19081.867, 19271.9118, 19451.8694, 19637.9816, 19821.2922, 20013.1292, 20199.3858, 20387.8726, 20572.9514, 20770.7764, 20955.1714, 21144.751, 21329.9952, 21520.709, 21712.7016, 21906.3868, 22096.2626, 22286.0524, 22475.051, 22665.5098, 22862.8492, 23055.5294, 23249.6138, 23437.848, 23636.273, 23826.093, 24020.3296, 24213.3896, 24411.7392, 24602.9614, 24805.7952, 24998.1552, 25193.9588, 25389.0166, 25585.8392, 25780.6976, 25981.2728, 26175.977, 26376.5252, 26570.1964, 26773.387, 26962.9812, 27163.0586, 27368.164, 27565.0534, 27758.7428, 27961.1276, 28163.2324, 28362.3816, 28565.7668, 28758.644, 28956.9768, 29163.4722, 29354.7026, 29561.1186, 29767.9948, 29959.9986, 30164.0492, 30366.9818, 30562.5338, 30762.9928, 30976.1592, 31166.274, 31376.722, 31570.3734, 31770.809, 31974.8934, 32179.5286, 32387.5442, 32582.3504, 32794.076, 32989.9528, 33191.842, 33392.4684, 33595.659, 33801.8672, 34000.3414, 34200.0922, 
34402.6792, 34610.0638, 34804.0084, 35011.13, 35218.669, 35418.6634, 35619.0792, 35830.6534, 36028.4966, 36229.7902, 36438.6422, 36630.7764, 36833.3102, 37048.6728, 37247.3916, 37453.5904, 37669.3614, 37854.5526, 38059.305, 38268.0936, 38470.2516, 38674.7064, 38876.167, 39068.3794, 39281.9144, 39492.8566, 39684.8628, 39898.4108, 40093.1836, 40297.6858, 40489.7086, 40717.2424};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p14{11817.475, 12015.0046, 12215.3792, 12417.7504, 12623.1814, 12830.0086, 13040.0072, 13252.503, 13466.178, 13683.2738, 13902.0344, 14123.9798, 14347.394, 14573.7784, 14802.6894, 15033.6824, 15266.9134, 15502.8624, 15741.4944, 15980.7956, 16223.8916, 16468.6316, 16715.733, 16965.5726, 17217.204, 17470.666, 17727.8516, 17986.7886, 18247.6902, 18510.9632, 18775.304, 19044.7486, 19314.4408, 19587.202, 19862.2576, 20135.924, 20417.0324, 20697.9788, 20979.6112, 21265.0274, 21550.723, 21841.6906, 22132.162, 22428.1406, 22722.127, 23020.5606, 23319.7394, 23620.4014, 23925.2728, 24226.9224, 24535.581, 24845.505, 25155.9618, 25470.3828, 25785.9702, 26103.7764, 26420.4132, 26742.0186, 27062.8852, 27388.415, 27714.6024, 28042.296, 28365.4494, 28701.1526, 29031.8008, 29364.2156, 29704.497, 30037.1458, 30380.111, 30723.8168, 31059.5114, 31404.9498, 31751.6752, 32095.2686, 32444.7792, 32794.767, 33145.204, 33498.4226, 33847.6502, 34209.006, 34560.849, 34919.4838, 35274.9778, 35635.1322, 35996.3266, 36359.1394, 36722.8266, 37082.8516, 37447.7354, 37815.9606, 38191.0692, 38559.4106, 38924.8112, 39294.6726, 39663.973, 40042.261, 40416.2036, 40779.2036, 41161.6436, 41540.9014, 41921.1998, 42294.7698, 42678.5264, 43061.3464, 43432.375, 43818.432, 44198.6598, 44583.0138, 44970.4794, 45353.924, 45729.858, 46118.2224, 46511.5724, 46900.7386, 47280.6964, 47668.1472, 48055.6796, 48446.9436, 48838.7146, 49217.7296, 49613.7796, 50010.7508, 50410.0208, 50793.7886, 51190.2456, 51583.1882, 51971.0796, 52376.5338, 52763.319, 53165.5534, 53556.5594, 53948.2702, 54346.352, 54748.7914, 55138.577, 55543.4824, 55941.1748, 56333.7746, 56745.1552, 57142.7944, 57545.2236, 57935.9956, 58348.5268, 58737.5474, 59158.5962, 59542.6896, 59958.8004, 60349.3788, 60755.0212, 61147.6144, 61548.194, 61946.0696, 62348.6042, 62763.603, 63162.781, 63560.635, 63974.3482, 64366.4908, 64771.5876, 65176.7346, 65597.3916, 65995.915, 66394.0384, 66822.9396, 67203.6336, 
67612.2032, 68019.0078, 68420.0388, 68821.22, 69235.8388, 69640.0724, 70055.155, 70466.357, 70863.4266, 71276.2482, 71677.0306, 72080.2006, 72493.0214, 72893.5952, 73314.5856, 73714.9852, 74125.3022, 74521.2122, 74933.6814, 75341.5904, 75743.0244, 76166.0278, 76572.1322, 76973.1028, 77381.6284, 77800.6092, 78189.328, 78607.0962, 79012.2508, 79407.8358, 79825.725, 80238.701, 80646.891, 81035.6436, 81460.0448, 81876.3884};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p15{23635.0036, 24030.8034, 24431.4744, 24837.1524, 25246.7928, 25661.326, 26081.3532, 26505.2806, 26933.9892, 27367.7098, 27805.318, 28248.799, 28696.4382, 29148.8244, 29605.5138, 30066.8668, 30534.2344, 31006.32, 31480.778, 31962.2418, 32447.3324, 32938.0232, 33432.731, 33930.728, 34433.9896, 34944.1402, 35457.5588, 35974.5958, 36497.3296, 37021.9096, 37554.326, 38088.0826, 38628.8816, 39171.3192, 39723.2326, 40274.5554, 40832.3142, 41390.613, 41959.5908, 42532.5466, 43102.0344, 43683.5072, 44266.694, 44851.2822, 45440.7862, 46038.0586, 46640.3164, 47241.064, 47846.155, 48454.7396, 49076.9168, 49692.542, 50317.4778, 50939.65, 51572.5596, 52210.2906, 52843.7396, 53481.3996, 54127.236, 54770.406, 55422.6598, 56078.7958, 56736.7174, 57397.6784, 58064.5784, 58730.308, 59404.9784, 60077.0864, 60751.9158, 61444.1386, 62115.817, 62808.7742, 63501.4774, 64187.5454, 64883.6622, 65582.7468, 66274.5318, 66976.9276, 67688.7764, 68402.138, 69109.6274, 69822.9706, 70543.6108, 71265.5202, 71983.3848, 72708.4656, 73433.384, 74158.4664, 74896.4868, 75620.9564, 76362.1434, 77098.3204, 77835.7662, 78582.6114, 79323.9902, 80067.8658, 80814.9246, 81567.0136, 82310.8536, 83061.9952, 83821.4096, 84580.8608, 85335.547, 86092.5802, 86851.6506, 87612.311, 88381.2016, 89146.3296, 89907.8974, 90676.846, 91451.4152, 92224.5518, 92995.8686, 93763.5066, 94551.2796, 95315.1944, 96096.1806, 96881.0918, 97665.679, 98442.68, 99229.3002, 100011.0994, 100790.6386, 101580.1564, 102377.7484, 103152.1392, 103944.2712, 104730.216, 105528.6336, 106324.9398, 107117.6706, 107890.3988, 108695.2266, 109485.238, 110294.7876, 111075.0958, 111878.0496, 112695.2864, 113464.5486, 114270.0474, 115068.608, 115884.3626, 116673.2588, 117483.3716, 118275.097, 119085.4092, 119879.2808, 120687.5868, 121499.9944, 122284.916, 123095.9254, 123912.5038, 124709.0454, 125503.7182, 126323.259, 127138.9412, 127943.8294, 128755.646, 129556.5354, 130375.3298, 131161.4734, 131971.1962, 
132787.5458, 133588.1056, 134431.351, 135220.2906, 136023.398, 136846.6558, 137667.0004, 138463.663, 139283.7154, 140074.6146, 140901.3072, 141721.8548, 142543.2322, 143356.1096, 144173.7412, 144973.0948, 145794.3162, 146609.5714, 147420.003, 148237.9784, 149050.5696, 149854.761, 150663.1966, 151494.0754, 152313.1416, 153112.6902, 153935.7206, 154746.9262, 155559.547, 156401.9746, 157228.7036, 158008.7254, 158820.75, 159646.9184, 160470.4458, 161279.5348, 162093.3114, 162918.542, 163729.2842};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p16{47271.0, 48062.3584, 48862.7074, 49673.152, 50492.8416, 51322.9514, 52161.03, 53009.407, 53867.6348, 54734.206, 55610.5144, 56496.2096, 57390.795, 58297.268, 59210.6448, 60134.665, 61068.0248, 62010.4472, 62962.5204, 63923.5742, 64895.0194, 65876.4182, 66862.6136, 67862.6968, 68868.8908, 69882.8544, 70911.271, 71944.0924, 72990.0326, 74040.692, 75100.6336, 76174.7826, 77252.5998, 78340.2974, 79438.2572, 80545.4976, 81657.2796, 82784.6336, 83915.515, 85059.7362, 86205.9368, 87364.4424, 88530.3358, 89707.3744, 90885.9638, 92080.197, 93275.5738, 94479.391, 95695.918, 96919.2236, 98148.4602, 99382.3474, 100625.6974, 101878.0284, 103141.6278, 104409.4588, 105686.2882, 106967.5402, 108261.6032, 109548.1578, 110852.0728, 112162.231, 113479.0072, 114806.2626, 116137.9072, 117469.5048, 118813.5186, 120165.4876, 121516.2556, 122875.766, 124250.5444, 125621.2222, 127003.2352, 128387.848, 129775.2644, 131181.7776, 132577.3086, 133979.9458, 135394.1132, 136800.9078, 138233.217, 139668.5308, 141085.212, 142535.2122, 143969.0684, 145420.2872, 146878.1542, 148332.7572, 149800.3202, 151269.66, 152743.6104, 154213.0948, 155690.288, 157169.4246, 158672.1756, 160160.059, 161650.6854, 163145.7772, 164645.6726, 166159.1952, 167682.1578, 169177.3328, 170700.0118, 172228.8964, 173732.6664, 175265.5556, 176787.799, 178317.111, 179856.6914, 181400.865, 182943.4612, 184486.742, 186033.4698, 187583.7886, 189148.1868, 190688.4526, 192250.1926, 193810.9042, 195354.2972, 196938.7682, 198493.5898, 200079.2824, 201618.912, 203205.5492, 204765.5798, 206356.1124, 207929.3064, 209498.7196, 211086.229, 212675.1324, 214256.7892, 215826.2392, 217412.8474, 218995.6724, 220618.6038, 222207.1166, 223781.0364, 225387.4332, 227005.7928, 228590.4336, 230217.8738, 231805.1054, 233408.9, 234995.3432, 236601.4956, 238190.7904, 239817.2548, 241411.2832, 243002.4066, 244640.1884, 246255.3128, 247849.3508, 249479.9734, 251106.8822, 252705.027, 254332.9242, 255935.129, 
257526.9014, 259154.772, 260777.625, 262390.253, 264004.4906, 265643.59, 267255.4076, 268873.426, 270470.7252, 272106.4804, 273722.4456, 275337.794, 276945.7038, 278592.9154, 280204.3726, 281841.1606, 283489.171, 285130.1716, 286735.3362, 288364.7164, 289961.1814, 291595.5524, 293285.683, 294899.6668, 296499.3434, 298128.0462, 299761.8946, 301394.2424, 302997.6748, 304615.1478, 306269.7724, 307886.114, 309543.1028, 311153.2862, 312782.8546, 314421.2008, 316033.2438, 317692.9636, 319305.2648, 320948.7406, 322566.3364, 324228.4224, 325847.1542};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p17{94542.0, 96125.811, 97728.019, 99348.558, 100987.9705, 102646.7565, 104324.5125, 106021.7435, 107736.7865, 109469.272, 111223.9465, 112995.219, 114787.432, 116593.152, 118422.71, 120267.2345, 122134.6765, 124020.937, 125927.2705, 127851.255, 129788.9485, 131751.016, 133726.8225, 135722.592, 137736.789, 139770.568, 141821.518, 143891.343, 145982.1415, 148095.387, 150207.526, 152355.649, 154515.6415, 156696.05, 158887.7575, 161098.159, 163329.852, 165569.053, 167837.4005, 170121.6165, 172420.4595, 174732.6265, 177062.77, 179412.502, 181774.035, 184151.939, 186551.6895, 188965.691, 191402.8095, 193857.949, 196305.0775, 198774.6715, 201271.2585, 203764.78, 206299.3695, 208818.1365, 211373.115, 213946.7465, 216532.076, 219105.541, 221714.5375, 224337.5135, 226977.5125, 229613.0655, 232270.2685, 234952.2065, 237645.3555, 240331.1925, 243034.517, 245756.0725, 248517.6865, 251232.737, 254011.3955, 256785.995, 259556.44, 262368.335, 265156.911, 267965.266, 270785.583, 273616.0495, 276487.4835, 279346.639, 282202.509, 285074.3885, 287942.2855, 290856.018, 293774.0345, 296678.5145, 299603.6355, 302552.6575, 305492.9785, 308466.8605, 311392.581, 314347.538, 317319.4295, 320285.9785, 323301.7325, 326298.3235, 329301.3105, 332301.987, 335309.791, 338370.762, 341382.923, 344431.1265, 347464.1545, 350507.28, 353619.2345, 356631.2005, 359685.203, 362776.7845, 365886.488, 368958.2255, 372060.6825, 375165.4335, 378237.935, 381328.311, 384430.5225, 387576.425, 390683.242, 393839.648, 396977.8425, 400101.9805, 403271.296, 406409.8425, 409529.5485, 412678.7, 415847.423, 419020.8035, 422157.081, 425337.749, 428479.6165, 431700.902, 434893.1915, 438049.582, 441210.5415, 444379.2545, 447577.356, 450741.931, 453959.548, 457137.0935, 460329.846, 463537.4815, 466732.3345, 469960.5615, 473164.681, 476347.6345, 479496.173, 482813.1645, 486025.6995, 489249.4885, 492460.1945, 495675.8805, 498908.0075, 502131.802, 505374.3855, 508550.9915, 
511806.7305, 515026.776, 518217.0005, 521523.9855, 524705.9855, 527950.997, 531210.0265, 534472.497, 537750.7315, 540926.922, 544207.094, 547429.4345, 550666.3745, 553975.3475, 557150.7185, 560399.6165, 563662.697, 566916.7395, 570146.1215, 573447.425, 576689.6245, 579874.5745, 583202.337, 586503.0255, 589715.635, 592910.161, 596214.3885, 599488.035, 602740.92, 605983.0685, 609248.67, 612491.3605, 615787.912, 619107.5245, 622307.9555, 625577.333, 628840.4385, 632085.2155, 635317.6135, 638691.7195, 641887.467, 645139.9405, 648441.546, 651666.252, 654941.845};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p18{189084.0, 192250.913, 195456.774, 198696.946, 201977.762, 205294.444, 208651.754, 212042.099, 215472.269, 218941.91, 222443.912, 225996.845, 229568.199, 233193.568, 236844.457, 240543.233, 244279.475, 248044.27, 251854.588, 255693.2, 259583.619, 263494.621, 267445.385, 271454.061, 275468.769, 279549.456, 283646.446, 287788.198, 291966.099, 296181.164, 300431.469, 304718.618, 309024.004, 313393.508, 317760.803, 322209.731, 326675.061, 331160.627, 335654.47, 340241.442, 344841.833, 349467.132, 354130.629, 358819.432, 363574.626, 368296.587, 373118.482, 377914.93, 382782.301, 387680.669, 392601.981, 397544.323, 402529.115, 407546.018, 412593.658, 417638.657, 422762.865, 427886.169, 433017.167, 438213.273, 443441.254, 448692.421, 453937.533, 459239.049, 464529.569, 469910.083, 475274.03, 480684.473, 486070.26, 491515.237, 496995.651, 502476.617, 507973.609, 513497.19, 519083.233, 524726.509, 530305.505, 535945.728, 541584.404, 547274.055, 552967.236, 558667.862, 564360.216, 570128.148, 575965.08, 581701.952, 587532.523, 593361.144, 599246.128, 605033.418, 610958.779, 616837.117, 622772.818, 628672.04, 634675.369, 640574.831, 646585.739, 652574.547, 658611.217, 664642.684, 670713.914, 676737.681, 682797.313, 688837.897, 694917.874, 701009.882, 707173.648, 713257.254, 719415.392, 725636.761, 731710.697, 737906.209, 744103.074, 750313.39, 756504.185, 762712.579, 768876.985, 775167.859, 781359.0, 787615.959, 793863.597, 800245.477, 806464.582, 812785.294, 819005.925, 825403.057, 831676.197, 837936.284, 844266.968, 850642.711, 856959.756, 863322.774, 869699.931, 876102.478, 882355.787, 888694.463, 895159.952, 901536.143, 907872.631, 914293.672, 920615.14, 927130.974, 933409.404, 939922.178, 946331.47, 952745.93, 959209.264, 965590.224, 972077.284, 978501.961, 984953.19, 991413.271, 997817.479, 1004222.658, 1010725.676, 1017177.138, 1023612.529, 1030098.236, 1036493.719, 1043112.207, 1049537.036, 1056008.096, 1062476.184, 
1068942.337, 1075524.95, 1081932.864, 1088426.025, 1094776.005, 1101327.448, 1107901.673, 1114423.639, 1120884.602, 1127324.923, 1133794.24, 1140328.886, 1146849.376, 1153346.682, 1159836.502, 1166478.703, 1172953.304, 1179391.502, 1185950.982, 1192544.052, 1198913.41, 1205430.994, 1212015.525, 1218674.042, 1225121.683, 1231551.101, 1238126.379, 1244673.795, 1251260.649, 1257697.86, 1264320.983, 1270736.319, 1277274.694, 1283804.95, 1290211.514, 1296858.568, 1303455.691};
 
 // Meta array storing interpolation points for estimates for Precision=4..18
-__device__ static cuda::std::array constexpr raw_estimate_data{raw_estimate_data_p4.data(),
-                                                               raw_estimate_data_p5.data(),
-                                                               raw_estimate_data_p6.data(),
-                                                               raw_estimate_data_p7.data(),
-                                                               raw_estimate_data_p8.data(),
-                                                               raw_estimate_data_p9.data(),
-                                                               raw_estimate_data_p10.data(),
-                                                               raw_estimate_data_p11.data(),
-                                                               raw_estimate_data_p12.data(),
-                                                               raw_estimate_data_p13.data(),
-                                                               raw_estimate_data_p14.data(),
-                                                               raw_estimate_data_p15.data(),
-                                                               raw_estimate_data_p16.data(),
-                                                               raw_estimate_data_p17.data(),
-                                                               raw_estimate_data_p18.data()};
+CUCO_HLL_TUNING_ARR_DECL raw_estimate_data{raw_estimate_data_p4.data(), raw_estimate_data_p5.data(), raw_estimate_data_p6.data(), raw_estimate_data_p7.data(), raw_estimate_data_p8.data(), raw_estimate_data_p9.data(), raw_estimate_data_p10.data(), raw_estimate_data_p11.data(), raw_estimate_data_p12.data(), raw_estimate_data_p13.data(), raw_estimate_data_p14.data(), raw_estimate_data_p15.data(), raw_estimate_data_p16.data(), raw_estimate_data_p17.data(), raw_estimate_data_p18.data()};
 
-CUCO_HLL_TUNING_ARR_DECL bias_data_p4{10.0,
-                                      9.717,
-                                      9.207,
-                                      8.7896,
-                                      8.2882,
-                                      7.8204,
-                                      7.3772,
-                                      6.9342,
-                                      6.5202,
-                                      6.161,
-                                      5.7722,
-                                      5.4636,
-                                      5.0396,
-                                      4.6766,
-                                      4.3566,
-                                      4.0454,
-                                      3.7936,
-                                      3.4856,
-                                      3.2666,
-                                      2.9946,
-                                      2.766,
-                                      2.4692,
-                                      2.3638,
-                                      2.0764,
-                                      1.7864,
-                                      1.7602,
-                                      1.4814,
-                                      1.433,
-                                      1.2926,
-                                      1.0664,
-                                      0.999600000000001,
-                                      0.7956,
-                                      0.5366,
-                                      0.589399999999998,
-                                      0.573799999999999,
-                                      0.269799999999996,
-                                      0.368200000000002,
-                                      0.0544000000000011,
-                                      0.234200000000001,
-                                      0.0108000000000033,
-                                      -0.203400000000002,
-                                      -0.0701999999999998,
-                                      -0.129600000000003,
-                                      -0.364199999999997,
-                                      -0.480600000000003,
-                                      -0.226999999999997,
-                                      -0.322800000000001,
-                                      -0.382599999999996,
-                                      -0.511200000000002,
-                                      -0.669600000000003,
-                                      -0.749400000000001,
-                                      -0.500399999999999,
-                                      -0.617600000000003,
-                                      -0.6922,
-                                      -0.601599999999998,
-                                      -0.416200000000003,
-                                      -0.338200000000001,
-                                      -0.782600000000002,
-                                      -0.648600000000002,
-                                      -0.919800000000002,
-                                      -0.851799999999997,
-                                      -0.962400000000002,
-                                      -0.6402,
-                                      -1.1922,
-                                      -1.0256,
-                                      -1.086,
-                                      -1.21899999999999,
-                                      -0.819400000000002,
-                                      -0.940600000000003,
-                                      -1.1554,
-                                      -1.2072,
-                                      -1.1752,
-                                      -1.16759999999999,
-                                      -1.14019999999999,
-                                      -1.3754,
-                                      -1.29859999999999,
-                                      -1.607,
-                                      -1.3292,
-                                      -1.7606};
-CUCO_HLL_TUNING_ARR_DECL bias_data_p5{22.0,
-                                      21.1194,
-                                      20.8208,
-                                      20.2318,
-                                      19.77,
-                                      19.2436,
-                                      18.7774,
-                                      18.2848,
-                                      17.8224,
-                                      17.3742,
-                                      16.9336,
-                                      16.503,
-                                      16.0494,
-                                      15.6292,
-                                      15.2124,
-                                      14.798,
-                                      14.367,
-                                      13.9728,
-                                      13.5944,
-                                      13.217,
-                                      12.8438,
-                                      12.3696,
-                                      12.0956,
-                                      11.7044,
-                                      11.324,
-                                      11.0668,
-                                      10.6698,
-                                      10.3644,
-                                      10.049,
-                                      9.6918,
-                                      9.4146,
-                                      9.082,
-                                      8.687,
-                                      8.5398,
-                                      8.2462,
-                                      7.857,
-                                      7.6606,
-                                      7.4168,
-                                      7.1248,
-                                      6.9222,
-                                      6.6804,
-                                      6.447,
-                                      6.3454,
-                                      5.9594,
-                                      5.7636,
-                                      5.5776,
-                                      5.331,
-                                      5.19,
-                                      4.9676,
-                                      4.7564,
-                                      4.5314,
-                                      4.4442,
-                                      4.3708,
-                                      3.9774,
-                                      3.9624,
-                                      3.8796,
-                                      3.755,
-                                      3.472,
-                                      3.2076,
-                                      3.1024,
-                                      2.8908,
-                                      2.7338,
-                                      2.7728,
-                                      2.629,
-                                      2.413,
-                                      2.3266,
-                                      2.1524,
-                                      2.2642,
-                                      2.1806,
-                                      2.0566,
-                                      1.9192,
-                                      1.7598,
-                                      1.3516,
-                                      1.5802,
-                                      1.43859999999999,
-                                      1.49160000000001,
-                                      1.1524,
-                                      1.1892,
-                                      0.841399999999993,
-                                      0.879800000000003,
-                                      0.837599999999995,
-                                      0.469800000000006,
-                                      0.765600000000006,
-                                      0.331000000000003,
-                                      0.591399999999993,
-                                      0.601200000000006,
-                                      0.701599999999999,
-                                      0.558199999999999,
-                                      0.339399999999998,
-                                      0.354399999999998,
-                                      0.491200000000006,
-                                      0.308000000000007,
-                                      0.355199999999996,
-                                      -0.0254000000000048,
-                                      0.205200000000005,
-                                      -0.272999999999996,
-                                      0.132199999999997,
-                                      0.394400000000005,
-                                      -0.241200000000006,
-                                      0.242000000000004,
-                                      0.191400000000002,
-                                      0.253799999999998,
-                                      -0.122399999999999,
-                                      -0.370800000000003,
-                                      0.193200000000004,
-                                      -0.0848000000000013,
-                                      0.0867999999999967,
-                                      -0.327200000000005,
-                                      -0.285600000000002,
-                                      0.311400000000006,
-                                      -0.128399999999999,
-                                      -0.754999999999995,
-                                      -0.209199999999996,
-                                      -0.293599999999998,
-                                      -0.364000000000004,
-                                      -0.253600000000006,
-                                      -0.821200000000005,
-                                      -0.253600000000006,
-                                      -0.510400000000004,
-                                      -0.383399999999995,
-                                      -0.491799999999998,
-                                      -0.220200000000006,
-                                      -0.0972000000000008,
-                                      -0.557400000000001,
-                                      -0.114599999999996,
-                                      -0.295000000000002,
-                                      -0.534800000000004,
-                                      0.346399999999988,
-                                      -0.65379999999999,
-                                      0.0398000000000138,
-                                      0.0341999999999985,
-                                      -0.995800000000003,
-                                      -0.523400000000009,
-                                      -0.489000000000004,
-                                      -0.274799999999999,
-                                      -0.574999999999989,
-                                      -0.482799999999997,
-                                      0.0571999999999946,
-                                      -0.330600000000004,
-                                      -0.628800000000012,
-                                      -0.140199999999993,
-                                      -0.540600000000012,
-                                      -0.445999999999998,
-                                      -0.599400000000003,
-                                      -0.262599999999992,
-                                      0.163399999999996,
-                                      -0.100599999999986,
-                                      -0.39500000000001,
-                                      -1.06960000000001,
-                                      -0.836399999999998,
-                                      -0.753199999999993,
-                                      -0.412399999999991,
-                                      -0.790400000000005,
-                                      -0.29679999999999,
-                                      -0.28540000000001,
-                                      -0.193000000000012,
-                                      -0.0772000000000048,
-                                      -0.962799999999987,
-                                      -0.414800000000014};
-CUCO_HLL_TUNING_ARR_DECL bias_data_p6{45.0,
-                                      44.1902,
-                                      43.271,
-                                      42.8358,
-                                      41.8142,
-                                      41.2854,
-                                      40.317,
-                                      39.354,
-                                      38.8924,
-                                      37.9436,
-                                      37.4596,
-                                      36.5262,
-                                      35.6248,
-                                      35.1574,
-                                      34.2822,
-                                      33.837,
-                                      32.9636,
-                                      32.074,
-                                      31.7042,
-                                      30.7976,
-                                      30.4772,
-                                      29.6564,
-                                      28.7942,
-                                      28.5004,
-                                      27.686,
-                                      27.291,
-                                      26.5672,
-                                      25.8556,
-                                      25.4982,
-                                      24.8204,
-                                      24.4252,
-                                      23.7744,
-                                      23.0786,
-                                      22.8344,
-                                      22.0294,
-                                      21.8098,
-                                      21.0794,
-                                      20.5732,
-                                      20.1878,
-                                      19.5648,
-                                      19.2902,
-                                      18.6784,
-                                      18.3352,
-                                      17.8946,
-                                      17.3712,
-                                      17.0852,
-                                      16.499,
-                                      16.2686,
-                                      15.6844,
-                                      15.2234,
-                                      14.9732,
-                                      14.3356,
-                                      14.2286,
-                                      13.7262,
-                                      13.3284,
-                                      13.1048,
-                                      12.5962,
-                                      12.3562,
-                                      12.1272,
-                                      11.4184,
-                                      11.4974,
-                                      11.0822,
-                                      10.856,
-                                      10.48,
-                                      10.2834,
-                                      10.0208,
-                                      9.637,
-                                      9.51739999999999,
-                                      9.05759999999999,
-                                      8.74760000000001,
-                                      8.42700000000001,
-                                      8.1326,
-                                      8.2372,
-                                      8.2788,
-                                      7.6776,
-                                      7.79259999999999,
-                                      7.1952,
-                                      6.9564,
-                                      6.6454,
-                                      6.87,
-                                      6.5428,
-                                      6.19999999999999,
-                                      6.02940000000001,
-                                      5.62780000000001,
-                                      5.6782,
-                                      5.792,
-                                      5.35159999999999,
-                                      5.28319999999999,
-                                      5.0394,
-                                      5.07480000000001,
-                                      4.49119999999999,
-                                      4.84899999999999,
-                                      4.696,
-                                      4.54040000000001,
-                                      4.07300000000001,
-                                      4.37139999999999,
-                                      3.7216,
-                                      3.7328,
-                                      3.42080000000001,
-                                      3.41839999999999,
-                                      3.94239999999999,
-                                      3.27719999999999,
-                                      3.411,
-                                      3.13079999999999,
-                                      2.76900000000001,
-                                      2.92580000000001,
-                                      2.68279999999999,
-                                      2.75020000000001,
-                                      2.70599999999999,
-                                      2.3886,
-                                      3.01859999999999,
-                                      2.45179999999999,
-                                      2.92699999999999,
-                                      2.41720000000001,
-                                      2.41139999999999,
-                                      2.03299999999999,
-                                      2.51240000000001,
-                                      2.5564,
-                                      2.60079999999999,
-                                      2.41720000000001,
-                                      1.80439999999999,
-                                      1.99700000000001,
-                                      2.45480000000001,
-                                      1.8948,
-                                      2.2346,
-                                      2.30860000000001,
-                                      2.15479999999999,
-                                      1.88419999999999,
-                                      1.6508,
-                                      0.677199999999999,
-                                      1.72540000000001,
-                                      1.4752,
-                                      1.72280000000001,
-                                      1.66139999999999,
-                                      1.16759999999999,
-                                      1.79300000000001,
-                                      1.00059999999999,
-                                      0.905200000000008,
-                                      0.659999999999997,
-                                      1.55879999999999,
-                                      1.1636,
-                                      0.688199999999995,
-                                      0.712600000000009,
-                                      0.450199999999995,
-                                      1.1978,
-                                      0.975599999999986,
-                                      0.165400000000005,
-                                      1.727,
-                                      1.19739999999999,
-                                      -0.252600000000001,
-                                      1.13460000000001,
-                                      1.3048,
-                                      1.19479999999999,
-                                      0.313400000000001,
-                                      0.878999999999991,
-                                      1.12039999999999,
-                                      0.853000000000009,
-                                      1.67920000000001,
-                                      0.856999999999999,
-                                      0.448599999999999,
-                                      1.2362,
-                                      0.953399999999988,
-                                      1.02859999999998,
-                                      0.563199999999995,
-                                      0.663000000000011,
-                                      0.723000000000013,
-                                      0.756599999999992,
-                                      0.256599999999992,
-                                      -0.837600000000009,
-                                      0.620000000000005,
-                                      0.821599999999989,
-                                      0.216600000000028,
-                                      0.205600000000004,
-                                      0.220199999999977,
-                                      0.372599999999977,
-                                      0.334400000000016,
-                                      0.928400000000011,
-                                      0.972800000000007,
-                                      0.192400000000021,
-                                      0.487199999999973,
-                                      -0.413000000000011,
-                                      0.807000000000016,
-                                      0.120600000000024,
-                                      0.769000000000005,
-                                      0.870799999999974,
-                                      0.66500000000002,
-                                      0.118200000000002,
-                                      0.401200000000017,
-                                      0.635199999999998,
-                                      0.135400000000004,
-                                      0.175599999999974,
-                                      1.16059999999999,
-                                      0.34620000000001,
-                                      0.521400000000028,
-                                      -0.586599999999976,
-                                      -1.16480000000001,
-                                      0.968399999999974,
-                                      0.836999999999989,
-                                      0.779600000000016,
-                                      0.985799999999983};
-CUCO_HLL_TUNING_ARR_DECL bias_data_p7{91.0,
-                                      89.4934,
-                                      87.9758,
-                                      86.4574,
-                                      84.9718,
-                                      83.4954,
-                                      81.5302,
-                                      80.0756,
-                                      78.6374,
-                                      77.1782,
-                                      75.7888,
-                                      73.9522,
-                                      72.592,
-                                      71.2532,
-                                      69.9086,
-                                      68.5938,
-                                      66.9474,
-                                      65.6796,
-                                      64.4394,
-                                      63.2176,
-                                      61.9768,
-                                      60.4214,
-                                      59.2528,
-                                      58.0102,
-                                      56.8658,
-                                      55.7278,
-                                      54.3044,
-                                      53.1316,
-                                      52.093,
-                                      51.0032,
-                                      49.9092,
-                                      48.6306,
-                                      47.5294,
-                                      46.5756,
-                                      45.6508,
-                                      44.662,
-                                      43.552,
-                                      42.3724,
-                                      41.617,
-                                      40.5754,
-                                      39.7872,
-                                      38.8444,
-                                      37.7988,
-                                      36.8606,
-                                      36.2118,
-                                      35.3566,
-                                      34.4476,
-                                      33.5882,
-                                      32.6816,
-                                      32.0824,
-                                      31.0258,
-                                      30.6048,
-                                      29.4436,
-                                      28.7274,
-                                      27.957,
-                                      27.147,
-                                      26.4364,
-                                      25.7592,
-                                      25.3386,
-                                      24.781,
-                                      23.8028,
-                                      23.656,
-                                      22.6544,
-                                      21.996,
-                                      21.4718,
-                                      21.1544,
-                                      20.6098,
-                                      19.5956,
-                                      19.0616,
-                                      18.5758,
-                                      18.4878,
-                                      17.5244,
-                                      17.2146,
-                                      16.724,
-                                      15.8722,
-                                      15.5198,
-                                      15.0414,
-                                      14.941,
-                                      14.9048,
-                                      13.87,
-                                      13.4304,
-                                      13.028,
-                                      12.4708,
-                                      12.37,
-                                      12.0624,
-                                      11.4668,
-                                      11.5532,
-                                      11.4352,
-                                      11.2564,
-                                      10.2744,
-                                      10.2118,
-                                      9.74720000000002,
-                                      10.1456,
-                                      9.2928,
-                                      8.75040000000001,
-                                      8.55279999999999,
-                                      8.97899999999998,
-                                      8.21019999999999,
-                                      8.18340000000001,
-                                      7.3494,
-                                      7.32499999999999,
-                                      7.66140000000001,
-                                      6.90300000000002,
-                                      7.25439999999998,
-                                      6.9042,
-                                      7.21499999999997,
-                                      6.28640000000001,
-                                      6.08139999999997,
-                                      6.6764,
-                                      6.30099999999999,
-                                      5.13900000000001,
-                                      5.65800000000002,
-                                      5.17320000000001,
-                                      4.59019999999998,
-                                      4.9538,
-                                      5.08280000000002,
-                                      4.92200000000003,
-                                      4.99020000000002,
-                                      4.7328,
-                                      5.4538,
-                                      4.11360000000002,
-                                      4.22340000000003,
-                                      4.08780000000002,
-                                      3.70800000000003,
-                                      4.15559999999999,
-                                      4.18520000000001,
-                                      3.63720000000001,
-                                      3.68220000000002,
-                                      3.77960000000002,
-                                      3.6078,
-                                      2.49160000000001,
-                                      3.13099999999997,
-                                      2.5376,
-                                      3.19880000000001,
-                                      3.21100000000001,
-                                      2.4502,
-                                      3.52820000000003,
-                                      2.91199999999998,
-                                      3.04480000000001,
-                                      2.7432,
-                                      2.85239999999999,
-                                      2.79880000000003,
-                                      2.78579999999999,
-                                      1.88679999999999,
-                                      2.98860000000002,
-                                      2.50639999999999,
-                                      1.91239999999999,
-                                      2.66160000000002,
-                                      2.46820000000002,
-                                      1.58199999999999,
-                                      1.30399999999997,
-                                      2.27379999999999,
-                                      2.68939999999998,
-                                      1.32900000000001,
-                                      3.10599999999999,
-                                      1.69080000000002,
-                                      2.13740000000001,
-                                      2.53219999999999,
-                                      1.88479999999998,
-                                      1.33240000000001,
-                                      1.45119999999997,
-                                      1.17899999999997,
-                                      2.44119999999998,
-                                      1.60659999999996,
-                                      2.16700000000003,
-                                      0.77940000000001,
-                                      2.37900000000002,
-                                      2.06700000000001,
-                                      1.46000000000004,
-                                      2.91160000000002,
-                                      1.69200000000001,
-                                      0.954600000000028,
-                                      2.49300000000005,
-                                      2.2722,
-                                      1.33500000000004,
-                                      2.44899999999996,
-                                      1.20140000000004,
-                                      3.07380000000001,
-                                      2.09739999999999,
-                                      2.85640000000001,
-                                      2.29960000000005,
-                                      2.40899999999999,
-                                      1.97040000000004,
-                                      0.809799999999996,
-                                      1.65279999999996,
-                                      2.59979999999996,
-                                      0.95799999999997,
-                                      2.06799999999998,
-                                      2.32780000000002,
-                                      4.20159999999998,
-                                      1.96320000000003,
-                                      1.86400000000003,
-                                      1.42999999999995,
-                                      3.77940000000001,
-                                      1.27200000000005,
-                                      1.86440000000005,
-                                      2.20600000000002,
-                                      3.21900000000005,
-                                      1.5154,
-                                      2.61019999999996};
-CUCO_HLL_TUNING_ARR_DECL bias_data_p8{183.2152,
-                                      180.2454,
-                                      177.2096,
-                                      173.6652,
-                                      170.6312,
-                                      167.6822,
-                                      164.249,
-                                      161.3296,
-                                      158.0038,
-                                      155.2074,
-                                      152.4612,
-                                      149.27,
-                                      146.5178,
-                                      143.4412,
-                                      140.8032,
-                                      138.1634,
-                                      135.1688,
-                                      132.6074,
-                                      129.6946,
-                                      127.2664,
-                                      124.8228,
-                                      122.0432,
-                                      119.6824,
-                                      116.9464,
-                                      114.6268,
-                                      112.2626,
-                                      109.8376,
-                                      107.4034,
-                                      104.8956,
-                                      102.8522,
-                                      100.7638,
-                                      98.3552,
-                                      96.3556,
-                                      93.7526,
-                                      91.9292,
-                                      89.8954,
-                                      87.8198,
-                                      85.7668,
-                                      83.298,
-                                      81.6688,
-                                      79.9466,
-                                      77.9746,
-                                      76.1672,
-                                      74.3474,
-                                      72.3028,
-                                      70.8912,
-                                      69.114,
-                                      67.4646,
-                                      65.9744,
-                                      64.4092,
-                                      62.6022,
-                                      60.843,
-                                      59.5684,
-                                      58.1652,
-                                      56.5426,
-                                      55.4152,
-                                      53.5388,
-                                      52.3592,
-                                      51.1366,
-                                      49.486,
-                                      48.3918,
-                                      46.5076,
-                                      45.509,
-                                      44.3834,
-                                      43.3498,
-                                      42.0668,
-                                      40.7346,
-                                      40.1228,
-                                      38.4528,
-                                      37.7,
-                                      36.644,
-                                      36.0518,
-                                      34.5774,
-                                      33.9068,
-                                      32.432,
-                                      32.1666,
-                                      30.434,
-                                      29.6644,
-                                      28.4894,
-                                      27.6312,
-                                      26.3804,
-                                      26.292,
-                                      25.5496000000001,
-                                      25.0234,
-                                      24.8206,
-                                      22.6146,
-                                      22.4188,
-                                      22.117,
-                                      20.6762,
-                                      20.6576,
-                                      19.7864,
-                                      19.509,
-                                      18.5334,
-                                      17.9204,
-                                      17.772,
-                                      16.2924,
-                                      16.8654,
-                                      15.1836,
-                                      15.745,
-                                      15.1316,
-                                      15.0386,
-                                      14.0136,
-                                      13.6342,
-                                      12.6196,
-                                      12.1866,
-                                      12.4281999999999,
-                                      11.3324,
-                                      10.4794000000001,
-                                      11.5038,
-                                      10.129,
-                                      9.52800000000002,
-                                      10.3203999999999,
-                                      9.46299999999997,
-                                      9.79280000000006,
-                                      9.12300000000005,
-                                      8.74180000000001,
-                                      9.2192,
-                                      7.51020000000005,
-                                      7.60659999999996,
-                                      7.01840000000004,
-                                      7.22239999999999,
-                                      7.40139999999997,
-                                      6.76179999999999,
-                                      7.14359999999999,
-                                      5.65060000000005,
-                                      5.63779999999997,
-                                      5.76599999999996,
-                                      6.75139999999999,
-                                      5.57759999999996,
-                                      3.73220000000003,
-                                      5.8048,
-                                      5.63019999999995,
-                                      4.93359999999996,
-                                      3.47979999999995,
-                                      4.33879999999999,
-                                      3.98940000000005,
-                                      3.81960000000004,
-                                      3.31359999999995,
-                                      3.23080000000004,
-                                      3.4588,
-                                      3.08159999999998,
-                                      3.4076,
-                                      3.00639999999999,
-                                      2.38779999999997,
-                                      2.61900000000003,
-                                      1.99800000000005,
-                                      3.34820000000002,
-                                      2.95060000000001,
-                                      0.990999999999985,
-                                      2.11440000000005,
-                                      2.20299999999997,
-                                      2.82219999999995,
-                                      2.73239999999998,
-                                      2.7826,
-                                      3.76660000000004,
-                                      2.26480000000004,
-                                      2.31280000000004,
-                                      2.40819999999997,
-                                      2.75360000000001,
-                                      3.33759999999995,
-                                      2.71559999999999,
-                                      1.7478000000001,
-                                      1.42920000000004,
-                                      2.39300000000003,
-                                      2.22779999999989,
-                                      2.34339999999997,
-                                      0.87259999999992,
-                                      3.88400000000001,
-                                      1.80600000000004,
-                                      1.91759999999999,
-                                      1.16779999999994,
-                                      1.50320000000011,
-                                      2.52500000000009,
-                                      0.226400000000012,
-                                      2.31500000000005,
-                                      0.930000000000064,
-                                      1.25199999999995,
-                                      2.14959999999996,
-                                      0.0407999999999902,
-                                      2.5447999999999,
-                                      1.32960000000003,
-                                      0.197400000000016,
-                                      2.52620000000002,
-                                      3.33279999999991,
-                                      -1.34300000000007,
-                                      0.422199999999975,
-                                      0.917200000000093,
-                                      1.12920000000008,
-                                      1.46060000000011,
-                                      1.45779999999991,
-                                      2.8728000000001,
-                                      3.33359999999993,
-                                      -1.34079999999994,
-                                      1.57680000000005,
-                                      0.363000000000056,
-                                      1.40740000000005,
-                                      0.656600000000026,
-                                      0.801400000000058,
-                                      -0.454600000000028,
-                                      1.51919999999996};
-CUCO_HLL_TUNING_ARR_DECL bias_data_p9{368.0,
-                                      361.8294,
-                                      355.2452,
-                                      348.6698,
-                                      342.1464,
-                                      336.2024,
-                                      329.8782,
-                                      323.6598,
-                                      317.462,
-                                      311.2826,
-                                      305.7102,
-                                      299.7416,
-                                      293.9366,
-                                      288.1046,
-                                      282.285,
-                                      277.0668,
-                                      271.306,
-                                      265.8448,
-                                      260.301,
-                                      254.9886,
-                                      250.2422,
-                                      244.8138,
-                                      239.7074,
-                                      234.7428,
-                                      229.8402,
-                                      225.1664,
-                                      220.3534,
-                                      215.594,
-                                      210.6886,
-                                      205.7876,
-                                      201.65,
-                                      197.228,
-                                      192.8036,
-                                      188.1666,
-                                      184.0818,
-                                      180.0824,
-                                      176.2574,
-                                      172.302,
-                                      168.1644,
-                                      164.0056,
-                                      160.3802,
-                                      156.7192,
-                                      152.5234,
-                                      149.2084,
-                                      145.831,
-                                      142.485,
-                                      139.1112,
-                                      135.4764,
-                                      131.76,
-                                      129.3368,
-                                      126.5538,
-                                      122.5058,
-                                      119.2646,
-                                      116.5902,
-                                      113.3818,
-                                      110.8998,
-                                      107.9532,
-                                      105.2062,
-                                      102.2798,
-                                      99.4728,
-                                      96.9582,
-                                      94.3292,
-                                      92.171,
-                                      89.7809999999999,
-                                      87.5716,
-                                      84.7048,
-                                      82.5322,
-                                      79.875,
-                                      78.3972,
-                                      75.3464,
-                                      73.7274,
-                                      71.2834,
-                                      70.1444,
-                                      68.4263999999999,
-                                      66.0166,
-                                      64.018,
-                                      62.0437999999999,
-                                      60.3399999999999,
-                                      58.6856,
-                                      57.9836,
-                                      55.0311999999999,
-                                      54.6769999999999,
-                                      52.3188,
-                                      51.4846,
-                                      49.4423999999999,
-                                      47.739,
-                                      46.1487999999999,
-                                      44.9202,
-                                      43.4059999999999,
-                                      42.5342000000001,
-                                      41.2834,
-                                      38.8954000000001,
-                                      38.3286000000001,
-                                      36.2146,
-                                      36.6684,
-                                      35.9946,
-                                      33.123,
-                                      33.4338,
-                                      31.7378000000001,
-                                      29.076,
-                                      28.9692,
-                                      27.4964,
-                                      27.0998,
-                                      25.9864,
-                                      26.7754,
-                                      24.3208,
-                                      23.4838,
-                                      22.7388000000001,
-                                      24.0758000000001,
-                                      21.9097999999999,
-                                      20.9728,
-                                      19.9228000000001,
-                                      19.9292,
-                                      16.617,
-                                      17.05,
-                                      18.2996000000001,
-                                      15.6128000000001,
-                                      15.7392,
-                                      14.5174,
-                                      13.6322,
-                                      12.2583999999999,
-                                      13.3766000000001,
-                                      11.423,
-                                      13.1232,
-                                      9.51639999999998,
-                                      10.5938000000001,
-                                      9.59719999999993,
-                                      8.12220000000002,
-                                      9.76739999999995,
-                                      7.50440000000003,
-                                      7.56999999999994,
-                                      6.70440000000008,
-                                      6.41419999999994,
-                                      6.71019999999999,
-                                      5.60940000000005,
-                                      4.65219999999999,
-                                      6.84099999999989,
-                                      3.4072000000001,
-                                      3.97859999999991,
-                                      3.32760000000007,
-                                      5.52160000000003,
-                                      3.31860000000006,
-                                      2.06940000000009,
-                                      4.35400000000004,
-                                      1.57500000000005,
-                                      0.280799999999999,
-                                      2.12879999999996,
-                                      -0.214799999999968,
-                                      -0.0378000000000611,
-                                      -0.658200000000079,
-                                      0.654800000000023,
-                                      -0.0697999999999865,
-                                      0.858400000000074,
-                                      -2.52700000000004,
-                                      -2.1751999999999,
-                                      -3.35539999999992,
-                                      -1.04019999999991,
-                                      -0.651000000000067,
-                                      -2.14439999999991,
-                                      -1.96659999999997,
-                                      -3.97939999999994,
-                                      -0.604400000000169,
-                                      -3.08260000000018,
-                                      -3.39159999999993,
-                                      -5.29640000000018,
-                                      -5.38920000000007,
-                                      -5.08759999999984,
-                                      -4.69900000000007,
-                                      -5.23720000000003,
-                                      -3.15779999999995,
-                                      -4.97879999999986,
-                                      -4.89899999999989,
-                                      -7.48880000000008,
-                                      -5.94799999999987,
-                                      -5.68060000000014,
-                                      -6.67180000000008,
-                                      -4.70499999999993,
-                                      -7.27779999999984,
-                                      -4.6579999999999,
-                                      -4.4362000000001,
-                                      -4.32139999999981,
-                                      -5.18859999999995,
-                                      -6.66879999999992,
-                                      -6.48399999999992,
-                                      -5.1260000000002,
-                                      -4.4032000000002,
-                                      -6.13500000000022,
-                                      -5.80819999999994,
-                                      -4.16719999999987,
-                                      -4.15039999999999,
-                                      -7.45600000000013,
-                                      -7.24080000000004,
-                                      -9.83179999999993,
-                                      -5.80420000000004,
-                                      -8.6561999999999,
-                                      -6.99940000000015,
-                                      -10.5473999999999,
-                                      -7.34139999999979,
-                                      -6.80999999999995,
-                                      -6.29719999999998,
-                                      -6.23199999999997};
-CUCO_HLL_TUNING_ARR_DECL bias_data_p10{737.1256,
-                                       724.4234,
-                                       711.1064,
-                                       698.4732,
-                                       685.4636,
-                                       673.0644,
-                                       660.488,
-                                       647.9654,
-                                       636.0832,
-                                       623.7864,
-                                       612.1992,
-                                       600.2176,
-                                       588.5228,
-                                       577.1716,
-                                       565.7752,
-                                       554.899,
-                                       543.6126,
-                                       532.6492,
-                                       521.9474,
-                                       511.5214,
-                                       501.1064,
-                                       490.6364,
-                                       480.2468,
-                                       470.4588,
-                                       460.3832,
-                                       451.0584,
-                                       440.8606,
-                                       431.3868,
-                                       422.5062,
-                                       413.1862,
-                                       404.463,
-                                       395.339,
-                                       386.1936,
-                                       378.1292,
-                                       369.1854,
-                                       361.2908,
-                                       353.3324,
-                                       344.8518,
-                                       337.5204,
-                                       329.4854,
-                                       321.9318,
-                                       314.552,
-                                       306.4658,
-                                       299.4256,
-                                       292.849,
-                                       286.152,
-                                       278.8956,
-                                       271.8792,
-                                       265.118,
-                                       258.62,
-                                       252.5132,
-                                       245.9322,
-                                       239.7726,
-                                       233.6086,
-                                       227.5332,
-                                       222.5918,
-                                       216.4294,
-                                       210.7662,
-                                       205.4106,
-                                       199.7338,
-                                       194.9012,
-                                       188.4486,
-                                       183.1556,
-                                       178.6338,
-                                       173.7312,
-                                       169.6264,
-                                       163.9526,
-                                       159.8742,
-                                       155.8326,
-                                       151.1966,
-                                       147.5594,
-                                       143.07,
-                                       140.037,
-                                       134.1804,
-                                       131.071,
-                                       127.4884,
-                                       124.0848,
-                                       120.2944,
-                                       117.333,
-                                       112.9626,
-                                       110.2902,
-                                       107.0814,
-                                       103.0334,
-                                       99.4832000000001,
-                                       96.3899999999999,
-                                       93.7202000000002,
-                                       90.1714000000002,
-                                       87.2357999999999,
-                                       85.9346,
-                                       82.8910000000001,
-                                       80.0264000000002,
-                                       78.3834000000002,
-                                       75.1543999999999,
-                                       73.8683999999998,
-                                       70.9895999999999,
-                                       69.4367999999999,
-                                       64.8701999999998,
-                                       65.0408000000002,
-                                       61.6738,
-                                       59.5207999999998,
-                                       57.0158000000001,
-                                       54.2302,
-                                       53.0962,
-                                       50.4985999999999,
-                                       52.2588000000001,
-                                       47.3914,
-                                       45.6244000000002,
-                                       42.8377999999998,
-                                       43.0072,
-                                       40.6516000000001,
-                                       40.2453999999998,
-                                       35.2136,
-                                       36.4546,
-                                       33.7849999999999,
-                                       33.2294000000002,
-                                       32.4679999999998,
-                                       30.8670000000002,
-                                       28.6507999999999,
-                                       28.9099999999999,
-                                       27.5983999999999,
-                                       26.1619999999998,
-                                       24.5563999999999,
-                                       23.2328000000002,
-                                       21.9484000000002,
-                                       21.5902000000001,
-                                       21.3346000000001,
-                                       17.7031999999999,
-                                       20.6111999999998,
-                                       19.5545999999999,
-                                       15.7375999999999,
-                                       17.0720000000001,
-                                       16.9517999999998,
-                                       15.326,
-                                       13.1817999999998,
-                                       14.6925999999999,
-                                       13.0859999999998,
-                                       13.2754,
-                                       10.8697999999999,
-                                       11.248,
-                                       7.3768,
-                                       4.72339999999986,
-                                       7.97899999999981,
-                                       8.7503999999999,
-                                       7.68119999999999,
-                                       9.7199999999998,
-                                       7.73919999999998,
-                                       5.6224000000002,
-                                       7.44560000000001,
-                                       6.6601999999998,
-                                       5.9058,
-                                       4.00199999999995,
-                                       4.51699999999983,
-                                       4.68240000000014,
-                                       3.86220000000003,
-                                       5.13639999999987,
-                                       5.98500000000013,
-                                       2.47719999999981,
-                                       2.61999999999989,
-                                       1.62800000000016,
-                                       4.65000000000009,
-                                       0.225599999999758,
-                                       0.831000000000131,
-                                       -0.359400000000278,
-                                       1.27599999999984,
-                                       -2.92559999999958,
-                                       -0.0303999999996449,
-                                       2.37079999999969,
-                                       -2.0033999999996,
-                                       0.804600000000391,
-                                       0.30199999999968,
-                                       1.1247999999996,
-                                       -2.6880000000001,
-                                       0.0321999999996478,
-                                       -1.18099999999959,
-                                       -3.9402,
-                                       -1.47940000000017,
-                                       -0.188400000000001,
-                                       -2.10720000000038,
-                                       -2.04159999999956,
-                                       -3.12880000000041,
-                                       -4.16160000000036,
-                                       -0.612799999999879,
-                                       -3.48719999999958,
-                                       -8.17900000000009,
-                                       -5.37780000000021,
-                                       -4.01379999999972,
-                                       -5.58259999999973,
-                                       -5.73719999999958,
-                                       -7.66799999999967,
-                                       -5.69520000000011,
-                                       -1.1247999999996,
-                                       -5.58520000000044,
-                                       -8.04560000000038,
-                                       -4.64840000000004,
-                                       -11.6468000000004,
-                                       -7.97519999999986,
-                                       -5.78300000000036,
-                                       -7.67420000000038,
-                                       -10.6328000000003,
-                                       -9.81720000000041};
-CUCO_HLL_TUNING_ARR_DECL bias_data_p11{1476.0,
-                                       1449.6014,
-                                       1423.5802,
-                                       1397.7942,
-                                       1372.3042,
-                                       1347.2062,
-                                       1321.8402,
-                                       1297.2292,
-                                       1272.9462,
-                                       1248.9926,
-                                       1225.3026,
-                                       1201.4252,
-                                       1178.0578,
-                                       1155.6092,
-                                       1132.626,
-                                       1110.5568,
-                                       1088.527,
-                                       1066.5154,
-                                       1045.1874,
-                                       1024.3878,
-                                       1003.37,
-                                       982.1972,
-                                       962.5728,
-                                       942.1012,
-                                       922.9668,
-                                       903.292,
-                                       884.0772,
-                                       864.8578,
-                                       846.6562,
-                                       828.041,
-                                       809.714,
-                                       792.3112,
-                                       775.1806,
-                                       757.9854,
-                                       740.656,
-                                       724.346,
-                                       707.5154,
-                                       691.8378,
-                                       675.7448,
-                                       659.6722,
-                                       645.5722,
-                                       630.1462,
-                                       614.4124,
-                                       600.8728,
-                                       585.898,
-                                       572.408,
-                                       558.4926,
-                                       544.4938,
-                                       531.6776,
-                                       517.282,
-                                       505.7704,
-                                       493.1012,
-                                       480.7388,
-                                       467.6876,
-                                       456.1872,
-                                       445.5048,
-                                       433.0214,
-                                       420.806,
-                                       411.409,
-                                       400.4144,
-                                       389.4294,
-                                       379.2286,
-                                       369.651,
-                                       360.6156,
-                                       350.337,
-                                       342.083,
-                                       332.1538,
-                                       322.5094,
-                                       315.01,
-                                       305.6686,
-                                       298.1678,
-                                       287.8116,
-                                       280.9978,
-                                       271.9204,
-                                       265.3286,
-                                       257.5706,
-                                       249.6014,
-                                       242.544,
-                                       235.5976,
-                                       229.583,
-                                       220.9438,
-                                       214.672,
-                                       208.2786,
-                                       201.8628,
-                                       195.1834,
-                                       191.505,
-                                       186.1816,
-                                       178.5188,
-                                       172.2294,
-                                       167.8908,
-                                       161.0194,
-                                       158.052,
-                                       151.4588,
-                                       148.1596,
-                                       143.4344,
-                                       138.5238,
-                                       133.13,
-                                       127.6374,
-                                       124.8162,
-                                       118.7894,
-                                       117.3984,
-                                       114.6078,
-                                       109.0858,
-                                       105.1036,
-                                       103.6258,
-                                       98.6018000000004,
-                                       95.7618000000002,
-                                       93.5821999999998,
-                                       88.5900000000001,
-                                       86.9992000000002,
-                                       82.8800000000001,
-                                       80.4539999999997,
-                                       74.6981999999998,
-                                       74.3644000000004,
-                                       73.2914000000001,
-                                       65.5709999999999,
-                                       66.9232000000002,
-                                       65.1913999999997,
-                                       62.5882000000001,
-                                       61.5702000000001,
-                                       55.7035999999998,
-                                       56.1764000000003,
-                                       52.7596000000003,
-                                       53.0302000000001,
-                                       49.0609999999997,
-                                       48.4694,
-                                       44.933,
-                                       46.0474000000004,
-                                       44.7165999999997,
-                                       41.9416000000001,
-                                       39.9207999999999,
-                                       35.6328000000003,
-                                       35.5276000000003,
-                                       33.1934000000001,
-                                       33.2371999999996,
-                                       33.3864000000003,
-                                       33.9228000000003,
-                                       30.2371999999996,
-                                       29.1373999999996,
-                                       25.2272000000003,
-                                       24.2942000000003,
-                                       19.8338000000003,
-                                       18.9005999999999,
-                                       23.0907999999999,
-                                       21.8544000000002,
-                                       19.5176000000001,
-                                       15.4147999999996,
-                                       16.9314000000004,
-                                       18.6737999999996,
-                                       12.9877999999999,
-                                       14.3688000000002,
-                                       12.0447999999997,
-                                       15.5219999999999,
-                                       12.5299999999997,
-                                       14.5940000000001,
-                                       14.3131999999996,
-                                       9.45499999999993,
-                                       12.9441999999999,
-                                       3.91139999999996,
-                                       13.1373999999996,
-                                       5.44720000000052,
-                                       9.82779999999912,
-                                       7.87279999999919,
-                                       3.67760000000089,
-                                       5.46980000000076,
-                                       5.55099999999948,
-                                       5.65979999999945,
-                                       3.89439999999922,
-                                       3.1275999999998,
-                                       5.65140000000065,
-                                       6.3062000000009,
-                                       3.90799999999945,
-                                       1.87060000000019,
-                                       5.17020000000048,
-                                       2.46680000000015,
-                                       0.770000000000437,
-                                       -3.72340000000077,
-                                       1.16400000000067,
-                                       8.05340000000069,
-                                       0.135399999999208,
-                                       2.15940000000046,
-                                       0.766999999999825,
-                                       1.0594000000001,
-                                       3.15500000000065,
-                                       -0.287399999999252,
-                                       2.37219999999979,
-                                       -2.86620000000039,
-                                       -1.63199999999961,
-                                       -2.22979999999916,
-                                       -0.15519999999924,
-                                       -1.46039999999994,
-                                       -0.262199999999211,
-                                       -2.34460000000036,
-                                       -2.8078000000005,
-                                       -3.22179999999935,
-                                       -5.60159999999996,
-                                       -8.42200000000048,
-                                       -9.43740000000071,
-                                       0.161799999999857,
-                                       -10.4755999999998,
-                                       -10.0823999999993};
-CUCO_HLL_TUNING_ARR_DECL bias_data_p12{2953.0,
-                                       2900.4782,
-                                       2848.3568,
-                                       2796.3666,
-                                       2745.324,
-                                       2694.9598,
-                                       2644.648,
-                                       2595.539,
-                                       2546.1474,
-                                       2498.2576,
-                                       2450.8376,
-                                       2403.6076,
-                                       2357.451,
-                                       2311.38,
-                                       2266.4104,
-                                       2221.5638,
-                                       2176.9676,
-                                       2134.193,
-                                       2090.838,
-                                       2048.8548,
-                                       2007.018,
-                                       1966.1742,
-                                       1925.4482,
-                                       1885.1294,
-                                       1846.4776,
-                                       1807.4044,
-                                       1768.8724,
-                                       1731.3732,
-                                       1693.4304,
-                                       1657.5326,
-                                       1621.949,
-                                       1586.5532,
-                                       1551.7256,
-                                       1517.6182,
-                                       1483.5186,
-                                       1450.4528,
-                                       1417.865,
-                                       1385.7164,
-                                       1352.6828,
-                                       1322.6708,
-                                       1291.8312,
-                                       1260.9036,
-                                       1231.476,
-                                       1201.8652,
-                                       1173.6718,
-                                       1145.757,
-                                       1119.2072,
-                                       1092.2828,
-                                       1065.0434,
-                                       1038.6264,
-                                       1014.3192,
-                                       988.5746,
-                                       965.0816,
-                                       940.1176,
-                                       917.9796,
-                                       894.5576,
-                                       871.1858,
-                                       849.9144,
-                                       827.1142,
-                                       805.0818,
-                                       783.9664,
-                                       763.9096,
-                                       742.0816,
-                                       724.3962,
-                                       706.3454,
-                                       688.018,
-                                       667.4214,
-                                       650.3106,
-                                       633.0686,
-                                       613.8094,
-                                       597.818,
-                                       581.4248,
-                                       563.834,
-                                       547.363,
-                                       531.5066,
-                                       520.455400000001,
-                                       505.583199999999,
-                                       488.366,
-                                       476.480799999999,
-                                       459.7682,
-                                       450.0522,
-                                       434.328799999999,
-                                       423.952799999999,
-                                       408.727000000001,
-                                       399.079400000001,
-                                       387.252200000001,
-                                       373.987999999999,
-                                       360.852000000001,
-                                       351.6394,
-                                       339.642,
-                                       330.902400000001,
-                                       322.661599999999,
-                                       311.662200000001,
-                                       301.3254,
-                                       291.7484,
-                                       279.939200000001,
-                                       276.7508,
-                                       263.215200000001,
-                                       254.811400000001,
-                                       245.5494,
-                                       242.306399999999,
-                                       234.8734,
-                                       223.787200000001,
-                                       217.7156,
-                                       212.0196,
-                                       200.793,
-                                       195.9748,
-                                       189.0702,
-                                       182.449199999999,
-                                       177.2772,
-                                       170.2336,
-                                       164.741,
-                                       158.613600000001,
-                                       155.311,
-                                       147.5964,
-                                       142.837,
-                                       137.3724,
-                                       132.0162,
-                                       130.0424,
-                                       121.9804,
-                                       120.451800000001,
-                                       114.8968,
-                                       111.585999999999,
-                                       105.933199999999,
-                                       101.705,
-                                       98.5141999999996,
-                                       95.0488000000005,
-                                       89.7880000000005,
-                                       91.4750000000004,
-                                       83.7764000000006,
-                                       80.9698000000008,
-                                       72.8574000000008,
-                                       73.1615999999995,
-                                       67.5838000000003,
-                                       62.6263999999992,
-                                       63.2638000000006,
-                                       66.0977999999996,
-                                       52.0843999999997,
-                                       58.9956000000002,
-                                       47.0912000000008,
-                                       46.4956000000002,
-                                       48.4383999999991,
-                                       47.1082000000006,
-                                       43.2392,
-                                       37.2759999999998,
-                                       40.0283999999992,
-                                       35.1864000000005,
-                                       35.8595999999998,
-                                       32.0998,
-                                       28.027,
-                                       23.6694000000007,
-                                       33.8266000000003,
-                                       26.3736000000008,
-                                       27.2008000000005,
-                                       21.3245999999999,
-                                       26.4115999999995,
-                                       23.4521999999997,
-                                       19.5013999999992,
-                                       19.8513999999996,
-                                       10.7492000000002,
-                                       18.6424000000006,
-                                       13.1265999999996,
-                                       18.2436000000016,
-                                       6.71860000000015,
-                                       3.39459999999963,
-                                       6.33759999999893,
-                                       7.76719999999841,
-                                       0.813999999998487,
-                                       3.82819999999992,
-                                       0.826199999999517,
-                                       8.07440000000133,
-                                       -1.59080000000176,
-                                       5.01780000000144,
-                                       0.455399999998917,
-                                       -0.24199999999837,
-                                       0.174800000000687,
-                                       -9.07640000000174,
-                                       -4.20160000000033,
-                                       -3.77520000000004,
-                                       -4.75179999999818,
-                                       -5.3724000000002,
-                                       -8.90680000000066,
-                                       -6.10239999999976,
-                                       -5.74120000000039,
-                                       -9.95339999999851,
-                                       -3.86339999999836,
-                                       -13.7304000000004,
-                                       -16.2710000000006,
-                                       -7.51359999999841,
-                                       -3.30679999999847,
-                                       -13.1339999999982,
-                                       -10.0551999999989,
-                                       -6.72019999999975,
-                                       -8.59660000000076,
-                                       -10.9307999999983,
-                                       -1.8775999999998,
-                                       -4.82259999999951,
-                                       -13.7788,
-                                       -21.6470000000008,
-                                       -10.6735999999983,
-                                       -15.7799999999988};
-CUCO_HLL_TUNING_ARR_DECL bias_data_p13{5907.5052,         5802.2672,
-                                       5697.347,          5593.5794,
-                                       5491.2622,         5390.5514,
-                                       5290.3376,         5191.6952,
-                                       5093.5988,         4997.3552,
-                                       4902.5972,         4808.3082,
-                                       4715.5646,         4624.109,
-                                       4533.8216,         4444.4344,
-                                       4356.3802,         4269.2962,
-                                       4183.3784,         4098.292,
-                                       4014.79,           3932.4574,
-                                       3850.6036,         3771.2712,
-                                       3691.7708,         3615.099,
-                                       3538.1858,         3463.4746,
-                                       3388.8496,         3315.6794,
-                                       3244.5448,         3173.7516,
-                                       3103.3106,         3033.6094,
-                                       2966.5642,         2900.794,
-                                       2833.7256,         2769.81,
-                                       2707.3196,         2644.0778,
-                                       2583.9916,         2523.4662,
-                                       2464.124,          2406.073,
-                                       2347.0362,         2292.1006,
-                                       2238.1716,         2182.7514,
-                                       2128.4884,         2077.1314,
-                                       2025.037,          1975.3756,
-                                       1928.933,          1879.311,
-                                       1831.0006,         1783.2144,
-                                       1738.3096,         1694.5144,
-                                       1649.024,          1606.847,
-                                       1564.7528,         1525.3168,
-                                       1482.5372,         1443.9668,
-                                       1406.5074,         1365.867,
-                                       1329.2186,         1295.4186,
-                                       1257.9716,         1225.339,
-                                       1193.2972,         1156.3578,
-                                       1125.8686,         1091.187,
-                                       1061.4094,         1029.4188,
-                                       1000.9126,         972.3272,
-                                       944.004199999999,  915.7592,
-                                       889.965,           862.834200000001,
-                                       840.4254,          812.598399999999,
-                                       785.924200000001,  763.050999999999,
-                                       741.793799999999,  721.466,
-                                       699.040799999999,  677.997200000002,
-                                       649.866999999998,  634.911800000002,
-                                       609.8694,          591.981599999999,
-                                       570.2922,          557.129199999999,
-                                       538.3858,          521.872599999999,
-                                       502.951400000002,  495.776399999999,
-                                       475.171399999999,  459.751,
-                                       439.995200000001,  426.708999999999,
-                                       413.7016,          402.3868,
-                                       387.262599999998,  372.0524,
-                                       357.050999999999,  342.5098,
-                                       334.849200000001,  322.529399999999,
-                                       311.613799999999,  295.848000000002,
-                                       289.273000000001,  274.093000000001,
-                                       263.329600000001,  251.389599999999,
-                                       245.7392,          231.9614,
-                                       229.7952,          217.155200000001,
-                                       208.9588,          199.016599999999,
-                                       190.839199999999,  180.6976,
-                                       176.272799999999,  166.976999999999,
-                                       162.5252,          151.196400000001,
-                                       149.386999999999,  133.981199999998,
-                                       130.0586,          130.164000000001,
-                                       122.053400000001,  110.7428,
-                                       108.1276,          106.232400000001,
-                                       100.381600000001,  98.7668000000012,
-                                       86.6440000000002,  79.9768000000004,
-                                       82.4722000000002,  68.7026000000005,
-                                       70.1186000000016,  71.9948000000004,
-                                       58.998599999999,   59.0492000000013,
-                                       56.9818000000014,  47.5338000000011,
-                                       42.9928,           51.1591999999982,
-                                       37.2740000000013,  42.7220000000016,
-                                       31.3734000000004,  26.8090000000011,
-                                       25.8934000000008,  26.5286000000015,
-                                       29.5442000000003,  19.3503999999994,
-                                       26.0760000000009,  17.9527999999991,
-                                       14.8419999999969,  10.4683999999979,
-                                       8.65899999999965,  9.86720000000059,
-                                       4.34139999999752,  -0.907800000000861,
-                                       -3.32080000000133, -0.936199999996461,
-                                       -11.9916000000012, -8.87000000000262,
-                                       -6.33099999999831, -11.3366000000024,
-                                       -15.9207999999999, -9.34659999999712,
-                                       -15.5034000000014, -19.2097999999969,
-                                       -15.357799999998,  -28.2235999999975,
-                                       -30.6898000000001, -19.3271999999997,
-                                       -25.6083999999973, -24.409599999999,
-                                       -13.6385999999984, -33.4473999999973,
-                                       -32.6949999999997, -28.9063999999998,
-                                       -31.7483999999968, -32.2935999999972,
-                                       -35.8329999999987, -47.620600000002,
-                                       -39.0855999999985, -33.1434000000008,
-                                       -46.1371999999974, -37.5892000000022,
-                                       -46.8164000000033, -47.3142000000007,
-                                       -60.2914000000019, -37.7575999999972};
-CUCO_HLL_TUNING_ARR_DECL bias_data_p14{
-  11816.475,         11605.0046,        11395.3792,        11188.7504,        10984.1814,
-  10782.0086,        10582.0072,        10384.503,         10189.178,         9996.2738,
-  9806.0344,         9617.9798,         9431.394,          9248.7784,         9067.6894,
-  8889.6824,         8712.9134,         8538.8624,         8368.4944,         8197.7956,
-  8031.8916,         7866.6316,         7703.733,          7544.5726,         7386.204,
-  7230.666,          7077.8516,         6926.7886,         6778.6902,         6631.9632,
-  6487.304,          6346.7486,         6206.4408,         6070.202,          5935.2576,
-  5799.924,          5671.0324,         5541.9788,         5414.6112,         5290.0274,
-  5166.723,          5047.6906,         4929.162,          4815.1406,         4699.127,
-  4588.5606,         4477.7394,         4369.4014,         4264.2728,         4155.9224,
-  4055.581,          3955.505,          3856.9618,         3761.3828,         3666.9702,
-  3575.7764,         3482.4132,         3395.0186,         3305.8852,         3221.415,
-  3138.6024,         3056.296,          2970.4494,         2896.1526,         2816.8008,
-  2740.2156,         2670.497,          2594.1458,         2527.111,          2460.8168,
-  2387.5114,         2322.9498,         2260.6752,         2194.2686,         2133.7792,
-  2074.767,          2015.204,          1959.4226,         1898.6502,         1850.006,
-  1792.849,          1741.4838,         1687.9778,         1638.1322,         1589.3266,
-  1543.1394,         1496.8266,         1447.8516,         1402.7354,         1361.9606,
-  1327.0692,         1285.4106,         1241.8112,         1201.6726,         1161.973,
-  1130.261,          1094.2036,         1048.2036,         1020.6436,         990.901400000002,
-  961.199800000002,  924.769800000002,  899.526400000002,  872.346400000002,  834.375,
-  810.432000000001,  780.659800000001,  756.013800000001,  733.479399999997,  707.923999999999,
-  673.858,           652.222399999999,  636.572399999997,  615.738599999997,  586.696400000001,
-  564.147199999999,  541.679600000003,  523.943599999999,  505.714599999999,  475.729599999999,
-  461.779600000002,  449.750800000002,  439.020799999998,  412.7886,          400.245600000002,
-  383.188199999997,  362.079599999997,  357.533799999997,  334.319000000003,  327.553399999997,
-  308.559399999998,  291.270199999999,  279.351999999999,  271.791400000002,  252.576999999997,
-  247.482400000001,  236.174800000001,  218.774599999997,  220.155200000001,  208.794399999999,
-  201.223599999998,  182.995600000002,  185.5268,          164.547400000003,  176.5962,
-  150.689599999998,  157.8004,          138.378799999999,  134.021200000003,  117.614399999999,
-  108.194000000003,  97.0696000000025,  89.6042000000016,  95.6030000000028,  84.7810000000027,
-  72.635000000002,   77.3482000000004,  59.4907999999996,  55.5875999999989,  50.7346000000034,
-  61.3916000000027,  50.9149999999936,  39.0384000000049,  58.9395999999979,  29.633600000001,
-  28.2032000000036,  26.0078000000067,  17.0387999999948,  9.22000000000116,  13.8387999999977,
-  8.07240000000456,  14.1549999999988,  15.3570000000036,  3.42660000000615,  6.24820000000182,
-  -2.96940000000177, -8.79940000000352, -5.97860000000219, -14.4048000000039, -3.4143999999942,
-  -13.0148000000045, -11.6977999999945, -25.7878000000055, -22.3185999999987, -24.409599999999,
-  -31.9756000000052, -18.9722000000038, -22.8678000000073, -30.8972000000067, -32.3715999999986,
-  -22.3907999999938, -43.6720000000059, -35.9038,          -39.7492000000057, -54.1641999999993,
-  -45.2749999999942, -42.2989999999991, -44.1089999999967, -64.3564000000042, -49.9551999999967,
-  -42.6116000000038};
-CUCO_HLL_TUNING_ARR_DECL bias_data_p15{
-  23634.0036,         23210.8034,        22792.4744,        22379.1524,
-  21969.7928,         21565.326,         21165.3532,        20770.2806,
-  20379.9892,         19994.7098,        19613.318,         19236.799,
-  18865.4382,         18498.8244,        18136.5138,        17778.8668,
-  17426.2344,         17079.32,          16734.778,         16397.2418,
-  16063.3324,         15734.0232,        15409.731,         15088.728,
-  14772.9896,         14464.1402,        14157.5588,        13855.5958,
-  13559.3296,         13264.9096,        12978.326,         12692.0826,
-  12413.8816,         12137.3192,        11870.2326,        11602.5554,
-  11340.3142,         11079.613,         10829.5908,        10583.5466,
-  10334.0344,         10095.5072,        9859.694,          9625.2822,
-  9395.7862,          9174.0586,         8957.3164,         8738.064,
-  8524.155,           8313.7396,         8116.9168,         7913.542,
-  7718.4778,          7521.65,           7335.5596,         7154.2906,
-  6968.7396,          6786.3996,         6613.236,          6437.406,
-  6270.6598,          6107.7958,         5945.7174,         5787.6784,
-  5635.5784,          5482.308,          5337.9784,         5190.0864,
-  5045.9158,          4919.1386,         4771.817,          4645.7742,
-  4518.4774,          4385.5454,         4262.6622,         4142.74679999999,
-  4015.5318,          3897.9276,         3790.7764,         3685.13800000001,
-  3573.6274,          3467.9706,         3368.61079999999,  3271.5202,
-  3170.3848,          3076.4656,         2982.38400000001,  2888.4664,
-  2806.4868,          2711.9564,         2634.1434,         2551.3204,
-  2469.7662,          2396.61139999999,  2318.9902,         2243.8658,
-  2171.9246,          2105.01360000001,  2028.8536,         1960.9952,
-  1901.4096,          1841.86079999999,  1777.54700000001,  1714.5802,
-  1654.65059999999,   1596.311,          1546.2016,         1492.3296,
-  1433.8974,          1383.84600000001,  1339.4152,         1293.5518,
-  1245.8686,          1193.50659999999,  1162.27959999999,  1107.19439999999,
-  1069.18060000001,   1035.09179999999,  999.679000000004,  957.679999999993,
-  925.300199999998,   888.099400000006,  848.638600000006,  818.156400000007,
-  796.748399999997,   752.139200000005,  725.271200000003,  692.216,
-  671.633600000001,   647.939799999993,  621.670599999998,  575.398799999995,
-  561.226599999995,   532.237999999998,  521.787599999996,  483.095799999996,
-  467.049599999998,   465.286399999997,  415.548599999995,  401.047399999996,
-  380.607999999993,   377.362599999993,  347.258799999996,  338.371599999999,
-  310.096999999994,   301.409199999995,  276.280799999993,  265.586800000005,
-  258.994399999996,   223.915999999997,  215.925399999993,  213.503800000006,
-  191.045400000003,   166.718200000003,  166.259000000005,  162.941200000001,
-  148.829400000002,   141.645999999993,  123.535399999993,  122.329800000007,
-  89.473399999988,    80.1962000000058,  77.5457999999926,  59.1056000000099,
-  83.3509999999951,   52.2906000000075,  36.3979999999865,  40.6558000000077,
-  42.0003999999899,   19.6630000000005,  19.7153999999864,  -8.38539999999921,
-  -0.692799999989802, 0.854800000000978, 3.23219999999856,  -3.89040000000386,
-  -5.25880000001052,  -24.9052000000083, -22.6837999999989, -26.4286000000138,
-  -34.997000000003,   -37.0216000000073, -43.430400000012,  -58.2390000000014,
-  -68.8034000000043,  -56.9245999999985, -57.8583999999973, -77.3097999999882,
-  -73.2793999999994,  -81.0738000000129, -87.4530000000086, -65.0254000000132,
-  -57.296399999992,   -96.2746000000043, -103.25,           -96.081600000005,
-  -91.5542000000132,  -102.465200000006, -107.688599999994, -101.458000000013,
-  -109.715800000005};
-CUCO_HLL_TUNING_ARR_DECL bias_data_p16{
-  47270.0,           46423.3584,        45585.7074,        44757.152,         43938.8416,
-  43130.9514,        42330.03,          41540.407,         40759.6348,        39988.206,
-  39226.5144,        38473.2096,        37729.795,         36997.268,         36272.6448,
-  35558.665,         34853.0248,        34157.4472,        33470.5204,        32793.5742,
-  32127.0194,        31469.4182,        30817.6136,        30178.6968,        29546.8908,
-  28922.8544,        28312.271,         27707.0924,        27114.0326,        26526.692,
-  25948.6336,        25383.7826,        24823.5998,        24272.2974,        23732.2572,
-  23201.4976,        22674.2796,        22163.6336,        21656.515,         21161.7362,
-  20669.9368,        20189.4424,        19717.3358,        19256.3744,        18795.9638,
-  18352.197,         17908.5738,        17474.391,         17052.918,         16637.2236,
-  16228.4602,        15823.3474,        15428.6974,        15043.0284,        14667.6278,
-  14297.4588,        13935.2882,        13578.5402,        13234.6032,        12882.1578,
-  12548.0728,        12219.231,         11898.0072,        11587.2626,        11279.9072,
-  10973.5048,        10678.5186,        10392.4876,        10105.2556,        9825.766,
-  9562.5444,         9294.2222,         9038.2352,         8784.848,          8533.2644,
-  8301.7776,         8058.30859999999,  7822.94579999999,  7599.11319999999,  7366.90779999999,
-  7161.217,          6957.53080000001,  6736.212,          6548.21220000001,  6343.06839999999,
-  6156.28719999999,  5975.15419999999,  5791.75719999999,  5621.32019999999,  5451.66,
-  5287.61040000001,  5118.09479999999,  4957.288,          4798.4246,         4662.17559999999,
-  4512.05900000001,  4364.68539999999,  4220.77720000001,  4082.67259999999,  3957.19519999999,
-  3842.15779999999,  3699.3328,         3583.01180000001,  3473.8964,         3338.66639999999,
-  3233.55559999999,  3117.799,          3008.111,          2909.69140000001,  2814.86499999999,
-  2719.46119999999,  2624.742,          2532.46979999999,  2444.7886,         2370.1868,
-  2272.45259999999,  2196.19260000001,  2117.90419999999,  2023.2972,         1969.76819999999,
-  1885.58979999999,  1833.2824,         1733.91200000001,  1682.54920000001,  1604.57980000001,
-  1556.11240000001,  1491.3064,         1421.71960000001,  1371.22899999999,  1322.1324,
-  1264.7892,         1196.23920000001,  1143.8474,         1088.67240000001,  1073.60380000001,
-  1023.11660000001,  959.036400000012,  927.433199999999,  906.792799999996,  853.433599999989,
-  841.873800000001,  791.1054,          756.899999999994,  704.343200000003,  672.495599999995,
-  622.790399999998,  611.254799999995,  567.283200000005,  519.406599999988,  519.188400000014,
-  495.312800000014,  451.350799999986,  443.973399999988,  431.882199999993,  392.027000000002,
-  380.924200000009,  345.128999999986,  298.901400000002,  287.771999999997,  272.625,
-  247.253000000026,  222.490600000019,  223.590000000026,  196.407599999977,  176.425999999978,
-  134.725199999986,  132.4804,          110.445599999977,  86.7939999999944,  56.7038000000175,
-  64.915399999998,   38.3726000000024,  37.1606000000029,  46.170999999973,   49.1716000000015,
-  15.3362000000197,  6.71639999997569,  -34.8185999999987, -39.4476000000141, 12.6830000000191,
-  -12.3331999999937, -50.6565999999875, -59.9538000000175, -65.1054000000004, -70.7576000000117,
-  -106.325200000021, -126.852200000023, -110.227599999984, -132.885999999999, -113.897200000007,
-  -142.713800000027, -151.145399999979, -150.799200000009, -177.756200000003, -156.036399999983,
-  -182.735199999996, -177.259399999981, -198.663600000029, -174.577600000019, -193.84580000001};
-CUCO_HLL_TUNING_ARR_DECL bias_data_p17{
-  94541.0,           92848.811,         91174.019,         89517.558,         87879.9705,
-  86262.7565,        84663.5125,        83083.7435,        81521.7865,        79977.272,
-  78455.9465,        76950.219,         75465.432,         73994.152,         72546.71,
-  71115.2345,        69705.6765,        68314.937,         66944.2705,        65591.255,
-  64252.9485,        62938.016,         61636.8225,        60355.592,         59092.789,
-  57850.568,         56624.518,         55417.343,         54231.1415,        53067.387,
-  51903.526,         50774.649,         49657.6415,        48561.05,          47475.7575,
-  46410.159,         45364.852,         44327.053,         43318.4005,        42325.6165,
-  41348.4595,        40383.6265,        39436.77,          38509.502,         37594.035,
-  36695.939,         35818.6895,        34955.691,         34115.8095,        33293.949,
-  32465.0775,        31657.6715,        30877.2585,        30093.78,          29351.3695,
-  28594.1365,        27872.115,         27168.7465,        26477.076,         25774.541,
-  25106.5375,        24452.5135,        23815.5125,        23174.0655,        22555.2685,
-  21960.2065,        21376.3555,        20785.1925,        20211.517,         19657.0725,
-  19141.6865,        18579.737,         18081.3955,        17578.995,         17073.44,
-  16608.335,         16119.911,         15651.266,         15194.583,         14749.0495,
-  14343.4835,        13925.639,         13504.509,         13099.3885,        12691.2855,
-  12328.018,         11969.0345,        11596.5145,        11245.6355,        10917.6575,
-  10580.9785,        10277.8605,        9926.58100000001,  9605.538,          9300.42950000003,
-  8989.97850000003,  8728.73249999998,  8448.3235,         8175.31050000002,  7898.98700000002,
-  7629.79100000003,  7413.76199999999,  7149.92300000001,  6921.12650000001,  6677.1545,
-  6443.28000000003,  6278.23450000002,  6014.20049999998,  5791.20299999998,  5605.78450000001,
-  5438.48800000001,  5234.2255,         5059.6825,         4887.43349999998,  4682.935,
-  4496.31099999999,  4322.52250000002,  4191.42499999999,  4021.24200000003,  3900.64799999999,
-  3762.84250000003,  3609.98050000001,  3502.29599999997,  3363.84250000003,  3206.54849999998,
-  3079.70000000001,  2971.42300000001,  2867.80349999998,  2727.08100000001,  2630.74900000001,
-  2496.6165,         2440.902,          2356.19150000002,  2235.58199999999,  2120.54149999999,
-  2012.25449999998,  1933.35600000003,  1820.93099999998,  1761.54800000001,  1663.09350000002,
-  1578.84600000002,  1509.48149999999,  1427.3345,         1379.56150000001,  1306.68099999998,
-  1212.63449999999,  1084.17300000001,  1124.16450000001,  1060.69949999999,  1007.48849999998,
-  941.194499999983,  879.880500000028,  836.007500000007,  782.802000000025,  748.385499999975,
-  647.991500000004,  626.730500000005,  570.776000000013,  484.000500000024,  513.98550000001,
-  418.985499999952,  386.996999999974,  370.026500000036,  355.496999999974,  356.731499999994,
-  255.92200000002,   259.094000000041,  205.434499999974,  165.374500000034,  197.347500000033,
-  95.718499999959,   67.6165000000037,  54.6970000000438,  31.7395000000251,  -15.8784999999916,
-  8.42500000004657,  -26.3754999999655, -118.425500000012, -66.6629999999423, -42.9745000000112,
-  -107.364999999991, -189.839000000036, -162.611499999999, -164.964999999967, -189.079999999958,
-  -223.931499999948, -235.329999999958, -269.639500000048, -249.087999999989, -206.475499999942,
-  -283.04449999996,  -290.667000000016, -304.561499999953, -336.784499999951, -380.386500000022,
-  -283.280499999993, -364.533000000054, -389.059499999974, -364.454000000027, -415.748000000021,
-  -417.155000000028};
-CUCO_HLL_TUNING_ARR_DECL bias_data_p18{189083.0,
-                                       185696.913,
-                                       182348.774,
-                                       179035.946,
-                                       175762.762,
-                                       172526.444,
-                                       169329.754,
-                                       166166.099,
-                                       163043.269,
-                                       159958.91,
-                                       156907.912,
-                                       153906.845,
-                                       150924.199,
-                                       147996.568,
-                                       145093.457,
-                                       142239.233,
-                                       139421.475,
-                                       136632.27,
-                                       133889.588,
-                                       131174.2,
-                                       128511.619,
-                                       125868.621,
-                                       123265.385,
-                                       120721.061,
-                                       118181.769,
-                                       115709.456,
-                                       113252.446,
-                                       110840.198,
-                                       108465.099,
-                                       106126.164,
-                                       103823.469,
-                                       101556.618,
-                                       99308.004,
-                                       97124.508,
-                                       94937.803,
-                                       92833.731,
-                                       90745.061,
-                                       88677.627,
-                                       86617.47,
-                                       84650.442,
-                                       82697.833,
-                                       80769.132,
-                                       78879.629,
-                                       77014.432,
-                                       75215.626,
-                                       73384.587,
-                                       71652.482,
-                                       69895.93,
-                                       68209.301,
-                                       66553.669,
-                                       64921.981,
-                                       63310.323,
-                                       61742.115,
-                                       60205.018,
-                                       58698.658,
-                                       57190.657,
-                                       55760.865,
-                                       54331.169,
-                                       52908.167,
-                                       51550.273,
-                                       50225.254,
-                                       48922.421,
-                                       47614.533,
-                                       46362.049,
-                                       45098.569,
-                                       43926.083,
-                                       42736.03,
-                                       41593.473,
-                                       40425.26,
-                                       39316.237,
-                                       38243.651,
-                                       37170.617,
-                                       36114.609,
-                                       35084.19,
-                                       34117.233,
-                                       33206.509,
-                                       32231.505,
-                                       31318.728,
-                                       30403.404,
-                                       29540.0550000001,
-                                       28679.236,
-                                       27825.862,
-                                       26965.216,
-                                       26179.148,
-                                       25462.08,
-                                       24645.952,
-                                       23922.523,
-                                       23198.144,
-                                       22529.128,
-                                       21762.4179999999,
-                                       21134.779,
-                                       20459.117,
-                                       19840.818,
-                                       19187.04,
-                                       18636.3689999999,
-                                       17982.831,
-                                       17439.7389999999,
-                                       16874.547,
-                                       16358.2169999999,
-                                       15835.684,
-                                       15352.914,
-                                       14823.681,
-                                       14329.313,
-                                       13816.897,
-                                       13342.874,
-                                       12880.882,
-                                       12491.648,
-                                       12021.254,
-                                       11625.392,
-                                       11293.7610000001,
-                                       10813.697,
-                                       10456.209,
-                                       10099.074,
-                                       9755.39000000001,
-                                       9393.18500000006,
-                                       9047.57900000003,
-                                       8657.98499999999,
-                                       8395.85900000005,
-                                       8033.0,
-                                       7736.95900000003,
-                                       7430.59699999995,
-                                       7258.47699999996,
-                                       6924.58200000005,
-                                       6691.29399999999,
-                                       6357.92500000005,
-                                       6202.05700000003,
-                                       5921.19700000004,
-                                       5628.28399999999,
-                                       5404.96799999999,
-                                       5226.71100000001,
-                                       4990.75600000005,
-                                       4799.77399999998,
-                                       4622.93099999998,
-                                       4472.478,
-                                       4171.78700000001,
-                                       3957.46299999999,
-                                       3868.95200000005,
-                                       3691.14300000004,
-                                       3474.63100000005,
-                                       3341.67200000002,
-                                       3109.14000000001,
-                                       3071.97400000005,
-                                       2796.40399999998,
-                                       2756.17799999996,
-                                       2611.46999999997,
-                                       2471.93000000005,
-                                       2382.26399999997,
-                                       2209.22400000005,
-                                       2142.28399999999,
-                                       2013.96100000001,
-                                       1911.18999999994,
-                                       1818.27099999995,
-                                       1668.47900000005,
-                                       1519.65800000005,
-                                       1469.67599999998,
-                                       1367.13800000004,
-                                       1248.52899999998,
-                                       1181.23600000003,
-                                       1022.71900000004,
-                                       1088.20700000005,
-                                       959.03600000008,
-                                       876.095999999903,
-                                       791.183999999892,
-                                       703.337000000058,
-                                       731.949999999953,
-                                       586.86400000006,
-                                       526.024999999907,
-                                       323.004999999888,
-                                       320.448000000091,
-                                       340.672999999952,
-                                       309.638999999966,
-                                       216.601999999955,
-                                       102.922999999952,
-                                       19.2399999999907,
-                                       -0.114000000059605,
-                                       -32.6240000000689,
-                                       -89.3179999999702,
-                                       -153.497999999905,
-                                       -64.2970000000205,
-                                       -143.695999999996,
-                                       -259.497999999905,
-                                       -253.017999999924,
-                                       -213.948000000091,
-                                       -397.590000000084,
-                                       -434.006000000052,
-                                       -403.475000000093,
-                                       -297.958000000101,
-                                       -404.317000000039,
-                                       -528.898999999976,
-                                       -506.621000000043,
-                                       -513.205000000075,
-                                       -479.351000000024,
-                                       -596.139999999898,
-                                       -527.016999999993,
-                                       -664.681000000099,
-                                       -680.306000000099,
-                                       -704.050000000047,
-                                       -850.486000000034,
-                                       -757.43200000003,
-                                       -713.308999999892};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p4{10.0, 9.717, 9.207, 8.7896, 8.2882, 7.8204, 7.3772, 6.9342, 6.5202, 6.161, 5.7722, 5.4636, 5.0396, 4.6766, 4.3566, 4.0454, 3.7936, 3.4856, 3.2666, 2.9946, 2.766, 2.4692, 2.3638, 2.0764, 1.7864, 1.7602, 1.4814, 1.433, 1.2926, 1.0664, 0.999600000000001, 0.7956, 0.5366, 0.589399999999998, 0.573799999999999, 0.269799999999996, 0.368200000000002, 0.0544000000000011, 0.234200000000001, 0.0108000000000033, -0.203400000000002, -0.0701999999999998, -0.129600000000003, -0.364199999999997, -0.480600000000003, -0.226999999999997, -0.322800000000001, -0.382599999999996, -0.511200000000002, -0.669600000000003, -0.749400000000001, -0.500399999999999, -0.617600000000003, -0.6922, -0.601599999999998, -0.416200000000003, -0.338200000000001, -0.782600000000002, -0.648600000000002, -0.919800000000002, -0.851799999999997, -0.962400000000002, -0.6402, -1.1922, -1.0256, -1.086, -1.21899999999999, -0.819400000000002, -0.940600000000003, -1.1554, -1.2072, -1.1752, -1.16759999999999, -1.14019999999999, -1.3754, -1.29859999999999, -1.607, -1.3292, -1.7606};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p5{22.0, 21.1194, 20.8208, 20.2318, 19.77, 19.2436, 18.7774, 18.2848, 17.8224, 17.3742, 16.9336, 16.503, 16.0494, 15.6292, 15.2124, 14.798, 14.367, 13.9728, 13.5944, 13.217, 12.8438, 12.3696, 12.0956, 11.7044, 11.324, 11.0668, 10.6698, 10.3644, 10.049, 9.6918, 9.4146, 9.082, 8.687, 8.5398, 8.2462, 7.857, 7.6606, 7.4168, 7.1248, 6.9222, 6.6804, 6.447, 6.3454, 5.9594, 5.7636, 5.5776, 5.331, 5.19, 4.9676, 4.7564, 4.5314, 4.4442, 4.3708, 3.9774, 3.9624, 3.8796, 3.755, 3.472, 3.2076, 3.1024, 2.8908, 2.7338, 2.7728, 2.629, 2.413, 2.3266, 2.1524, 2.2642, 2.1806, 2.0566, 1.9192, 1.7598, 1.3516, 1.5802, 1.43859999999999, 1.49160000000001, 1.1524, 1.1892, 0.841399999999993, 0.879800000000003, 0.837599999999995, 0.469800000000006, 0.765600000000006, 0.331000000000003, 0.591399999999993, 0.601200000000006, 0.701599999999999, 0.558199999999999, 0.339399999999998, 0.354399999999998, 0.491200000000006, 0.308000000000007, 0.355199999999996, -0.0254000000000048, 0.205200000000005, -0.272999999999996, 0.132199999999997, 0.394400000000005, -0.241200000000006, 0.242000000000004, 0.191400000000002, 0.253799999999998, -0.122399999999999, -0.370800000000003, 0.193200000000004, -0.0848000000000013, 0.0867999999999967, -0.327200000000005, -0.285600000000002, 0.311400000000006, -0.128399999999999, -0.754999999999995, -0.209199999999996, -0.293599999999998, -0.364000000000004, -0.253600000000006, -0.821200000000005, -0.253600000000006, -0.510400000000004, -0.383399999999995, -0.491799999999998, -0.220200000000006, -0.0972000000000008, -0.557400000000001, -0.114599999999996, -0.295000000000002, -0.534800000000004, 0.346399999999988, -0.65379999999999, 0.0398000000000138, 0.0341999999999985, -0.995800000000003, -0.523400000000009, -0.489000000000004, -0.274799999999999, -0.574999999999989, -0.482799999999997, 0.0571999999999946, -0.330600000000004, -0.628800000000012, -0.140199999999993, -0.540600000000012, -0.445999999999998, -0.599400000000003, 
-0.262599999999992, 0.163399999999996, -0.100599999999986, -0.39500000000001, -1.06960000000001, -0.836399999999998, -0.753199999999993, -0.412399999999991, -0.790400000000005, -0.29679999999999, -0.28540000000001, -0.193000000000012, -0.0772000000000048, -0.962799999999987, -0.414800000000014};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p6{45.0, 44.1902, 43.271, 42.8358, 41.8142, 41.2854, 40.317, 39.354, 38.8924, 37.9436, 37.4596, 36.5262, 35.6248, 35.1574, 34.2822, 33.837, 32.9636, 32.074, 31.7042, 30.7976, 30.4772, 29.6564, 28.7942, 28.5004, 27.686, 27.291, 26.5672, 25.8556, 25.4982, 24.8204, 24.4252, 23.7744, 23.0786, 22.8344, 22.0294, 21.8098, 21.0794, 20.5732, 20.1878, 19.5648, 19.2902, 18.6784, 18.3352, 17.8946, 17.3712, 17.0852, 16.499, 16.2686, 15.6844, 15.2234, 14.9732, 14.3356, 14.2286, 13.7262, 13.3284, 13.1048, 12.5962, 12.3562, 12.1272, 11.4184, 11.4974, 11.0822, 10.856, 10.48, 10.2834, 10.0208, 9.637, 9.51739999999999, 9.05759999999999, 8.74760000000001, 8.42700000000001, 8.1326, 8.2372, 8.2788, 7.6776, 7.79259999999999, 7.1952, 6.9564, 6.6454, 6.87, 6.5428, 6.19999999999999, 6.02940000000001, 5.62780000000001, 5.6782, 5.792, 5.35159999999999, 5.28319999999999, 5.0394, 5.07480000000001, 4.49119999999999, 4.84899999999999, 4.696, 4.54040000000001, 4.07300000000001, 4.37139999999999, 3.7216, 3.7328, 3.42080000000001, 3.41839999999999, 3.94239999999999, 3.27719999999999, 3.411, 3.13079999999999, 2.76900000000001, 2.92580000000001, 2.68279999999999, 2.75020000000001, 2.70599999999999, 2.3886, 3.01859999999999, 2.45179999999999, 2.92699999999999, 2.41720000000001, 2.41139999999999, 2.03299999999999, 2.51240000000001, 2.5564, 2.60079999999999, 2.41720000000001, 1.80439999999999, 1.99700000000001, 2.45480000000001, 1.8948, 2.2346, 2.30860000000001, 2.15479999999999, 1.88419999999999, 1.6508, 0.677199999999999, 1.72540000000001, 1.4752, 1.72280000000001, 1.66139999999999, 1.16759999999999, 1.79300000000001, 1.00059999999999, 0.905200000000008, 0.659999999999997, 1.55879999999999, 1.1636, 0.688199999999995, 0.712600000000009, 0.450199999999995, 1.1978, 0.975599999999986, 0.165400000000005, 1.727, 1.19739999999999, -0.252600000000001, 1.13460000000001, 1.3048, 1.19479999999999, 0.313400000000001, 0.878999999999991, 1.12039999999999, 0.853000000000009, 
1.67920000000001, 0.856999999999999, 0.448599999999999, 1.2362, 0.953399999999988, 1.02859999999998, 0.563199999999995, 0.663000000000011, 0.723000000000013, 0.756599999999992, 0.256599999999992, -0.837600000000009, 0.620000000000005, 0.821599999999989, 0.216600000000028, 0.205600000000004, 0.220199999999977, 0.372599999999977, 0.334400000000016, 0.928400000000011, 0.972800000000007, 0.192400000000021, 0.487199999999973, -0.413000000000011, 0.807000000000016, 0.120600000000024, 0.769000000000005, 0.870799999999974, 0.66500000000002, 0.118200000000002, 0.401200000000017, 0.635199999999998, 0.135400000000004, 0.175599999999974, 1.16059999999999, 0.34620000000001, 0.521400000000028, -0.586599999999976, -1.16480000000001, 0.968399999999974, 0.836999999999989, 0.779600000000016, 0.985799999999983};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p7{91.0, 89.4934, 87.9758, 86.4574, 84.9718, 83.4954, 81.5302, 80.0756, 78.6374, 77.1782, 75.7888, 73.9522, 72.592, 71.2532, 69.9086, 68.5938, 66.9474, 65.6796, 64.4394, 63.2176, 61.9768, 60.4214, 59.2528, 58.0102, 56.8658, 55.7278, 54.3044, 53.1316, 52.093, 51.0032, 49.9092, 48.6306, 47.5294, 46.5756, 45.6508, 44.662, 43.552, 42.3724, 41.617, 40.5754, 39.7872, 38.8444, 37.7988, 36.8606, 36.2118, 35.3566, 34.4476, 33.5882, 32.6816, 32.0824, 31.0258, 30.6048, 29.4436, 28.7274, 27.957, 27.147, 26.4364, 25.7592, 25.3386, 24.781, 23.8028, 23.656, 22.6544, 21.996, 21.4718, 21.1544, 20.6098, 19.5956, 19.0616, 18.5758, 18.4878, 17.5244, 17.2146, 16.724, 15.8722, 15.5198, 15.0414, 14.941, 14.9048, 13.87, 13.4304, 13.028, 12.4708, 12.37, 12.0624, 11.4668, 11.5532, 11.4352, 11.2564, 10.2744, 10.2118, 9.74720000000002, 10.1456, 9.2928, 8.75040000000001, 8.55279999999999, 8.97899999999998, 8.21019999999999, 8.18340000000001, 7.3494, 7.32499999999999, 7.66140000000001, 6.90300000000002, 7.25439999999998, 6.9042, 7.21499999999997, 6.28640000000001, 6.08139999999997, 6.6764, 6.30099999999999, 5.13900000000001, 5.65800000000002, 5.17320000000001, 4.59019999999998, 4.9538, 5.08280000000002, 4.92200000000003, 4.99020000000002, 4.7328, 5.4538, 4.11360000000002, 4.22340000000003, 4.08780000000002, 3.70800000000003, 4.15559999999999, 4.18520000000001, 3.63720000000001, 3.68220000000002, 3.77960000000002, 3.6078, 2.49160000000001, 3.13099999999997, 2.5376, 3.19880000000001, 3.21100000000001, 2.4502, 3.52820000000003, 2.91199999999998, 3.04480000000001, 2.7432, 2.85239999999999, 2.79880000000003, 2.78579999999999, 1.88679999999999, 2.98860000000002, 2.50639999999999, 1.91239999999999, 2.66160000000002, 2.46820000000002, 1.58199999999999, 1.30399999999997, 2.27379999999999, 2.68939999999998, 1.32900000000001, 3.10599999999999, 1.69080000000002, 2.13740000000001, 2.53219999999999, 1.88479999999998, 1.33240000000001, 1.45119999999997, 1.17899999999997, 
2.44119999999998, 1.60659999999996, 2.16700000000003, 0.77940000000001, 2.37900000000002, 2.06700000000001, 1.46000000000004, 2.91160000000002, 1.69200000000001, 0.954600000000028, 2.49300000000005, 2.2722, 1.33500000000004, 2.44899999999996, 1.20140000000004, 3.07380000000001, 2.09739999999999, 2.85640000000001, 2.29960000000005, 2.40899999999999, 1.97040000000004, 0.809799999999996, 1.65279999999996, 2.59979999999996, 0.95799999999997, 2.06799999999998, 2.32780000000002, 4.20159999999998, 1.96320000000003, 1.86400000000003, 1.42999999999995, 3.77940000000001, 1.27200000000005, 1.86440000000005, 2.20600000000002, 3.21900000000005, 1.5154, 2.61019999999996};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p8{183.2152, 180.2454, 177.2096, 173.6652, 170.6312, 167.6822, 164.249, 161.3296, 158.0038, 155.2074, 152.4612, 149.27, 146.5178, 143.4412, 140.8032, 138.1634, 135.1688, 132.6074, 129.6946, 127.2664, 124.8228, 122.0432, 119.6824, 116.9464, 114.6268, 112.2626, 109.8376, 107.4034, 104.8956, 102.8522, 100.7638, 98.3552, 96.3556, 93.7526, 91.9292, 89.8954, 87.8198, 85.7668, 83.298, 81.6688, 79.9466, 77.9746, 76.1672, 74.3474, 72.3028, 70.8912, 69.114, 67.4646, 65.9744, 64.4092, 62.6022, 60.843, 59.5684, 58.1652, 56.5426, 55.4152, 53.5388, 52.3592, 51.1366, 49.486, 48.3918, 46.5076, 45.509, 44.3834, 43.3498, 42.0668, 40.7346, 40.1228, 38.4528, 37.7, 36.644, 36.0518, 34.5774, 33.9068, 32.432, 32.1666, 30.434, 29.6644, 28.4894, 27.6312, 26.3804, 26.292, 25.5496000000001, 25.0234, 24.8206, 22.6146, 22.4188, 22.117, 20.6762, 20.6576, 19.7864, 19.509, 18.5334, 17.9204, 17.772, 16.2924, 16.8654, 15.1836, 15.745, 15.1316, 15.0386, 14.0136, 13.6342, 12.6196, 12.1866, 12.4281999999999, 11.3324, 10.4794000000001, 11.5038, 10.129, 9.52800000000002, 10.3203999999999, 9.46299999999997, 9.79280000000006, 9.12300000000005, 8.74180000000001, 9.2192, 7.51020000000005, 7.60659999999996, 7.01840000000004, 7.22239999999999, 7.40139999999997, 6.76179999999999, 7.14359999999999, 5.65060000000005, 5.63779999999997, 5.76599999999996, 6.75139999999999, 5.57759999999996, 3.73220000000003, 5.8048, 5.63019999999995, 4.93359999999996, 3.47979999999995, 4.33879999999999, 3.98940000000005, 3.81960000000004, 3.31359999999995, 3.23080000000004, 3.4588, 3.08159999999998, 3.4076, 3.00639999999999, 2.38779999999997, 2.61900000000003, 1.99800000000005, 3.34820000000002, 2.95060000000001, 0.990999999999985, 2.11440000000005, 2.20299999999997, 2.82219999999995, 2.73239999999998, 2.7826, 3.76660000000004, 2.26480000000004, 2.31280000000004, 2.40819999999997, 2.75360000000001, 3.33759999999995, 2.71559999999999, 1.7478000000001, 1.42920000000004, 2.39300000000003, 
2.22779999999989, 2.34339999999997, 0.87259999999992, 3.88400000000001, 1.80600000000004, 1.91759999999999, 1.16779999999994, 1.50320000000011, 2.52500000000009, 0.226400000000012, 2.31500000000005, 0.930000000000064, 1.25199999999995, 2.14959999999996, 0.0407999999999902, 2.5447999999999, 1.32960000000003, 0.197400000000016, 2.52620000000002, 3.33279999999991, -1.34300000000007, 0.422199999999975, 0.917200000000093, 1.12920000000008, 1.46060000000011, 1.45779999999991, 2.8728000000001, 3.33359999999993, -1.34079999999994, 1.57680000000005, 0.363000000000056, 1.40740000000005, 0.656600000000026, 0.801400000000058, -0.454600000000028, 1.51919999999996};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p9{368.0, 361.8294, 355.2452, 348.6698, 342.1464, 336.2024, 329.8782, 323.6598, 317.462, 311.2826, 305.7102, 299.7416, 293.9366, 288.1046, 282.285, 277.0668, 271.306, 265.8448, 260.301, 254.9886, 250.2422, 244.8138, 239.7074, 234.7428, 229.8402, 225.1664, 220.3534, 215.594, 210.6886, 205.7876, 201.65, 197.228, 192.8036, 188.1666, 184.0818, 180.0824, 176.2574, 172.302, 168.1644, 164.0056, 160.3802, 156.7192, 152.5234, 149.2084, 145.831, 142.485, 139.1112, 135.4764, 131.76, 129.3368, 126.5538, 122.5058, 119.2646, 116.5902, 113.3818, 110.8998, 107.9532, 105.2062, 102.2798, 99.4728, 96.9582, 94.3292, 92.171, 89.7809999999999, 87.5716, 84.7048, 82.5322, 79.875, 78.3972, 75.3464, 73.7274, 71.2834, 70.1444, 68.4263999999999, 66.0166, 64.018, 62.0437999999999, 60.3399999999999, 58.6856, 57.9836, 55.0311999999999, 54.6769999999999, 52.3188, 51.4846, 49.4423999999999, 47.739, 46.1487999999999, 44.9202, 43.4059999999999, 42.5342000000001, 41.2834, 38.8954000000001, 38.3286000000001, 36.2146, 36.6684, 35.9946, 33.123, 33.4338, 31.7378000000001, 29.076, 28.9692, 27.4964, 27.0998, 25.9864, 26.7754, 24.3208, 23.4838, 22.7388000000001, 24.0758000000001, 21.9097999999999, 20.9728, 19.9228000000001, 19.9292, 16.617, 17.05, 18.2996000000001, 15.6128000000001, 15.7392, 14.5174, 13.6322, 12.2583999999999, 13.3766000000001, 11.423, 13.1232, 9.51639999999998, 10.5938000000001, 9.59719999999993, 8.12220000000002, 9.76739999999995, 7.50440000000003, 7.56999999999994, 6.70440000000008, 6.41419999999994, 6.71019999999999, 5.60940000000005, 4.65219999999999, 6.84099999999989, 3.4072000000001, 3.97859999999991, 3.32760000000007, 5.52160000000003, 3.31860000000006, 2.06940000000009, 4.35400000000004, 1.57500000000005, 0.280799999999999, 2.12879999999996, -0.214799999999968, -0.0378000000000611, -0.658200000000079, 0.654800000000023, -0.0697999999999865, 0.858400000000074, -2.52700000000004, -2.1751999999999, -3.35539999999992, -1.04019999999991, 
-0.651000000000067, -2.14439999999991, -1.96659999999997, -3.97939999999994, -0.604400000000169, -3.08260000000018, -3.39159999999993, -5.29640000000018, -5.38920000000007, -5.08759999999984, -4.69900000000007, -5.23720000000003, -3.15779999999995, -4.97879999999986, -4.89899999999989, -7.48880000000008, -5.94799999999987, -5.68060000000014, -6.67180000000008, -4.70499999999993, -7.27779999999984, -4.6579999999999, -4.4362000000001, -4.32139999999981, -5.18859999999995, -6.66879999999992, -6.48399999999992, -5.1260000000002, -4.4032000000002, -6.13500000000022, -5.80819999999994, -4.16719999999987, -4.15039999999999, -7.45600000000013, -7.24080000000004, -9.83179999999993, -5.80420000000004, -8.6561999999999, -6.99940000000015, -10.5473999999999, -7.34139999999979, -6.80999999999995, -6.29719999999998, -6.23199999999997};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p10{737.1256, 724.4234, 711.1064, 698.4732, 685.4636, 673.0644, 660.488, 647.9654, 636.0832, 623.7864, 612.1992, 600.2176, 588.5228, 577.1716, 565.7752, 554.899, 543.6126, 532.6492, 521.9474, 511.5214, 501.1064, 490.6364, 480.2468, 470.4588, 460.3832, 451.0584, 440.8606, 431.3868, 422.5062, 413.1862, 404.463, 395.339, 386.1936, 378.1292, 369.1854, 361.2908, 353.3324, 344.8518, 337.5204, 329.4854, 321.9318, 314.552, 306.4658, 299.4256, 292.849, 286.152, 278.8956, 271.8792, 265.118, 258.62, 252.5132, 245.9322, 239.7726, 233.6086, 227.5332, 222.5918, 216.4294, 210.7662, 205.4106, 199.7338, 194.9012, 188.4486, 183.1556, 178.6338, 173.7312, 169.6264, 163.9526, 159.8742, 155.8326, 151.1966, 147.5594, 143.07, 140.037, 134.1804, 131.071, 127.4884, 124.0848, 120.2944, 117.333, 112.9626, 110.2902, 107.0814, 103.0334, 99.4832000000001, 96.3899999999999, 93.7202000000002, 90.1714000000002, 87.2357999999999, 85.9346, 82.8910000000001, 80.0264000000002, 78.3834000000002, 75.1543999999999, 73.8683999999998, 70.9895999999999, 69.4367999999999, 64.8701999999998, 65.0408000000002, 61.6738, 59.5207999999998, 57.0158000000001, 54.2302, 53.0962, 50.4985999999999, 52.2588000000001, 47.3914, 45.6244000000002, 42.8377999999998, 43.0072, 40.6516000000001, 40.2453999999998, 35.2136, 36.4546, 33.7849999999999, 33.2294000000002, 32.4679999999998, 30.8670000000002, 28.6507999999999, 28.9099999999999, 27.5983999999999, 26.1619999999998, 24.5563999999999, 23.2328000000002, 21.9484000000002, 21.5902000000001, 21.3346000000001, 17.7031999999999, 20.6111999999998, 19.5545999999999, 15.7375999999999, 17.0720000000001, 16.9517999999998, 15.326, 13.1817999999998, 14.6925999999999, 13.0859999999998, 13.2754, 10.8697999999999, 11.248, 7.3768, 4.72339999999986, 7.97899999999981, 8.7503999999999, 7.68119999999999, 9.7199999999998, 7.73919999999998, 5.6224000000002, 7.44560000000001, 6.6601999999998, 5.9058, 4.00199999999995, 4.51699999999983, 4.68240000000014, 
3.86220000000003, 5.13639999999987, 5.98500000000013, 2.47719999999981, 2.61999999999989, 1.62800000000016, 4.65000000000009, 0.225599999999758, 0.831000000000131, -0.359400000000278, 1.27599999999984, -2.92559999999958, -0.0303999999996449, 2.37079999999969, -2.0033999999996, 0.804600000000391, 0.30199999999968, 1.1247999999996, -2.6880000000001, 0.0321999999996478, -1.18099999999959, -3.9402, -1.47940000000017, -0.188400000000001, -2.10720000000038, -2.04159999999956, -3.12880000000041, -4.16160000000036, -0.612799999999879, -3.48719999999958, -8.17900000000009, -5.37780000000021, -4.01379999999972, -5.58259999999973, -5.73719999999958, -7.66799999999967, -5.69520000000011, -1.1247999999996, -5.58520000000044, -8.04560000000038, -4.64840000000004, -11.6468000000004, -7.97519999999986, -5.78300000000036, -7.67420000000038, -10.6328000000003, -9.81720000000041};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p11{1476.0, 1449.6014, 1423.5802, 1397.7942, 1372.3042, 1347.2062, 1321.8402, 1297.2292, 1272.9462, 1248.9926, 1225.3026, 1201.4252, 1178.0578, 1155.6092, 1132.626, 1110.5568, 1088.527, 1066.5154, 1045.1874, 1024.3878, 1003.37, 982.1972, 962.5728, 942.1012, 922.9668, 903.292, 884.0772, 864.8578, 846.6562, 828.041, 809.714, 792.3112, 775.1806, 757.9854, 740.656, 724.346, 707.5154, 691.8378, 675.7448, 659.6722, 645.5722, 630.1462, 614.4124, 600.8728, 585.898, 572.408, 558.4926, 544.4938, 531.6776, 517.282, 505.7704, 493.1012, 480.7388, 467.6876, 456.1872, 445.5048, 433.0214, 420.806, 411.409, 400.4144, 389.4294, 379.2286, 369.651, 360.6156, 350.337, 342.083, 332.1538, 322.5094, 315.01, 305.6686, 298.1678, 287.8116, 280.9978, 271.9204, 265.3286, 257.5706, 249.6014, 242.544, 235.5976, 229.583, 220.9438, 214.672, 208.2786, 201.8628, 195.1834, 191.505, 186.1816, 178.5188, 172.2294, 167.8908, 161.0194, 158.052, 151.4588, 148.1596, 143.4344, 138.5238, 133.13, 127.6374, 124.8162, 118.7894, 117.3984, 114.6078, 109.0858, 105.1036, 103.6258, 98.6018000000004, 95.7618000000002, 93.5821999999998, 88.5900000000001, 86.9992000000002, 82.8800000000001, 80.4539999999997, 74.6981999999998, 74.3644000000004, 73.2914000000001, 65.5709999999999, 66.9232000000002, 65.1913999999997, 62.5882000000001, 61.5702000000001, 55.7035999999998, 56.1764000000003, 52.7596000000003, 53.0302000000001, 49.0609999999997, 48.4694, 44.933, 46.0474000000004, 44.7165999999997, 41.9416000000001, 39.9207999999999, 35.6328000000003, 35.5276000000003, 33.1934000000001, 33.2371999999996, 33.3864000000003, 33.9228000000003, 30.2371999999996, 29.1373999999996, 25.2272000000003, 24.2942000000003, 19.8338000000003, 18.9005999999999, 23.0907999999999, 21.8544000000002, 19.5176000000001, 15.4147999999996, 16.9314000000004, 18.6737999999996, 12.9877999999999, 14.3688000000002, 12.0447999999997, 15.5219999999999, 12.5299999999997, 14.5940000000001, 14.3131999999996, 9.45499999999993, 
12.9441999999999, 3.91139999999996, 13.1373999999996, 5.44720000000052, 9.82779999999912, 7.87279999999919, 3.67760000000089, 5.46980000000076, 5.55099999999948, 5.65979999999945, 3.89439999999922, 3.1275999999998, 5.65140000000065, 6.3062000000009, 3.90799999999945, 1.87060000000019, 5.17020000000048, 2.46680000000015, 0.770000000000437, -3.72340000000077, 1.16400000000067, 8.05340000000069, 0.135399999999208, 2.15940000000046, 0.766999999999825, 1.0594000000001, 3.15500000000065, -0.287399999999252, 2.37219999999979, -2.86620000000039, -1.63199999999961, -2.22979999999916, -0.15519999999924, -1.46039999999994, -0.262199999999211, -2.34460000000036, -2.8078000000005, -3.22179999999935, -5.60159999999996, -8.42200000000048, -9.43740000000071, 0.161799999999857, -10.4755999999998, -10.0823999999993};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p12{2953.0, 2900.4782, 2848.3568, 2796.3666, 2745.324, 2694.9598, 2644.648, 2595.539, 2546.1474, 2498.2576, 2450.8376, 2403.6076, 2357.451, 2311.38, 2266.4104, 2221.5638, 2176.9676, 2134.193, 2090.838, 2048.8548, 2007.018, 1966.1742, 1925.4482, 1885.1294, 1846.4776, 1807.4044, 1768.8724, 1731.3732, 1693.4304, 1657.5326, 1621.949, 1586.5532, 1551.7256, 1517.6182, 1483.5186, 1450.4528, 1417.865, 1385.7164, 1352.6828, 1322.6708, 1291.8312, 1260.9036, 1231.476, 1201.8652, 1173.6718, 1145.757, 1119.2072, 1092.2828, 1065.0434, 1038.6264, 1014.3192, 988.5746, 965.0816, 940.1176, 917.9796, 894.5576, 871.1858, 849.9144, 827.1142, 805.0818, 783.9664, 763.9096, 742.0816, 724.3962, 706.3454, 688.018, 667.4214, 650.3106, 633.0686, 613.8094, 597.818, 581.4248, 563.834, 547.363, 531.5066, 520.455400000001, 505.583199999999, 488.366, 476.480799999999, 459.7682, 450.0522, 434.328799999999, 423.952799999999, 408.727000000001, 399.079400000001, 387.252200000001, 373.987999999999, 360.852000000001, 351.6394, 339.642, 330.902400000001, 322.661599999999, 311.662200000001, 301.3254, 291.7484, 279.939200000001, 276.7508, 263.215200000001, 254.811400000001, 245.5494, 242.306399999999, 234.8734, 223.787200000001, 217.7156, 212.0196, 200.793, 195.9748, 189.0702, 182.449199999999, 177.2772, 170.2336, 164.741, 158.613600000001, 155.311, 147.5964, 142.837, 137.3724, 132.0162, 130.0424, 121.9804, 120.451800000001, 114.8968, 111.585999999999, 105.933199999999, 101.705, 98.5141999999996, 95.0488000000005, 89.7880000000005, 91.4750000000004, 83.7764000000006, 80.9698000000008, 72.8574000000008, 73.1615999999995, 67.5838000000003, 62.6263999999992, 63.2638000000006, 66.0977999999996, 52.0843999999997, 58.9956000000002, 47.0912000000008, 46.4956000000002, 48.4383999999991, 47.1082000000006, 43.2392, 37.2759999999998, 40.0283999999992, 35.1864000000005, 35.8595999999998, 32.0998, 28.027, 23.6694000000007, 33.8266000000003, 26.3736000000008, 27.2008000000005, 
21.3245999999999, 26.4115999999995, 23.4521999999997, 19.5013999999992, 19.8513999999996, 10.7492000000002, 18.6424000000006, 13.1265999999996, 18.2436000000016, 6.71860000000015, 3.39459999999963, 6.33759999999893, 7.76719999999841, 0.813999999998487, 3.82819999999992, 0.826199999999517, 8.07440000000133, -1.59080000000176, 5.01780000000144, 0.455399999998917, -0.24199999999837, 0.174800000000687, -9.07640000000174, -4.20160000000033, -3.77520000000004, -4.75179999999818, -5.3724000000002, -8.90680000000066, -6.10239999999976, -5.74120000000039, -9.95339999999851, -3.86339999999836, -13.7304000000004, -16.2710000000006, -7.51359999999841, -3.30679999999847, -13.1339999999982, -10.0551999999989, -6.72019999999975, -8.59660000000076, -10.9307999999983, -1.8775999999998, -4.82259999999951, -13.7788, -21.6470000000008, -10.6735999999983, -15.7799999999988};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p13{5907.5052, 5802.2672, 5697.347, 5593.5794, 5491.2622, 5390.5514, 5290.3376, 5191.6952, 5093.5988, 4997.3552, 4902.5972, 4808.3082, 4715.5646, 4624.109, 4533.8216, 4444.4344, 4356.3802, 4269.2962, 4183.3784, 4098.292, 4014.79, 3932.4574, 3850.6036, 3771.2712, 3691.7708, 3615.099, 3538.1858, 3463.4746, 3388.8496, 3315.6794, 3244.5448, 3173.7516, 3103.3106, 3033.6094, 2966.5642, 2900.794, 2833.7256, 2769.81, 2707.3196, 2644.0778, 2583.9916, 2523.4662, 2464.124, 2406.073, 2347.0362, 2292.1006, 2238.1716, 2182.7514, 2128.4884, 2077.1314, 2025.037, 1975.3756, 1928.933, 1879.311, 1831.0006, 1783.2144, 1738.3096, 1694.5144, 1649.024, 1606.847, 1564.7528, 1525.3168, 1482.5372, 1443.9668, 1406.5074, 1365.867, 1329.2186, 1295.4186, 1257.9716, 1225.339, 1193.2972, 1156.3578, 1125.8686, 1091.187, 1061.4094, 1029.4188, 1000.9126, 972.3272, 944.004199999999, 915.7592, 889.965, 862.834200000001, 840.4254, 812.598399999999, 785.924200000001, 763.050999999999, 741.793799999999, 721.466, 699.040799999999, 677.997200000002, 649.866999999998, 634.911800000002, 609.8694, 591.981599999999, 570.2922, 557.129199999999, 538.3858, 521.872599999999, 502.951400000002, 495.776399999999, 475.171399999999, 459.751, 439.995200000001, 426.708999999999, 413.7016, 402.3868, 387.262599999998, 372.0524, 357.050999999999, 342.5098, 334.849200000001, 322.529399999999, 311.613799999999, 295.848000000002, 289.273000000001, 274.093000000001, 263.329600000001, 251.389599999999, 245.7392, 231.9614, 229.7952, 217.155200000001, 208.9588, 199.016599999999, 190.839199999999, 180.6976, 176.272799999999, 166.976999999999, 162.5252, 151.196400000001, 149.386999999999, 133.981199999998, 130.0586, 130.164000000001, 122.053400000001, 110.7428, 108.1276, 106.232400000001, 100.381600000001, 98.7668000000012, 86.6440000000002, 79.9768000000004, 82.4722000000002, 68.7026000000005, 70.1186000000016, 71.9948000000004, 58.998599999999, 59.0492000000013, 56.9818000000014, 
47.5338000000011, 42.9928, 51.1591999999982, 37.2740000000013, 42.7220000000016, 31.3734000000004, 26.8090000000011, 25.8934000000008, 26.5286000000015, 29.5442000000003, 19.3503999999994, 26.0760000000009, 17.9527999999991, 14.8419999999969, 10.4683999999979, 8.65899999999965, 9.86720000000059, 4.34139999999752, -0.907800000000861, -3.32080000000133, -0.936199999996461, -11.9916000000012, -8.87000000000262, -6.33099999999831, -11.3366000000024, -15.9207999999999, -9.34659999999712, -15.5034000000014, -19.2097999999969, -15.357799999998, -28.2235999999975, -30.6898000000001, -19.3271999999997, -25.6083999999973, -24.409599999999, -13.6385999999984, -33.4473999999973, -32.6949999999997, -28.9063999999998, -31.7483999999968, -32.2935999999972, -35.8329999999987, -47.620600000002, -39.0855999999985, -33.1434000000008, -46.1371999999974, -37.5892000000022, -46.8164000000033, -47.3142000000007, -60.2914000000019, -37.7575999999972};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p14{11816.475, 11605.0046, 11395.3792, 11188.7504, 10984.1814, 10782.0086, 10582.0072, 10384.503, 10189.178, 9996.2738, 9806.0344, 9617.9798, 9431.394, 9248.7784, 9067.6894, 8889.6824, 8712.9134, 8538.8624, 8368.4944, 8197.7956, 8031.8916, 7866.6316, 7703.733, 7544.5726, 7386.204, 7230.666, 7077.8516, 6926.7886, 6778.6902, 6631.9632, 6487.304, 6346.7486, 6206.4408, 6070.202, 5935.2576, 5799.924, 5671.0324, 5541.9788, 5414.6112, 5290.0274, 5166.723, 5047.6906, 4929.162, 4815.1406, 4699.127, 4588.5606, 4477.7394, 4369.4014, 4264.2728, 4155.9224, 4055.581, 3955.505, 3856.9618, 3761.3828, 3666.9702, 3575.7764, 3482.4132, 3395.0186, 3305.8852, 3221.415, 3138.6024, 3056.296, 2970.4494, 2896.1526, 2816.8008, 2740.2156, 2670.497, 2594.1458, 2527.111, 2460.8168, 2387.5114, 2322.9498, 2260.6752, 2194.2686, 2133.7792, 2074.767, 2015.204, 1959.4226, 1898.6502, 1850.006, 1792.849, 1741.4838, 1687.9778, 1638.1322, 1589.3266, 1543.1394, 1496.8266, 1447.8516, 1402.7354, 1361.9606, 1327.0692, 1285.4106, 1241.8112, 1201.6726, 1161.973, 1130.261, 1094.2036, 1048.2036, 1020.6436, 990.901400000002, 961.199800000002, 924.769800000002, 899.526400000002, 872.346400000002, 834.375, 810.432000000001, 780.659800000001, 756.013800000001, 733.479399999997, 707.923999999999, 673.858, 652.222399999999, 636.572399999997, 615.738599999997, 586.696400000001, 564.147199999999, 541.679600000003, 523.943599999999, 505.714599999999, 475.729599999999, 461.779600000002, 449.750800000002, 439.020799999998, 412.7886, 400.245600000002, 383.188199999997, 362.079599999997, 357.533799999997, 334.319000000003, 327.553399999997, 308.559399999998, 291.270199999999, 279.351999999999, 271.791400000002, 252.576999999997, 247.482400000001, 236.174800000001, 218.774599999997, 220.155200000001, 208.794399999999, 201.223599999998, 182.995600000002, 185.5268, 164.547400000003, 176.5962, 150.689599999998, 157.8004, 138.378799999999, 134.021200000003, 117.614399999999, 108.194000000003, 
97.0696000000025, 89.6042000000016, 95.6030000000028, 84.7810000000027, 72.635000000002, 77.3482000000004, 59.4907999999996, 55.5875999999989, 50.7346000000034, 61.3916000000027, 50.9149999999936, 39.0384000000049, 58.9395999999979, 29.633600000001, 28.2032000000036, 26.0078000000067, 17.0387999999948, 9.22000000000116, 13.8387999999977, 8.07240000000456, 14.1549999999988, 15.3570000000036, 3.42660000000615, 6.24820000000182, -2.96940000000177, -8.79940000000352, -5.97860000000219, -14.4048000000039, -3.4143999999942, -13.0148000000045, -11.6977999999945, -25.7878000000055, -22.3185999999987, -24.409599999999, -31.9756000000052, -18.9722000000038, -22.8678000000073, -30.8972000000067, -32.3715999999986, -22.3907999999938, -43.6720000000059, -35.9038, -39.7492000000057, -54.1641999999993, -45.2749999999942, -42.2989999999991, -44.1089999999967, -64.3564000000042, -49.9551999999967, -42.6116000000038};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p15{23634.0036, 23210.8034, 22792.4744, 22379.1524, 21969.7928, 21565.326, 21165.3532, 20770.2806, 20379.9892, 19994.7098, 19613.318, 19236.799, 18865.4382, 18498.8244, 18136.5138, 17778.8668, 17426.2344, 17079.32, 16734.778, 16397.2418, 16063.3324, 15734.0232, 15409.731, 15088.728, 14772.9896, 14464.1402, 14157.5588, 13855.5958, 13559.3296, 13264.9096, 12978.326, 12692.0826, 12413.8816, 12137.3192, 11870.2326, 11602.5554, 11340.3142, 11079.613, 10829.5908, 10583.5466, 10334.0344, 10095.5072, 9859.694, 9625.2822, 9395.7862, 9174.0586, 8957.3164, 8738.064, 8524.155, 8313.7396, 8116.9168, 7913.542, 7718.4778, 7521.65, 7335.5596, 7154.2906, 6968.7396, 6786.3996, 6613.236, 6437.406, 6270.6598, 6107.7958, 5945.7174, 5787.6784, 5635.5784, 5482.308, 5337.9784, 5190.0864, 5045.9158, 4919.1386, 4771.817, 4645.7742, 4518.4774, 4385.5454, 4262.6622, 4142.74679999999, 4015.5318, 3897.9276, 3790.7764, 3685.13800000001, 3573.6274, 3467.9706, 3368.61079999999, 3271.5202, 3170.3848, 3076.4656, 2982.38400000001, 2888.4664, 2806.4868, 2711.9564, 2634.1434, 2551.3204, 2469.7662, 2396.61139999999, 2318.9902, 2243.8658, 2171.9246, 2105.01360000001, 2028.8536, 1960.9952, 1901.4096, 1841.86079999999, 1777.54700000001, 1714.5802, 1654.65059999999, 1596.311, 1546.2016, 1492.3296, 1433.8974, 1383.84600000001, 1339.4152, 1293.5518, 1245.8686, 1193.50659999999, 1162.27959999999, 1107.19439999999, 1069.18060000001, 1035.09179999999, 999.679000000004, 957.679999999993, 925.300199999998, 888.099400000006, 848.638600000006, 818.156400000007, 796.748399999997, 752.139200000005, 725.271200000003, 692.216, 671.633600000001, 647.939799999993, 621.670599999998, 575.398799999995, 561.226599999995, 532.237999999998, 521.787599999996, 483.095799999996, 467.049599999998, 465.286399999997, 415.548599999995, 401.047399999996, 380.607999999993, 377.362599999993, 347.258799999996, 338.371599999999, 310.096999999994, 301.409199999995, 276.280799999993, 265.586800000005, 
258.994399999996, 223.915999999997, 215.925399999993, 213.503800000006, 191.045400000003, 166.718200000003, 166.259000000005, 162.941200000001, 148.829400000002, 141.645999999993, 123.535399999993, 122.329800000007, 89.473399999988, 80.1962000000058, 77.5457999999926, 59.1056000000099, 83.3509999999951, 52.2906000000075, 36.3979999999865, 40.6558000000077, 42.0003999999899, 19.6630000000005, 19.7153999999864, -8.38539999999921, -0.692799999989802, 0.854800000000978, 3.23219999999856, -3.89040000000386, -5.25880000001052, -24.9052000000083, -22.6837999999989, -26.4286000000138, -34.997000000003, -37.0216000000073, -43.430400000012, -58.2390000000014, -68.8034000000043, -56.9245999999985, -57.8583999999973, -77.3097999999882, -73.2793999999994, -81.0738000000129, -87.4530000000086, -65.0254000000132, -57.296399999992, -96.2746000000043, -103.25, -96.081600000005, -91.5542000000132, -102.465200000006, -107.688599999994, -101.458000000013, -109.715800000005};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p16{47270.0, 46423.3584, 45585.7074, 44757.152, 43938.8416, 43130.9514, 42330.03, 41540.407, 40759.6348, 39988.206, 39226.5144, 38473.2096, 37729.795, 36997.268, 36272.6448, 35558.665, 34853.0248, 34157.4472, 33470.5204, 32793.5742, 32127.0194, 31469.4182, 30817.6136, 30178.6968, 29546.8908, 28922.8544, 28312.271, 27707.0924, 27114.0326, 26526.692, 25948.6336, 25383.7826, 24823.5998, 24272.2974, 23732.2572, 23201.4976, 22674.2796, 22163.6336, 21656.515, 21161.7362, 20669.9368, 20189.4424, 19717.3358, 19256.3744, 18795.9638, 18352.197, 17908.5738, 17474.391, 17052.918, 16637.2236, 16228.4602, 15823.3474, 15428.6974, 15043.0284, 14667.6278, 14297.4588, 13935.2882, 13578.5402, 13234.6032, 12882.1578, 12548.0728, 12219.231, 11898.0072, 11587.2626, 11279.9072, 10973.5048, 10678.5186, 10392.4876, 10105.2556, 9825.766, 9562.5444, 9294.2222, 9038.2352, 8784.848, 8533.2644, 8301.7776, 8058.30859999999, 7822.94579999999, 7599.11319999999, 7366.90779999999, 7161.217, 6957.53080000001, 6736.212, 6548.21220000001, 6343.06839999999, 6156.28719999999, 5975.15419999999, 5791.75719999999, 5621.32019999999, 5451.66, 5287.61040000001, 5118.09479999999, 4957.288, 4798.4246, 4662.17559999999, 4512.05900000001, 4364.68539999999, 4220.77720000001, 4082.67259999999, 3957.19519999999, 3842.15779999999, 3699.3328, 3583.01180000001, 3473.8964, 3338.66639999999, 3233.55559999999, 3117.799, 3008.111, 2909.69140000001, 2814.86499999999, 2719.46119999999, 2624.742, 2532.46979999999, 2444.7886, 2370.1868, 2272.45259999999, 2196.19260000001, 2117.90419999999, 2023.2972, 1969.76819999999, 1885.58979999999, 1833.2824, 1733.91200000001, 1682.54920000001, 1604.57980000001, 1556.11240000001, 1491.3064, 1421.71960000001, 1371.22899999999, 1322.1324, 1264.7892, 1196.23920000001, 1143.8474, 1088.67240000001, 1073.60380000001, 1023.11660000001, 959.036400000012, 927.433199999999, 906.792799999996, 853.433599999989, 841.873800000001, 791.1054, 756.899999999994, 
704.343200000003, 672.495599999995, 622.790399999998, 611.254799999995, 567.283200000005, 519.406599999988, 519.188400000014, 495.312800000014, 451.350799999986, 443.973399999988, 431.882199999993, 392.027000000002, 380.924200000009, 345.128999999986, 298.901400000002, 287.771999999997, 272.625, 247.253000000026, 222.490600000019, 223.590000000026, 196.407599999977, 176.425999999978, 134.725199999986, 132.4804, 110.445599999977, 86.7939999999944, 56.7038000000175, 64.915399999998, 38.3726000000024, 37.1606000000029, 46.170999999973, 49.1716000000015, 15.3362000000197, 6.71639999997569, -34.8185999999987, -39.4476000000141, 12.6830000000191, -12.3331999999937, -50.6565999999875, -59.9538000000175, -65.1054000000004, -70.7576000000117, -106.325200000021, -126.852200000023, -110.227599999984, -132.885999999999, -113.897200000007, -142.713800000027, -151.145399999979, -150.799200000009, -177.756200000003, -156.036399999983, -182.735199999996, -177.259399999981, -198.663600000029, -174.577600000019, -193.84580000001};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p17{94541.0, 92848.811, 91174.019, 89517.558, 87879.9705, 86262.7565, 84663.5125, 83083.7435, 81521.7865, 79977.272, 78455.9465, 76950.219, 75465.432, 73994.152, 72546.71, 71115.2345, 69705.6765, 68314.937, 66944.2705, 65591.255, 64252.9485, 62938.016, 61636.8225, 60355.592, 59092.789, 57850.568, 56624.518, 55417.343, 54231.1415, 53067.387, 51903.526, 50774.649, 49657.6415, 48561.05, 47475.7575, 46410.159, 45364.852, 44327.053, 43318.4005, 42325.6165, 41348.4595, 40383.6265, 39436.77, 38509.502, 37594.035, 36695.939, 35818.6895, 34955.691, 34115.8095, 33293.949, 32465.0775, 31657.6715, 30877.2585, 30093.78, 29351.3695, 28594.1365, 27872.115, 27168.7465, 26477.076, 25774.541, 25106.5375, 24452.5135, 23815.5125, 23174.0655, 22555.2685, 21960.2065, 21376.3555, 20785.1925, 20211.517, 19657.0725, 19141.6865, 18579.737, 18081.3955, 17578.995, 17073.44, 16608.335, 16119.911, 15651.266, 15194.583, 14749.0495, 14343.4835, 13925.639, 13504.509, 13099.3885, 12691.2855, 12328.018, 11969.0345, 11596.5145, 11245.6355, 10917.6575, 10580.9785, 10277.8605, 9926.58100000001, 9605.538, 9300.42950000003, 8989.97850000003, 8728.73249999998, 8448.3235, 8175.31050000002, 7898.98700000002, 7629.79100000003, 7413.76199999999, 7149.92300000001, 6921.12650000001, 6677.1545, 6443.28000000003, 6278.23450000002, 6014.20049999998, 5791.20299999998, 5605.78450000001, 5438.48800000001, 5234.2255, 5059.6825, 4887.43349999998, 4682.935, 4496.31099999999, 4322.52250000002, 4191.42499999999, 4021.24200000003, 3900.64799999999, 3762.84250000003, 3609.98050000001, 3502.29599999997, 3363.84250000003, 3206.54849999998, 3079.70000000001, 2971.42300000001, 2867.80349999998, 2727.08100000001, 2630.74900000001, 2496.6165, 2440.902, 2356.19150000002, 2235.58199999999, 2120.54149999999, 2012.25449999998, 1933.35600000003, 1820.93099999998, 1761.54800000001, 1663.09350000002, 1578.84600000002, 1509.48149999999, 1427.3345, 1379.56150000001, 1306.68099999998, 1212.63449999999, 
1084.17300000001, 1124.16450000001, 1060.69949999999, 1007.48849999998, 941.194499999983, 879.880500000028, 836.007500000007, 782.802000000025, 748.385499999975, 647.991500000004, 626.730500000005, 570.776000000013, 484.000500000024, 513.98550000001, 418.985499999952, 386.996999999974, 370.026500000036, 355.496999999974, 356.731499999994, 255.92200000002, 259.094000000041, 205.434499999974, 165.374500000034, 197.347500000033, 95.718499999959, 67.6165000000037, 54.6970000000438, 31.7395000000251, -15.8784999999916, 8.42500000004657, -26.3754999999655, -118.425500000012, -66.6629999999423, -42.9745000000112, -107.364999999991, -189.839000000036, -162.611499999999, -164.964999999967, -189.079999999958, -223.931499999948, -235.329999999958, -269.639500000048, -249.087999999989, -206.475499999942, -283.04449999996, -290.667000000016, -304.561499999953, -336.784499999951, -380.386500000022, -283.280499999993, -364.533000000054, -389.059499999974, -364.454000000027, -415.748000000021, -417.155000000028};
+CUCO_HLL_TUNING_ARR_DECL bias_data_p18{189083.0, 185696.913, 182348.774, 179035.946, 175762.762, 172526.444, 169329.754, 166166.099, 163043.269, 159958.91, 156907.912, 153906.845, 150924.199, 147996.568, 145093.457, 142239.233, 139421.475, 136632.27, 133889.588, 131174.2, 128511.619, 125868.621, 123265.385, 120721.061, 118181.769, 115709.456, 113252.446, 110840.198, 108465.099, 106126.164, 103823.469, 101556.618, 99308.004, 97124.508, 94937.803, 92833.731, 90745.061, 88677.627, 86617.47, 84650.442, 82697.833, 80769.132, 78879.629, 77014.432, 75215.626, 73384.587, 71652.482, 69895.93, 68209.301, 66553.669, 64921.981, 63310.323, 61742.115, 60205.018, 58698.658, 57190.657, 55760.865, 54331.169, 52908.167, 51550.273, 50225.254, 48922.421, 47614.533, 46362.049, 45098.569, 43926.083, 42736.03, 41593.473, 40425.26, 39316.237, 38243.651, 37170.617, 36114.609, 35084.19, 34117.233, 33206.509, 32231.505, 31318.728, 30403.404, 29540.0550000001, 28679.236, 27825.862, 26965.216, 26179.148, 25462.08, 24645.952, 23922.523, 23198.144, 22529.128, 21762.4179999999, 21134.779, 20459.117, 19840.818, 19187.04, 18636.3689999999, 17982.831, 17439.7389999999, 16874.547, 16358.2169999999, 15835.684, 15352.914, 14823.681, 14329.313, 13816.897, 13342.874, 12880.882, 12491.648, 12021.254, 11625.392, 11293.7610000001, 10813.697, 10456.209, 10099.074, 9755.39000000001, 9393.18500000006, 9047.57900000003, 8657.98499999999, 8395.85900000005, 8033.0, 7736.95900000003, 7430.59699999995, 7258.47699999996, 6924.58200000005, 6691.29399999999, 6357.92500000005, 6202.05700000003, 5921.19700000004, 5628.28399999999, 5404.96799999999, 5226.71100000001, 4990.75600000005, 4799.77399999998, 4622.93099999998, 4472.478, 4171.78700000001, 3957.46299999999, 3868.95200000005, 3691.14300000004, 3474.63100000005, 3341.67200000002, 3109.14000000001, 3071.97400000005, 2796.40399999998, 2756.17799999996, 2611.46999999997, 2471.93000000005, 2382.26399999997, 2209.22400000005, 2142.28399999999, 2013.96100000001, 
1911.18999999994, 1818.27099999995, 1668.47900000005, 1519.65800000005, 1469.67599999998, 1367.13800000004, 1248.52899999998, 1181.23600000003, 1022.71900000004, 1088.20700000005, 959.03600000008, 876.095999999903, 791.183999999892, 703.337000000058, 731.949999999953, 586.86400000006, 526.024999999907, 323.004999999888, 320.448000000091, 340.672999999952, 309.638999999966, 216.601999999955, 102.922999999952, 19.2399999999907, -0.114000000059605, -32.6240000000689, -89.3179999999702, -153.497999999905, -64.2970000000205, -143.695999999996, -259.497999999905, -253.017999999924, -213.948000000091, -397.590000000084, -434.006000000052, -403.475000000093, -297.958000000101, -404.317000000039, -528.898999999976, -506.621000000043, -513.205000000075, -479.351000000024, -596.139999999898, -527.016999999993, -664.681000000099, -680.306000000099, -704.050000000047, -850.486000000034, -757.43200000003, -713.308999999892};
 
 // Meta array storing interpolation points for biases for Precision=4..18
-__device__ static cuda::std::array constexpr bias_data{bias_data_p4.data(),
-                                                       bias_data_p5.data(),
-                                                       bias_data_p6.data(),
-                                                       bias_data_p7.data(),
-                                                       bias_data_p8.data(),
-                                                       bias_data_p9.data(),
-                                                       bias_data_p10.data(),
-                                                       bias_data_p11.data(),
-                                                       bias_data_p12.data(),
-                                                       bias_data_p13.data(),
-                                                       bias_data_p14.data(),
-                                                       bias_data_p15.data(),
-                                                       bias_data_p16.data(),
-                                                       bias_data_p17.data(),
-                                                       bias_data_p18.data()};
+CUCO_HLL_TUNING_ARR_DECL bias_data{bias_data_p4.data(), bias_data_p5.data(), bias_data_p6.data(), bias_data_p7.data(), bias_data_p8.data(), bias_data_p9.data(), bias_data_p10.data(), bias_data_p11.data(), bias_data_p12.data(), bias_data_p13.data(), bias_data_p14.data(), bias_data_p15.data(), bias_data_p16.data(), bias_data_p17.data(), bias_data_p18.data()};
+// clang-format on
 
 }  // namespace cuco::hyperloglog_ns::detail
\ No newline at end of file

From 799284e6940024295a4d04a37dae0889919d9afe Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 1 Feb 2024 00:49:41 +0000
Subject: [PATCH 13/78] Implement HLL++ bias correction step

---
 include/cuco/detail/hyperloglog/finalizer.cuh |  77 ++++++++++++-
 include/cuco/detail/hyperloglog/tuning.cuh    | 102 ++++++++++++++++--
 2 files changed, 168 insertions(+), 11 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/finalizer.cuh b/include/cuco/detail/hyperloglog/finalizer.cuh
index 3aca44fdf..cd69ffa0a 100644
--- a/include/cuco/detail/hyperloglog/finalizer.cuh
+++ b/include/cuco/detail/hyperloglog/finalizer.cuh
@@ -17,7 +17,9 @@
 
 #include <cuco/detail/hyperloglog/tuning.cuh>
 
+#include <cstddef>
 #include <cuda/std/cmath>
+#include <cuda/std/limits>
 
 namespace cuco::hyperloglog_ns::detail {
 
@@ -31,6 +33,9 @@ namespace cuco::hyperloglog_ns::detail {
  */
 template <int32_t Precision>
 class finalizer {
+  // Note: Most of the types in this implementation are explicit instead of relying on `auto` to
+  // avoid confusion with the reference implementation.
+
   // this minimum number of registers is required by HLL++
   static_assert(Precision >= 4, "Precision must be greater or equal to 4");
 
@@ -43,11 +48,9 @@ class finalizer {
    *
    * @return Bias-corrected cardinality estimate
    */
-  __host__ __device__ static double constexpr finalize(double z, int v) noexcept
+  __host__ __device__ static std::size_t constexpr finalize(double z, int v) noexcept
   {
     auto e = alpha_mm() / z;
-    // TODO remove test code
-    // printf("raw e: %lf\n", e);
 
     if (v > 0) {
       // Use linear counting for small cardinality estimates.
@@ -68,6 +71,7 @@ class finalizer {
 
  private:
   static auto constexpr m = (1 << Precision);  ///< Number of registers
+  static auto constexpr k = 6;                 ///< Number of interpolation points to consider
 
   __host__ __device__ static double constexpr alpha_mm() noexcept
   {
@@ -90,7 +94,70 @@ class finalizer {
     return e;
   }
 
-  // TODO implement HLL++ bias correction
-  __host__ __device__ static double constexpr bias(double e) noexcept { return e * 0; }
+  /**
+   * @brief Estimates the bias of the raw estimate `e` as the mean bias over a window of (up to)
+   * `k` consecutive interpolation points located around `e`.
+   */
+  __host__ __device__ static double constexpr bias(double e) noexcept
+  {
+    int const num_points = raw_estimate_data<Precision>().size();
+
+    // Initial window of (up to) `k` points that ends at the anchor point, unless clamped at 0
+    int first = cuda::std::max(interpolation_anchor_index(e) - k + 1, 0);
+    int last  = cuda::std::min(first + k, num_points);  // exclusive upper bound
+    // Shift the window upwards while the point right behind it is closer to `e` than its first
+    // point
+    for (; last < num_points and distance(e, last) < distance(e, first); ++first, ++last) {}
+
+    auto const& biases = bias_data<Precision>();
+    double total       = 0.0;
+    int idx            = first;
+    while (idx < last) { total += biases[idx++]; }
+
+    return total / (last - first);
+  }
+
+  __host__ __device__ static double distance(double e, int i) noexcept
+  {
+    auto const delta = e - raw_estimate_data<Precision>()[i];  // signed gap to the i-th point
+    return delta * delta;                                      // squared distance
+  }
+
+  /**
+   * @brief Returns the index of the interpolation point closest to the raw estimate `e`.
+   *
+   * Uses a binary search, i.e., `raw_estimate_data<Precision>()` must be sorted ascending.
+   * For a non-empty table the result is a valid index, even if `e` is out of range or infinite.
+   */
+  __host__ __device__ static int interpolation_anchor_index(double e) noexcept
+  {
+    auto const& estimates = raw_estimate_data<Precision>();
+    int const n           = static_cast<int>(estimates.size());
+    int left              = 0;
+    int right             = n - 1;
+
+    while (left <= right) {
+      int const mid = left + (right - left) / 2;
+
+      if (estimates[mid] < e) {
+        left = mid + 1;
+      } else if (estimates[mid] > e) {
+        right = mid - 1;
+      } else {
+        return mid;  // Neither smaller nor larger: exact match, no need to look further
+      }
+    }
+
+    // 'left' is now the insertion point of 'e'. If it sits at either end of the table, there is
+    // only one neighbor to pick. Checking this explicitly (instead of comparing against a
+    // `numeric_limits<double>::max()` sentinel) keeps the result in range for infinite 'e'.
+    if (left <= 0) { return 0; }
+    if (left >= n) { return n - 1; }
+
+    // Otherwise pick the closer one of the two neighbors; ties go to the lower neighbor.
+    double const dist_upper = cuda::std::abs(estimates[left] - e);
+    double const dist_lower = cuda::std::abs(estimates[left - 1] - e);
+    return (dist_upper < dist_lower) ? left : left - 1;
+  }
 };
 }  // namespace cuco::hyperloglog_ns::detail
\ No newline at end of file
diff --git a/include/cuco/detail/hyperloglog/tuning.cuh b/include/cuco/detail/hyperloglog/tuning.cuh
index 4d4a69067..c10ef6950 100644
--- a/include/cuco/detail/hyperloglog/tuning.cuh
+++ b/include/cuco/detail/hyperloglog/tuning.cuh
@@ -20,10 +20,9 @@
 namespace cuco::hyperloglog_ns::detail {
 
 // TODO this will spawn one copy of each array in every TU :(
-// TODO use float instead of double?
 // TODO use __constant__?
 #ifndef CUCO_HLL_TUNING_ARR_DECL
-#define CUCO_HLL_TUNING_ARR_DECL __device__ static cuda::std::array constexpr
+#define CUCO_HLL_TUNING_ARR_DECL __device__ static constexpr cuda::std::array
 #endif
 
 // clang-format off
@@ -48,8 +47,54 @@ CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p16{47271.0, 48062.3584, 48862.7074,
 CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p17{94542.0, 96125.811, 97728.019, 99348.558, 100987.9705, 102646.7565, 104324.5125, 106021.7435, 107736.7865, 109469.272, 111223.9465, 112995.219, 114787.432, 116593.152, 118422.71, 120267.2345, 122134.6765, 124020.937, 125927.2705, 127851.255, 129788.9485, 131751.016, 133726.8225, 135722.592, 137736.789, 139770.568, 141821.518, 143891.343, 145982.1415, 148095.387, 150207.526, 152355.649, 154515.6415, 156696.05, 158887.7575, 161098.159, 163329.852, 165569.053, 167837.4005, 170121.6165, 172420.4595, 174732.6265, 177062.77, 179412.502, 181774.035, 184151.939, 186551.6895, 188965.691, 191402.8095, 193857.949, 196305.0775, 198774.6715, 201271.2585, 203764.78, 206299.3695, 208818.1365, 211373.115, 213946.7465, 216532.076, 219105.541, 221714.5375, 224337.5135, 226977.5125, 229613.0655, 232270.2685, 234952.2065, 237645.3555, 240331.1925, 243034.517, 245756.0725, 248517.6865, 251232.737, 254011.3955, 256785.995, 259556.44, 262368.335, 265156.911, 267965.266, 270785.583, 273616.0495, 276487.4835, 279346.639, 282202.509, 285074.3885, 287942.2855, 290856.018, 293774.0345, 296678.5145, 299603.6355, 302552.6575, 305492.9785, 308466.8605, 311392.581, 314347.538, 317319.4295, 320285.9785, 323301.7325, 326298.3235, 329301.3105, 332301.987, 335309.791, 338370.762, 341382.923, 344431.1265, 347464.1545, 350507.28, 353619.2345, 356631.2005, 359685.203, 362776.7845, 365886.488, 368958.2255, 372060.6825, 375165.4335, 378237.935, 381328.311, 384430.5225, 387576.425, 390683.242, 393839.648, 396977.8425, 400101.9805, 403271.296, 406409.8425, 409529.5485, 412678.7, 415847.423, 419020.8035, 422157.081, 425337.749, 428479.6165, 431700.902, 434893.1915, 438049.582, 441210.5415, 444379.2545, 447577.356, 450741.931, 453959.548, 457137.0935, 460329.846, 463537.4815, 466732.3345, 469960.5615, 473164.681, 476347.6345, 479496.173, 482813.1645, 486025.6995, 489249.4885, 492460.1945, 495675.8805, 498908.0075, 502131.802, 505374.3855, 508550.9915, 
511806.7305, 515026.776, 518217.0005, 521523.9855, 524705.9855, 527950.997, 531210.0265, 534472.497, 537750.7315, 540926.922, 544207.094, 547429.4345, 550666.3745, 553975.3475, 557150.7185, 560399.6165, 563662.697, 566916.7395, 570146.1215, 573447.425, 576689.6245, 579874.5745, 583202.337, 586503.0255, 589715.635, 592910.161, 596214.3885, 599488.035, 602740.92, 605983.0685, 609248.67, 612491.3605, 615787.912, 619107.5245, 622307.9555, 625577.333, 628840.4385, 632085.2155, 635317.6135, 638691.7195, 641887.467, 645139.9405, 648441.546, 651666.252, 654941.845};
 CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p18{189084.0, 192250.913, 195456.774, 198696.946, 201977.762, 205294.444, 208651.754, 212042.099, 215472.269, 218941.91, 222443.912, 225996.845, 229568.199, 233193.568, 236844.457, 240543.233, 244279.475, 248044.27, 251854.588, 255693.2, 259583.619, 263494.621, 267445.385, 271454.061, 275468.769, 279549.456, 283646.446, 287788.198, 291966.099, 296181.164, 300431.469, 304718.618, 309024.004, 313393.508, 317760.803, 322209.731, 326675.061, 331160.627, 335654.47, 340241.442, 344841.833, 349467.132, 354130.629, 358819.432, 363574.626, 368296.587, 373118.482, 377914.93, 382782.301, 387680.669, 392601.981, 397544.323, 402529.115, 407546.018, 412593.658, 417638.657, 422762.865, 427886.169, 433017.167, 438213.273, 443441.254, 448692.421, 453937.533, 459239.049, 464529.569, 469910.083, 475274.03, 480684.473, 486070.26, 491515.237, 496995.651, 502476.617, 507973.609, 513497.19, 519083.233, 524726.509, 530305.505, 535945.728, 541584.404, 547274.055, 552967.236, 558667.862, 564360.216, 570128.148, 575965.08, 581701.952, 587532.523, 593361.144, 599246.128, 605033.418, 610958.779, 616837.117, 622772.818, 628672.04, 634675.369, 640574.831, 646585.739, 652574.547, 658611.217, 664642.684, 670713.914, 676737.681, 682797.313, 688837.897, 694917.874, 701009.882, 707173.648, 713257.254, 719415.392, 725636.761, 731710.697, 737906.209, 744103.074, 750313.39, 756504.185, 762712.579, 768876.985, 775167.859, 781359.0, 787615.959, 793863.597, 800245.477, 806464.582, 812785.294, 819005.925, 825403.057, 831676.197, 837936.284, 844266.968, 850642.711, 856959.756, 863322.774, 869699.931, 876102.478, 882355.787, 888694.463, 895159.952, 901536.143, 907872.631, 914293.672, 920615.14, 927130.974, 933409.404, 939922.178, 946331.47, 952745.93, 959209.264, 965590.224, 972077.284, 978501.961, 984953.19, 991413.271, 997817.479, 1004222.658, 1010725.676, 1017177.138, 1023612.529, 1030098.236, 1036493.719, 1043112.207, 1049537.036, 1056008.096, 1062476.184, 
1068942.337, 1075524.95, 1081932.864, 1088426.025, 1094776.005, 1101327.448, 1107901.673, 1114423.639, 1120884.602, 1127324.923, 1133794.24, 1140328.886, 1146849.376, 1153346.682, 1159836.502, 1166478.703, 1172953.304, 1179391.502, 1185950.982, 1192544.052, 1198913.41, 1205430.994, 1212015.525, 1218674.042, 1225121.683, 1231551.101, 1238126.379, 1244673.795, 1251260.649, 1257697.86, 1264320.983, 1270736.319, 1277274.694, 1283804.95, 1290211.514, 1296858.568, 1303455.691};
 
-// Meta array storing interpolation points for estimates for Precision=4..18
-CUCO_HLL_TUNING_ARR_DECL raw_estimate_data{raw_estimate_data_p4.data(), raw_estimate_data_p5.data(), raw_estimate_data_p6.data(), raw_estimate_data_p7.data(), raw_estimate_data_p8.data(), raw_estimate_data_p9.data(), raw_estimate_data_p10.data(), raw_estimate_data_p11.data(), raw_estimate_data_p12.data(), raw_estimate_data_p13.data(), raw_estimate_data_p14.data(), raw_estimate_data_p15.data(), raw_estimate_data_p16.data(), raw_estimate_data_p17.data(), raw_estimate_data_p18.data()};
+// Helpers selecting the array for a given precision (`inline` avoids ODR clashes across TUs)
+template <int32_t P>
+__host__ __device__ auto const& raw_estimate_data() noexcept;
+
+template <>
+__host__ __device__ inline auto const& raw_estimate_data<4>() noexcept { return raw_estimate_data_p4; }
+
+template <>
+__host__ __device__ inline auto const& raw_estimate_data<5>() noexcept { return raw_estimate_data_p5; }
+
+template <>
+__host__ __device__ inline auto const& raw_estimate_data<6>() noexcept { return raw_estimate_data_p6; }
+
+template <>
+__host__ __device__ inline auto const& raw_estimate_data<7>() noexcept { return raw_estimate_data_p7; }
+
+template <>
+__host__ __device__ inline auto const& raw_estimate_data<8>() noexcept { return raw_estimate_data_p8; }
+
+template <>
+__host__ __device__ inline auto const& raw_estimate_data<9>() noexcept { return raw_estimate_data_p9; }
+
+template <>
+__host__ __device__ inline auto const& raw_estimate_data<10>() noexcept { return raw_estimate_data_p10; }
+
+template <>
+__host__ __device__ inline auto const& raw_estimate_data<11>() noexcept { return raw_estimate_data_p11; }
+
+template <>
+__host__ __device__ inline auto const& raw_estimate_data<12>() noexcept { return raw_estimate_data_p12; }
+
+template <>
+__host__ __device__ inline auto const& raw_estimate_data<13>() noexcept { return raw_estimate_data_p13; }
+
+template <>
+__host__ __device__ inline auto const& raw_estimate_data<14>() noexcept { return raw_estimate_data_p14; }
+
+template <>
+__host__ __device__ inline auto const& raw_estimate_data<15>() noexcept { return raw_estimate_data_p15; }
+
+template <>
+__host__ __device__ inline auto const& raw_estimate_data<16>() noexcept { return raw_estimate_data_p16; }
+
+template <>
+__host__ __device__ inline auto const& raw_estimate_data<17>() noexcept { return raw_estimate_data_p17; }
+
+template <>
+__host__ __device__ inline auto const& raw_estimate_data<18>() noexcept { return raw_estimate_data_p18; }
 
 CUCO_HLL_TUNING_ARR_DECL bias_data_p4{10.0, 9.717, 9.207, 8.7896, 8.2882, 7.8204, 7.3772, 6.9342, 6.5202, 6.161, 5.7722, 5.4636, 5.0396, 4.6766, 4.3566, 4.0454, 3.7936, 3.4856, 3.2666, 2.9946, 2.766, 2.4692, 2.3638, 2.0764, 1.7864, 1.7602, 1.4814, 1.433, 1.2926, 1.0664, 0.999600000000001, 0.7956, 0.5366, 0.589399999999998, 0.573799999999999, 0.269799999999996, 0.368200000000002, 0.0544000000000011, 0.234200000000001, 0.0108000000000033, -0.203400000000002, -0.0701999999999998, -0.129600000000003, -0.364199999999997, -0.480600000000003, -0.226999999999997, -0.322800000000001, -0.382599999999996, -0.511200000000002, -0.669600000000003, -0.749400000000001, -0.500399999999999, -0.617600000000003, -0.6922, -0.601599999999998, -0.416200000000003, -0.338200000000001, -0.782600000000002, -0.648600000000002, -0.919800000000002, -0.851799999999997, -0.962400000000002, -0.6402, -1.1922, -1.0256, -1.086, -1.21899999999999, -0.819400000000002, -0.940600000000003, -1.1554, -1.2072, -1.1752, -1.16759999999999, -1.14019999999999, -1.3754, -1.29859999999999, -1.607, -1.3292, -1.7606};
 CUCO_HLL_TUNING_ARR_DECL bias_data_p5{22.0, 21.1194, 20.8208, 20.2318, 19.77, 19.2436, 18.7774, 18.2848, 17.8224, 17.3742, 16.9336, 16.503, 16.0494, 15.6292, 15.2124, 14.798, 14.367, 13.9728, 13.5944, 13.217, 12.8438, 12.3696, 12.0956, 11.7044, 11.324, 11.0668, 10.6698, 10.3644, 10.049, 9.6918, 9.4146, 9.082, 8.687, 8.5398, 8.2462, 7.857, 7.6606, 7.4168, 7.1248, 6.9222, 6.6804, 6.447, 6.3454, 5.9594, 5.7636, 5.5776, 5.331, 5.19, 4.9676, 4.7564, 4.5314, 4.4442, 4.3708, 3.9774, 3.9624, 3.8796, 3.755, 3.472, 3.2076, 3.1024, 2.8908, 2.7338, 2.7728, 2.629, 2.413, 2.3266, 2.1524, 2.2642, 2.1806, 2.0566, 1.9192, 1.7598, 1.3516, 1.5802, 1.43859999999999, 1.49160000000001, 1.1524, 1.1892, 0.841399999999993, 0.879800000000003, 0.837599999999995, 0.469800000000006, 0.765600000000006, 0.331000000000003, 0.591399999999993, 0.601200000000006, 0.701599999999999, 0.558199999999999, 0.339399999999998, 0.354399999999998, 0.491200000000006, 0.308000000000007, 0.355199999999996, -0.0254000000000048, 0.205200000000005, -0.272999999999996, 0.132199999999997, 0.394400000000005, -0.241200000000006, 0.242000000000004, 0.191400000000002, 0.253799999999998, -0.122399999999999, -0.370800000000003, 0.193200000000004, -0.0848000000000013, 0.0867999999999967, -0.327200000000005, -0.285600000000002, 0.311400000000006, -0.128399999999999, -0.754999999999995, -0.209199999999996, -0.293599999999998, -0.364000000000004, -0.253600000000006, -0.821200000000005, -0.253600000000006, -0.510400000000004, -0.383399999999995, -0.491799999999998, -0.220200000000006, -0.0972000000000008, -0.557400000000001, -0.114599999999996, -0.295000000000002, -0.534800000000004, 0.346399999999988, -0.65379999999999, 0.0398000000000138, 0.0341999999999985, -0.995800000000003, -0.523400000000009, -0.489000000000004, -0.274799999999999, -0.574999999999989, -0.482799999999997, 0.0571999999999946, -0.330600000000004, -0.628800000000012, -0.140199999999993, -0.540600000000012, -0.445999999999998, -0.599400000000003, 
-0.262599999999992, 0.163399999999996, -0.100599999999986, -0.39500000000001, -1.06960000000001, -0.836399999999998, -0.753199999999993, -0.412399999999991, -0.790400000000005, -0.29679999999999, -0.28540000000001, -0.193000000000012, -0.0772000000000048, -0.962799999999987, -0.414800000000014};
@@ -67,8 +112,53 @@ CUCO_HLL_TUNING_ARR_DECL bias_data_p16{47270.0, 46423.3584, 45585.7074, 44757.15
 CUCO_HLL_TUNING_ARR_DECL bias_data_p17{94541.0, 92848.811, 91174.019, 89517.558, 87879.9705, 86262.7565, 84663.5125, 83083.7435, 81521.7865, 79977.272, 78455.9465, 76950.219, 75465.432, 73994.152, 72546.71, 71115.2345, 69705.6765, 68314.937, 66944.2705, 65591.255, 64252.9485, 62938.016, 61636.8225, 60355.592, 59092.789, 57850.568, 56624.518, 55417.343, 54231.1415, 53067.387, 51903.526, 50774.649, 49657.6415, 48561.05, 47475.7575, 46410.159, 45364.852, 44327.053, 43318.4005, 42325.6165, 41348.4595, 40383.6265, 39436.77, 38509.502, 37594.035, 36695.939, 35818.6895, 34955.691, 34115.8095, 33293.949, 32465.0775, 31657.6715, 30877.2585, 30093.78, 29351.3695, 28594.1365, 27872.115, 27168.7465, 26477.076, 25774.541, 25106.5375, 24452.5135, 23815.5125, 23174.0655, 22555.2685, 21960.2065, 21376.3555, 20785.1925, 20211.517, 19657.0725, 19141.6865, 18579.737, 18081.3955, 17578.995, 17073.44, 16608.335, 16119.911, 15651.266, 15194.583, 14749.0495, 14343.4835, 13925.639, 13504.509, 13099.3885, 12691.2855, 12328.018, 11969.0345, 11596.5145, 11245.6355, 10917.6575, 10580.9785, 10277.8605, 9926.58100000001, 9605.538, 9300.42950000003, 8989.97850000003, 8728.73249999998, 8448.3235, 8175.31050000002, 7898.98700000002, 7629.79100000003, 7413.76199999999, 7149.92300000001, 6921.12650000001, 6677.1545, 6443.28000000003, 6278.23450000002, 6014.20049999998, 5791.20299999998, 5605.78450000001, 5438.48800000001, 5234.2255, 5059.6825, 4887.43349999998, 4682.935, 4496.31099999999, 4322.52250000002, 4191.42499999999, 4021.24200000003, 3900.64799999999, 3762.84250000003, 3609.98050000001, 3502.29599999997, 3363.84250000003, 3206.54849999998, 3079.70000000001, 2971.42300000001, 2867.80349999998, 2727.08100000001, 2630.74900000001, 2496.6165, 2440.902, 2356.19150000002, 2235.58199999999, 2120.54149999999, 2012.25449999998, 1933.35600000003, 1820.93099999998, 1761.54800000001, 1663.09350000002, 1578.84600000002, 1509.48149999999, 1427.3345, 1379.56150000001, 1306.68099999998, 1212.63449999999, 
1084.17300000001, 1124.16450000001, 1060.69949999999, 1007.48849999998, 941.194499999983, 879.880500000028, 836.007500000007, 782.802000000025, 748.385499999975, 647.991500000004, 626.730500000005, 570.776000000013, 484.000500000024, 513.98550000001, 418.985499999952, 386.996999999974, 370.026500000036, 355.496999999974, 356.731499999994, 255.92200000002, 259.094000000041, 205.434499999974, 165.374500000034, 197.347500000033, 95.718499999959, 67.6165000000037, 54.6970000000438, 31.7395000000251, -15.8784999999916, 8.42500000004657, -26.3754999999655, -118.425500000012, -66.6629999999423, -42.9745000000112, -107.364999999991, -189.839000000036, -162.611499999999, -164.964999999967, -189.079999999958, -223.931499999948, -235.329999999958, -269.639500000048, -249.087999999989, -206.475499999942, -283.04449999996, -290.667000000016, -304.561499999953, -336.784499999951, -380.386500000022, -283.280499999993, -364.533000000054, -389.059499999974, -364.454000000027, -415.748000000021, -417.155000000028};
 CUCO_HLL_TUNING_ARR_DECL bias_data_p18{189083.0, 185696.913, 182348.774, 179035.946, 175762.762, 172526.444, 169329.754, 166166.099, 163043.269, 159958.91, 156907.912, 153906.845, 150924.199, 147996.568, 145093.457, 142239.233, 139421.475, 136632.27, 133889.588, 131174.2, 128511.619, 125868.621, 123265.385, 120721.061, 118181.769, 115709.456, 113252.446, 110840.198, 108465.099, 106126.164, 103823.469, 101556.618, 99308.004, 97124.508, 94937.803, 92833.731, 90745.061, 88677.627, 86617.47, 84650.442, 82697.833, 80769.132, 78879.629, 77014.432, 75215.626, 73384.587, 71652.482, 69895.93, 68209.301, 66553.669, 64921.981, 63310.323, 61742.115, 60205.018, 58698.658, 57190.657, 55760.865, 54331.169, 52908.167, 51550.273, 50225.254, 48922.421, 47614.533, 46362.049, 45098.569, 43926.083, 42736.03, 41593.473, 40425.26, 39316.237, 38243.651, 37170.617, 36114.609, 35084.19, 34117.233, 33206.509, 32231.505, 31318.728, 30403.404, 29540.0550000001, 28679.236, 27825.862, 26965.216, 26179.148, 25462.08, 24645.952, 23922.523, 23198.144, 22529.128, 21762.4179999999, 21134.779, 20459.117, 19840.818, 19187.04, 18636.3689999999, 17982.831, 17439.7389999999, 16874.547, 16358.2169999999, 15835.684, 15352.914, 14823.681, 14329.313, 13816.897, 13342.874, 12880.882, 12491.648, 12021.254, 11625.392, 11293.7610000001, 10813.697, 10456.209, 10099.074, 9755.39000000001, 9393.18500000006, 9047.57900000003, 8657.98499999999, 8395.85900000005, 8033.0, 7736.95900000003, 7430.59699999995, 7258.47699999996, 6924.58200000005, 6691.29399999999, 6357.92500000005, 6202.05700000003, 5921.19700000004, 5628.28399999999, 5404.96799999999, 5226.71100000001, 4990.75600000005, 4799.77399999998, 4622.93099999998, 4472.478, 4171.78700000001, 3957.46299999999, 3868.95200000005, 3691.14300000004, 3474.63100000005, 3341.67200000002, 3109.14000000001, 3071.97400000005, 2796.40399999998, 2756.17799999996, 2611.46999999997, 2471.93000000005, 2382.26399999997, 2209.22400000005, 2142.28399999999, 2013.96100000001, 
1911.18999999994, 1818.27099999995, 1668.47900000005, 1519.65800000005, 1469.67599999998, 1367.13800000004, 1248.52899999998, 1181.23600000003, 1022.71900000004, 1088.20700000005, 959.03600000008, 876.095999999903, 791.183999999892, 703.337000000058, 731.949999999953, 586.86400000006, 526.024999999907, 323.004999999888, 320.448000000091, 340.672999999952, 309.638999999966, 216.601999999955, 102.922999999952, 19.2399999999907, -0.114000000059605, -32.6240000000689, -89.3179999999702, -153.497999999905, -64.2970000000205, -143.695999999996, -259.497999999905, -253.017999999924, -213.948000000091, -397.590000000084, -434.006000000052, -403.475000000093, -297.958000000101, -404.317000000039, -528.898999999976, -506.621000000043, -513.205000000075, -479.351000000024, -596.139999999898, -527.016999999993, -664.681000000099, -680.306000000099, -704.050000000047, -850.486000000034, -757.43200000003, -713.308999999892};
 
-// Meta array storing interpolation points for biases for Precision=4..18
-CUCO_HLL_TUNING_ARR_DECL bias_data{bias_data_p4.data(), bias_data_p5.data(), bias_data_p6.data(), bias_data_p7.data(), bias_data_p8.data(), bias_data_p9.data(), bias_data_p10.data(), bias_data_p11.data(), bias_data_p12.data(), bias_data_p13.data(), bias_data_p14.data(), bias_data_p15.data(), bias_data_p16.data(), bias_data_p17.data(), bias_data_p18.data()};
+template <int32_t P>
+__host__ __device__ auto const& bias_data() noexcept;  // selects the bias array for precision P
+
+template <>
+__host__ __device__ inline auto const& bias_data<4>() noexcept { return bias_data_p4; }
+
+template <>
+__host__ __device__ inline auto const& bias_data<5>() noexcept { return bias_data_p5; }
+
+template <>
+__host__ __device__ inline auto const& bias_data<6>() noexcept { return bias_data_p6; }
+
+template <>
+__host__ __device__ inline auto const& bias_data<7>() noexcept { return bias_data_p7; }
+
+template <>
+__host__ __device__ inline auto const& bias_data<8>() noexcept { return bias_data_p8; }
+
+template <>
+__host__ __device__ inline auto const& bias_data<9>() noexcept { return bias_data_p9; }
+
+template <>
+__host__ __device__ inline auto const& bias_data<10>() noexcept { return bias_data_p10; }
+
+template <>
+__host__ __device__ inline auto const& bias_data<11>() noexcept { return bias_data_p11; }
+
+template <>
+__host__ __device__ inline auto const& bias_data<12>() noexcept { return bias_data_p12; }
+
+template <>
+__host__ __device__ inline auto const& bias_data<13>() noexcept { return bias_data_p13; }
+
+template <>
+__host__ __device__ inline auto const& bias_data<14>() noexcept { return bias_data_p14; }
+
+template <>
+__host__ __device__ inline auto const& bias_data<15>() noexcept { return bias_data_p15; }
+
+template <>
+__host__ __device__ inline auto const& bias_data<16>() noexcept { return bias_data_p16; }
+
+template <>
+__host__ __device__ inline auto const& bias_data<17>() noexcept { return bias_data_p17; }
+
+template <>
+__host__ __device__ inline auto const& bias_data<18>() noexcept { return bias_data_p18; }
 // clang-format on
 
 }  // namespace cuco::hyperloglog_ns::detail
\ No newline at end of file

From abbeffa0076cfa48d77b61b625fe58ee19cabcd5 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 1 Feb 2024 02:00:30 +0000
Subject: [PATCH 14/78] Extend examples and fix some bugs along the way

---
 examples/CMakeLists.txt                       |   1 +
 .../device_ref_example.cu                     | 116 ++++++++++++++++++
 .../host_bulk_example.cu                      |  45 ++++---
 .../distinct_count_estimator.inl              |   2 +-
 .../distinct_count_estimator_ref.inl          |   4 +-
 .../cuco/detail/hyperloglog/hyperloglog.cuh   |  14 +++
 include/cuco/distinct_count_estimator_ref.cuh |   3 +
 7 files changed, 158 insertions(+), 27 deletions(-)
 create mode 100644 examples/distinct_count_estimator/device_ref_example.cu

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index f6e753cf2..9ee062690 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -42,3 +42,4 @@ ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/sta
 ConfigureExample(STATIC_MAP_COUNT_BY_KEY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/count_by_key_example.cu")
 ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/host_bulk_example.cu")
 ConfigureExample(DISTINCT_COUNT_ESTIMATOR_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/distinct_count_estimator/host_bulk_example.cu")
+ConfigureExample(DISTINCT_COUNT_ESTIMATOR_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/distinct_count_estimator/device_ref_example.cu")
diff --git a/examples/distinct_count_estimator/device_ref_example.cu b/examples/distinct_count_estimator/device_ref_example.cu
new file mode 100644
index 000000000..82e34b5c9
--- /dev/null
+++ b/examples/distinct_count_estimator/device_ref_example.cu
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cuco/distinct_count_estimator.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/sequence.h>
+
+#include <cstddef>
+#include <iostream>
+
+/**
+ * @file device_ref_example.cu
+ * @brief Demonstrates usage of `cuco::distinct_count_estimator` device-side APIs.
+ *
+ * This example demonstrates how the non-owning reference type `cuco::distinct_count_estimator_ref`
+ * can be used to implement a custom kernel that fuses the cardinality estimation step with any
+ * other workload that traverses the input data.
+ */
+
+template <class RefType, class InputIt>
+__global__ void piggyback_kernel(RefType ref, InputIt first, std::size_t n)
+{
+  // Transform the reference type (with device scope) to a reference type with block scope
+  using local_ref_type = typename RefType::with_scope<cuda::thread_scope_block>;
+
+  // Shared memory storage for the block-local estimator
+  __shared__ typename local_ref_type::storage_type local_storage;
+
+  auto const loop_stride = gridDim.x * blockDim.x;
+  auto idx               = blockDim.x * blockIdx.x + threadIdx.x;
+  auto const block       = cooperative_groups::this_thread_block();
+
+  // Create the local estimator with the shared memory storage
+  local_ref_type local_ref(local_storage, {});
+
+  // Initialize the local estimator
+  local_ref.clear(block);
+  block.sync();
+
+  while (idx < n) {
+    auto const& item = *(first + idx);
+
+    // Add each item to the local estimator
+    local_ref.add(item);
+
+    /*
+    Here we can add some custom workload that takes the input `item`.
+
+    The idea is that cardinality estimation can be fused/piggy-backed with any other workload that
+    traverses the data. Since `local_ref.add` can run close to the SOL of the DRAM bandwidth, we get
+    the estimate "for free" while performing other computations over the data.
+    */
+
+    idx += loop_stride;
+  }
+  block.sync();
+
+  // We can also compute the local estimate on the device
+  auto const local_estimate = local_ref.estimate(block);
+  if (block.thread_rank() == 0) {
+    // The local estimate should approximately be `num_items`/`gridDim.x`
+    printf("Estimate for block %d = %llu\n", blockIdx.x, local_estimate);
+  }
+
+  // In the end, we merge the shared memory estimator into the global estimator which gives us the
+  // final result
+  ref.merge(block, local_ref);
+}
+
+int main(void)
+{
+  using T                         = int;
+  constexpr std::size_t num_items = 1ull << 28;  // 1GB
+
+  thrust::device_vector<T> items(num_items);
+
+  // Generate `num_items` distinct items
+  thrust::sequence(items.begin(), items.end(), 0);
+
+  // Initialize the estimator
+  cuco::distinct_count_estimator<T> estimator;
+
+  // Add all items to the estimator
+  estimator.add(items.begin(), items.end());
+
+  // Calculate the cardinality estimate from the bulk operation
+  std::size_t const estimated_cardinality_bulk = estimator.estimate();
+
+  // Clear the estimator so it can be reused
+  estimator.clear();
+
+  // Call the custom kernel and pass a non-owning reference to the estimator to the GPU
+  piggyback_kernel<<<10, 512>>>(estimator.ref(), items.begin(), num_items);
+
+  // Calculate the cardinality estimate from the custom kernel
+  std::size_t const estimated_cardinality_custom = estimator.estimate();
+
+  if (estimated_cardinality_bulk == estimated_cardinality_custom) {
+    std::cout << "Success! Cardinality estimates are identical" << std::endl;
+  }
+
+  return 0;
+}
\ No newline at end of file
diff --git a/examples/distinct_count_estimator/host_bulk_example.cu b/examples/distinct_count_estimator/host_bulk_example.cu
index 18085e72f..9e60ae47b 100644
--- a/examples/distinct_count_estimator/host_bulk_example.cu
+++ b/examples/distinct_count_estimator/host_bulk_example.cu
@@ -21,41 +21,38 @@
 #include <cstddef>
 #include <iostream>
 
-int main()
+/**
+ * @file host_bulk_example.cu
+ * @brief Demonstrates usage of `cuco::distinct_count_estimator` "bulk" host APIs.
+ */
+
+int main(void)
 {
   using T                         = int;
-  std::size_t constexpr num_items = 1ull << 30;  // 4GB
+  constexpr std::size_t num_items = 1ull << 28;  // 1GB
 
   thrust::device_vector<T> items(num_items);
-  // create a vector of distinct items
-  thrust::sequence(items.begin(), items.end(), 0);
 
-  cudaEvent_t start, stop;
-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
+  // Generate `num_items` distinct items
+  thrust::sequence(items.begin(), items.end(), 0);
 
+  // Initialize the estimator
   cuco::distinct_count_estimator<T> estimator;
-  cudaEventRecord(start);
-  // add all items to the estimator
+
+  // Add all items to the estimator
   estimator.add(items.begin(), items.end());
-  // after the estimator has seen all items, we can calculate the cardinality
-  std::size_t const estimated_cardinality = estimator.estimate();
-  cudaEventRecord(stop);
-  cudaEventSynchronize(stop);
 
-  float milliseconds = 0;
-  cudaEventElapsedTime(&milliseconds, start, stop);
-  float input_size_gb = num_items * sizeof(T) / 1073741824.0f;
-  float throughput    = input_size_gb / (milliseconds / 1000.0f);
+  // Adding the same items again will not affect the result
+  estimator.add(items.begin(), items.begin() + num_items / 2);
+
+  // Calculate the cardinality estimate
+  std::size_t const estimated_cardinality = estimator.estimate();
 
-  std::cout << "True cardinality:\t" << num_items << "\nEstimated cardinality:\t"
-            << estimated_cardinality << "\nRelative error:\t"
+  std::cout << "True cardinality: " << num_items
+            << "\nEstimated cardinality: " << estimated_cardinality << "\nRelative error: "
             << abs(static_cast<double>(num_items) - static_cast<double>(estimated_cardinality)) /
                  num_items
-            << "\nData size:\t" << input_size_gb << "GB"
-            << "\nElapsed time:\t" << milliseconds << "ms"
-            << "\nMemory throughput\t" << throughput << "GB/s" << std::endl;
+            << std::endl;
 
-  cudaEventDestroy(start);
-  cudaEventDestroy(stop);
+  return 0;
 }
\ No newline at end of file
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
index 413d7ee7b..79488e0e1 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
@@ -98,6 +98,6 @@ template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, clas
 typename distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::ref_type<>
 distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::ref() const noexcept
 {
-  return this->impl_->ref();
+  return ref_type<>{this->impl_->storage_ref(), this->impl_->hash()};
 }
 }  // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
index 26fc9bd99..3b940edfd 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
@@ -43,13 +43,13 @@ __device__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::merge(
   CG const& group,
   distinct_count_estimator_ref<T, Precision, OtherScope, Hash> const& other) noexcept
 {
-  this->impl_.merge(group, other);
+  this->impl_.merge(group, other.impl_);
 }
 
 template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
 __device__ std::size_t distinct_count_estimator_ref<T, Precision, Scope, Hash>::estimate(
   cooperative_groups::thread_block const& group) const noexcept
 {
-  this->impl_.estimate(group);
+  return this->impl_.estimate(group);
 }
 }  // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index af303a921..a08cfa942 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -299,6 +299,20 @@ class hyperloglog {
     return ref_type<>{*(this->storage_.get()), this->hash_};
   }
 
+  /**
+   * @brief Get storage ref.
+   *
+   * @return Reference to storage
+   */
+  [[nodiscard]] storage_type& storage_ref() const noexcept { return *(this->storage_.get()); }
+
+  /**
+   * @brief Get hash function.
+   *
+   * @return The hash function
+   */
+  [[nodiscard]] auto hash() const noexcept { return this->hash_; }
+
  private:
   struct storage_deleter {
     using pointer = typename storage_allocator_type::value_type*;
diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh
index 256183082..d656d6e17 100644
--- a/include/cuco/distinct_count_estimator_ref.cuh
+++ b/include/cuco/distinct_count_estimator_ref.cuh
@@ -105,6 +105,9 @@ class distinct_count_estimator_ref {
 
  private:
   impl_type impl_;  ///< Implementation object
+
+  template <class T_, int32_t Precision_, cuda::thread_scope Scope_, class Hash_>
+  friend class distinct_count_estimator_ref;
 };
 }  // namespace cuco
 

From 86d461801a5aad230e6143b2cca9cdbc89bf0bb8 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 1 Feb 2024 17:01:02 +0000
Subject: [PATCH 15/78] Refactor thresholds

---
 include/cuco/detail/hyperloglog/finalizer.cuh |  2 +-
 include/cuco/detail/hyperloglog/tuning.cuh    | 52 +++++++++++++++++--
 2 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/finalizer.cuh b/include/cuco/detail/hyperloglog/finalizer.cuh
index cd69ffa0a..18af4ca1b 100644
--- a/include/cuco/detail/hyperloglog/finalizer.cuh
+++ b/include/cuco/detail/hyperloglog/finalizer.cuh
@@ -57,7 +57,7 @@ class finalizer {
       double const h = m * log(static_cast<double>(m) / v);
       // HLL++ is defined only when p < 19, otherwise we need to fallback to HLL.
       // The threshold `2.5 * m` is from the original HLL algorithm.
-      if ((Precision < 19 and h <= thresholds[Precision - 4]) or e <= 2.5 * m) {
+      if ((Precision < 19 and h <= threshold<Precision>()) or e <= 2.5 * m) {
         e = h;
       } else {
         e = bias_corrected_estimate(e);
diff --git a/include/cuco/detail/hyperloglog/tuning.cuh b/include/cuco/detail/hyperloglog/tuning.cuh
index c10ef6950..05cacb067 100644
--- a/include/cuco/detail/hyperloglog/tuning.cuh
+++ b/include/cuco/detail/hyperloglog/tuning.cuh
@@ -26,7 +26,53 @@ namespace cuco::hyperloglog_ns::detail {
 #endif
 
 // clang-format off
-CUCO_HLL_TUNING_ARR_DECL thresholds{10.0, 20.0, 40.0, 80.0, 220.0, 400.0, 900.0, 1800.0, 3100.0, 6500.0, 15500.0, 20000.0, 50000.0, 120000.0, 350000.0};
+template <int32_t Precision>
+__host__ __device__ constexpr auto threshold() noexcept;
+
+template <>
+__host__ __device__ constexpr auto threshold<4>() noexcept { return 10.0; };
+
+template <>
+__host__ __device__ constexpr auto threshold<5>() noexcept { return 20.0; };
+
+template <>
+__host__ __device__ constexpr auto threshold<6>() noexcept { return 40.0; };
+
+template <>
+__host__ __device__ constexpr auto threshold<7>() noexcept { return 80.0; };
+
+template <>
+__host__ __device__ constexpr auto threshold<8>() noexcept { return 220.0; };
+
+template <>
+__host__ __device__ constexpr auto threshold<9>() noexcept { return 400.0; };
+
+template <>
+__host__ __device__ constexpr auto threshold<10>() noexcept { return 900.0; };
+
+template <>
+__host__ __device__ constexpr auto threshold<11>() noexcept { return 1800.0; };
+
+template <>
+__host__ __device__ constexpr auto threshold<12>() noexcept { return 3100.0; };
+
+template <>
+__host__ __device__ constexpr auto threshold<13>() noexcept { return 6500.0; };
+
+template <>
+__host__ __device__ constexpr auto threshold<14>() noexcept { return 15500.0; };
+
+template <>
+__host__ __device__ constexpr auto threshold<15>() noexcept { return 20000.0; };
+
+template <>
+__host__ __device__ constexpr auto threshold<16>() noexcept { return 50000.0; };
+
+template <>
+__host__ __device__ constexpr auto threshold<17>() noexcept { return 120000.0; };
+
+template <>
+__host__ __device__ constexpr auto threshold<18>() noexcept { return 350000.0; };
 
 // HLL++ uses an interpolation method over the raw estimated cardinality to select the optimal bias.
 // Parameters/interpolation points taken from
@@ -48,7 +94,7 @@ CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p17{94542.0, 96125.811, 97728.019, 99
 CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p18{189084.0, 192250.913, 195456.774, 198696.946, 201977.762, 205294.444, 208651.754, 212042.099, 215472.269, 218941.91, 222443.912, 225996.845, 229568.199, 233193.568, 236844.457, 240543.233, 244279.475, 248044.27, 251854.588, 255693.2, 259583.619, 263494.621, 267445.385, 271454.061, 275468.769, 279549.456, 283646.446, 287788.198, 291966.099, 296181.164, 300431.469, 304718.618, 309024.004, 313393.508, 317760.803, 322209.731, 326675.061, 331160.627, 335654.47, 340241.442, 344841.833, 349467.132, 354130.629, 358819.432, 363574.626, 368296.587, 373118.482, 377914.93, 382782.301, 387680.669, 392601.981, 397544.323, 402529.115, 407546.018, 412593.658, 417638.657, 422762.865, 427886.169, 433017.167, 438213.273, 443441.254, 448692.421, 453937.533, 459239.049, 464529.569, 469910.083, 475274.03, 480684.473, 486070.26, 491515.237, 496995.651, 502476.617, 507973.609, 513497.19, 519083.233, 524726.509, 530305.505, 535945.728, 541584.404, 547274.055, 552967.236, 558667.862, 564360.216, 570128.148, 575965.08, 581701.952, 587532.523, 593361.144, 599246.128, 605033.418, 610958.779, 616837.117, 622772.818, 628672.04, 634675.369, 640574.831, 646585.739, 652574.547, 658611.217, 664642.684, 670713.914, 676737.681, 682797.313, 688837.897, 694917.874, 701009.882, 707173.648, 713257.254, 719415.392, 725636.761, 731710.697, 737906.209, 744103.074, 750313.39, 756504.185, 762712.579, 768876.985, 775167.859, 781359.0, 787615.959, 793863.597, 800245.477, 806464.582, 812785.294, 819005.925, 825403.057, 831676.197, 837936.284, 844266.968, 850642.711, 856959.756, 863322.774, 869699.931, 876102.478, 882355.787, 888694.463, 895159.952, 901536.143, 907872.631, 914293.672, 920615.14, 927130.974, 933409.404, 939922.178, 946331.47, 952745.93, 959209.264, 965590.224, 972077.284, 978501.961, 984953.19, 991413.271, 997817.479, 1004222.658, 1010725.676, 1017177.138, 1023612.529, 1030098.236, 1036493.719, 1043112.207, 1049537.036, 1056008.096, 1062476.184, 
1068942.337, 1075524.95, 1081932.864, 1088426.025, 1094776.005, 1101327.448, 1107901.673, 1114423.639, 1120884.602, 1127324.923, 1133794.24, 1140328.886, 1146849.376, 1153346.682, 1159836.502, 1166478.703, 1172953.304, 1179391.502, 1185950.982, 1192544.052, 1198913.41, 1205430.994, 1212015.525, 1218674.042, 1225121.683, 1231551.101, 1238126.379, 1244673.795, 1251260.649, 1257697.86, 1264320.983, 1270736.319, 1277274.694, 1283804.95, 1290211.514, 1296858.568, 1303455.691};
 
 // helpers for selecting the corresponding arrays for a given precision
-template <int32_t P>
+template <int32_t Precision>
 __host__ __device__ auto const& raw_estimate_data() noexcept;
 
 template <>
@@ -112,7 +158,7 @@ CUCO_HLL_TUNING_ARR_DECL bias_data_p16{47270.0, 46423.3584, 45585.7074, 44757.15
 CUCO_HLL_TUNING_ARR_DECL bias_data_p17{94541.0, 92848.811, 91174.019, 89517.558, 87879.9705, 86262.7565, 84663.5125, 83083.7435, 81521.7865, 79977.272, 78455.9465, 76950.219, 75465.432, 73994.152, 72546.71, 71115.2345, 69705.6765, 68314.937, 66944.2705, 65591.255, 64252.9485, 62938.016, 61636.8225, 60355.592, 59092.789, 57850.568, 56624.518, 55417.343, 54231.1415, 53067.387, 51903.526, 50774.649, 49657.6415, 48561.05, 47475.7575, 46410.159, 45364.852, 44327.053, 43318.4005, 42325.6165, 41348.4595, 40383.6265, 39436.77, 38509.502, 37594.035, 36695.939, 35818.6895, 34955.691, 34115.8095, 33293.949, 32465.0775, 31657.6715, 30877.2585, 30093.78, 29351.3695, 28594.1365, 27872.115, 27168.7465, 26477.076, 25774.541, 25106.5375, 24452.5135, 23815.5125, 23174.0655, 22555.2685, 21960.2065, 21376.3555, 20785.1925, 20211.517, 19657.0725, 19141.6865, 18579.737, 18081.3955, 17578.995, 17073.44, 16608.335, 16119.911, 15651.266, 15194.583, 14749.0495, 14343.4835, 13925.639, 13504.509, 13099.3885, 12691.2855, 12328.018, 11969.0345, 11596.5145, 11245.6355, 10917.6575, 10580.9785, 10277.8605, 9926.58100000001, 9605.538, 9300.42950000003, 8989.97850000003, 8728.73249999998, 8448.3235, 8175.31050000002, 7898.98700000002, 7629.79100000003, 7413.76199999999, 7149.92300000001, 6921.12650000001, 6677.1545, 6443.28000000003, 6278.23450000002, 6014.20049999998, 5791.20299999998, 5605.78450000001, 5438.48800000001, 5234.2255, 5059.6825, 4887.43349999998, 4682.935, 4496.31099999999, 4322.52250000002, 4191.42499999999, 4021.24200000003, 3900.64799999999, 3762.84250000003, 3609.98050000001, 3502.29599999997, 3363.84250000003, 3206.54849999998, 3079.70000000001, 2971.42300000001, 2867.80349999998, 2727.08100000001, 2630.74900000001, 2496.6165, 2440.902, 2356.19150000002, 2235.58199999999, 2120.54149999999, 2012.25449999998, 1933.35600000003, 1820.93099999998, 1761.54800000001, 1663.09350000002, 1578.84600000002, 1509.48149999999, 1427.3345, 1379.56150000001, 1306.68099999998, 1212.63449999999, 
1084.17300000001, 1124.16450000001, 1060.69949999999, 1007.48849999998, 941.194499999983, 879.880500000028, 836.007500000007, 782.802000000025, 748.385499999975, 647.991500000004, 626.730500000005, 570.776000000013, 484.000500000024, 513.98550000001, 418.985499999952, 386.996999999974, 370.026500000036, 355.496999999974, 356.731499999994, 255.92200000002, 259.094000000041, 205.434499999974, 165.374500000034, 197.347500000033, 95.718499999959, 67.6165000000037, 54.6970000000438, 31.7395000000251, -15.8784999999916, 8.42500000004657, -26.3754999999655, -118.425500000012, -66.6629999999423, -42.9745000000112, -107.364999999991, -189.839000000036, -162.611499999999, -164.964999999967, -189.079999999958, -223.931499999948, -235.329999999958, -269.639500000048, -249.087999999989, -206.475499999942, -283.04449999996, -290.667000000016, -304.561499999953, -336.784499999951, -380.386500000022, -283.280499999993, -364.533000000054, -389.059499999974, -364.454000000027, -415.748000000021, -417.155000000028};
 CUCO_HLL_TUNING_ARR_DECL bias_data_p18{189083.0, 185696.913, 182348.774, 179035.946, 175762.762, 172526.444, 169329.754, 166166.099, 163043.269, 159958.91, 156907.912, 153906.845, 150924.199, 147996.568, 145093.457, 142239.233, 139421.475, 136632.27, 133889.588, 131174.2, 128511.619, 125868.621, 123265.385, 120721.061, 118181.769, 115709.456, 113252.446, 110840.198, 108465.099, 106126.164, 103823.469, 101556.618, 99308.004, 97124.508, 94937.803, 92833.731, 90745.061, 88677.627, 86617.47, 84650.442, 82697.833, 80769.132, 78879.629, 77014.432, 75215.626, 73384.587, 71652.482, 69895.93, 68209.301, 66553.669, 64921.981, 63310.323, 61742.115, 60205.018, 58698.658, 57190.657, 55760.865, 54331.169, 52908.167, 51550.273, 50225.254, 48922.421, 47614.533, 46362.049, 45098.569, 43926.083, 42736.03, 41593.473, 40425.26, 39316.237, 38243.651, 37170.617, 36114.609, 35084.19, 34117.233, 33206.509, 32231.505, 31318.728, 30403.404, 29540.0550000001, 28679.236, 27825.862, 26965.216, 26179.148, 25462.08, 24645.952, 23922.523, 23198.144, 22529.128, 21762.4179999999, 21134.779, 20459.117, 19840.818, 19187.04, 18636.3689999999, 17982.831, 17439.7389999999, 16874.547, 16358.2169999999, 15835.684, 15352.914, 14823.681, 14329.313, 13816.897, 13342.874, 12880.882, 12491.648, 12021.254, 11625.392, 11293.7610000001, 10813.697, 10456.209, 10099.074, 9755.39000000001, 9393.18500000006, 9047.57900000003, 8657.98499999999, 8395.85900000005, 8033.0, 7736.95900000003, 7430.59699999995, 7258.47699999996, 6924.58200000005, 6691.29399999999, 6357.92500000005, 6202.05700000003, 5921.19700000004, 5628.28399999999, 5404.96799999999, 5226.71100000001, 4990.75600000005, 4799.77399999998, 4622.93099999998, 4472.478, 4171.78700000001, 3957.46299999999, 3868.95200000005, 3691.14300000004, 3474.63100000005, 3341.67200000002, 3109.14000000001, 3071.97400000005, 2796.40399999998, 2756.17799999996, 2611.46999999997, 2471.93000000005, 2382.26399999997, 2209.22400000005, 2142.28399999999, 2013.96100000001, 
1911.18999999994, 1818.27099999995, 1668.47900000005, 1519.65800000005, 1469.67599999998, 1367.13800000004, 1248.52899999998, 1181.23600000003, 1022.71900000004, 1088.20700000005, 959.03600000008, 876.095999999903, 791.183999999892, 703.337000000058, 731.949999999953, 586.86400000006, 526.024999999907, 323.004999999888, 320.448000000091, 340.672999999952, 309.638999999966, 216.601999999955, 102.922999999952, 19.2399999999907, -0.114000000059605, -32.6240000000689, -89.3179999999702, -153.497999999905, -64.2970000000205, -143.695999999996, -259.497999999905, -253.017999999924, -213.948000000091, -397.590000000084, -434.006000000052, -403.475000000093, -297.958000000101, -404.317000000039, -528.898999999976, -506.621000000043, -513.205000000075, -479.351000000024, -596.139999999898, -527.016999999993, -664.681000000099, -680.306000000099, -704.050000000047, -850.486000000034, -757.43200000003, -713.308999999892};
 
-template <int32_t P>
+template <int32_t Precision>
 __host__ __device__ auto const& bias_data() noexcept;
 
 template <>

From d6a9a4e9efdffff72cc9b516a2457bfe7e8d229f Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 1 Feb 2024 17:08:24 +0000
Subject: [PATCH 16/78] Initialize shmem storage using placement new

---
 examples/distinct_count_estimator/device_ref_example.cu | 4 ++++
 include/cuco/detail/hyperloglog/kernels.cuh             | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/examples/distinct_count_estimator/device_ref_example.cu b/examples/distinct_count_estimator/device_ref_example.cu
index 82e34b5c9..8634e6b12 100644
--- a/examples/distinct_count_estimator/device_ref_example.cu
+++ b/examples/distinct_count_estimator/device_ref_example.cu
@@ -43,6 +43,10 @@ __global__ void piggyback_kernel(RefType ref, InputIt first, std::size_t n)
   auto idx               = blockDim.x * blockIdx.x + threadIdx.x;
   auto const block       = cooperative_groups::this_thread_block();
 
+  // Initialize the local storage object
+  if (block.thread_rank() == 0) { new (&local_storage) typename local_ref_type::storage_type{}; }
+  block.sync();
+
   // Create the local estimator with the shared memory storage
   local_ref_type local_ref(local_storage, {});
 
diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh
index fd3a2a877..07f16b097 100644
--- a/include/cuco/detail/hyperloglog/kernels.cuh
+++ b/include/cuco/detail/hyperloglog/kernels.cuh
@@ -43,6 +43,9 @@ CUCO_KERNEL void add_shmem(InputIt first, cuco::detail::index_type n, RefType re
   auto idx               = cuco::detail::global_thread_id();
   auto const block       = cooperative_groups::this_thread_block();
 
+  if (block.thread_rank() == 0) { new (&local_storage) typename local_ref_type::storage_type{}; }
+  block.sync();
+
   local_ref_type local_ref(local_storage, {});
   local_ref.clear(block);
   block.sync();

From 891d6068b5deb342e726c0e5ab6483ae80be2c8a Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 1 Feb 2024 17:52:52 +0000
Subject: [PATCH 17/78] Add unit test

---
 tests/CMakeLists.txt                          |   5 +
 .../unique_sequence_test.cu                   | 105 ++++++++++++++++++
 2 files changed, 110 insertions(+)
 create mode 100644 tests/distinct_count_estimator/unique_sequence_test.cu

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index e09efddb3..531556247 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -109,3 +109,8 @@ ConfigureTest(DYNAMIC_BITSET_TEST
     dynamic_bitset/rank_test.cu
     dynamic_bitset/select_test.cu
     dynamic_bitset/size_test.cu)
+
+###################################################################################################
+# - distinct_count_estimator ----------------------------------------------------------------------
+ConfigureTest(DISTINCT_COUNT_ESTIMATOR_TEST
+    distinct_count_estimator/unique_sequence_test.cu)
diff --git a/tests/distinct_count_estimator/unique_sequence_test.cu b/tests/distinct_count_estimator/unique_sequence_test.cu
new file mode 100644
index 000000000..a4d07ba4d
--- /dev/null
+++ b/tests/distinct_count_estimator/unique_sequence_test.cu
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <utils.hpp>
+
+#include <cuco/distinct_count_estimator.cuh>
+#include <cuco/hash_functions.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/sequence.h>
+
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/generators/catch_generators.hpp>
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+
+TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence",
+                       "",
+                       ((typename T, int32_t Precision, typename Hash), T, Precision, Hash),
+                       (int32_t, 9, cuco::xxhash_32<int32_t>),
+                       (int32_t, 10, cuco::xxhash_32<int32_t>),
+                       (int32_t, 11, cuco::xxhash_32<int32_t>),
+                       (int32_t, 12, cuco::xxhash_32<int32_t>),
+                       (int32_t, 13, cuco::xxhash_32<int32_t>),
+                       (int32_t, 9, cuco::xxhash_64<int32_t>),
+                       (int32_t, 10, cuco::xxhash_64<int32_t>),
+                       (int32_t, 11, cuco::xxhash_64<int32_t>),
+                       (int32_t, 12, cuco::xxhash_64<int32_t>),
+                       (int32_t, 13, cuco::xxhash_64<int32_t>),
+                       (int64_t, 9, cuco::xxhash_32<int64_t>),
+                       (int64_t, 10, cuco::xxhash_32<int64_t>),
+                       (int64_t, 11, cuco::xxhash_32<int64_t>),
+                       (int64_t, 12, cuco::xxhash_32<int64_t>),
+                       (int64_t, 13, cuco::xxhash_32<int64_t>),
+                       (int64_t, 9, cuco::xxhash_64<int64_t>),
+                       (int64_t, 10, cuco::xxhash_64<int64_t>),
+                       (int64_t, 11, cuco::xxhash_64<int64_t>),
+                       (int64_t, 12, cuco::xxhash_64<int64_t>),
+                       (int64_t, 13, cuco::xxhash_64<int64_t>),
+                       (__int128_t, 9, cuco::xxhash_32<__int128_t>),
+                       (__int128_t, 10, cuco::xxhash_32<__int128_t>),
+                       (__int128_t, 11, cuco::xxhash_32<__int128_t>),
+                       (__int128_t, 12, cuco::xxhash_32<__int128_t>),
+                       (__int128_t, 13, cuco::xxhash_32<__int128_t>),
+                       (__int128_t, 9, cuco::xxhash_64<__int128_t>),
+                       (__int128_t, 10, cuco::xxhash_64<__int128_t>),
+                       (__int128_t, 11, cuco::xxhash_64<__int128_t>),
+                       (__int128_t, 12, cuco::xxhash_64<__int128_t>),
+                       (__int128_t, 13, cuco::xxhash_64<__int128_t>))
+{
+  // This factor determines the error threshold for passing the test
+  // TODO might be too high
+  double constexpr tolerance_factor = 3.0;
+  // RSD for the precision under test: 1.04 / sqrt(number of registers), with 2^Precision registers
+  double const relative_standard_deviation =
+    1.04 / std::sqrt(static_cast<double>(1ull << Precision));
+
+  auto num_items_pow2 = GENERATE(25, 26, 28);
+  INFO("num_items=2^" << num_items_pow2);
+  auto num_items = 1ull << num_items_pow2;
+
+  thrust::device_vector<T> items(num_items);
+
+  // Generate `num_items` distinct items
+  thrust::sequence(items.begin(), items.end(), 0);
+
+  // Initialize the estimator with the precision and hash function under test
+  cuco::distinct_count_estimator<T, Precision, cuda::thread_scope_device, Hash> estimator;
+
+  REQUIRE(estimator.estimate() == 0);
+
+  // Add all items to the estimator
+  estimator.add(items.begin(), items.end());
+
+  auto const estimate = estimator.estimate();
+
+  // Adding the same items again should not affect the result
+  estimator.add(items.begin(), items.begin() + num_items / 2);
+  REQUIRE(estimator.estimate() == estimate);
+
+  // Clearing the estimator should reset the estimate
+  estimator.clear();
+  REQUIRE(estimator.estimate() == 0);
+
+  double const relative_error =
+    std::abs(static_cast<double>(num_items) - static_cast<double>(estimate)) / num_items;
+
+  // Check if the error is acceptable
+  REQUIRE(relative_error < tolerance_factor * relative_standard_deviation);
+}

From 35441950b86b49e572c8dd1fe3932cc9f7c043d2 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 1 Feb 2024 23:17:35 +0000
Subject: [PATCH 18/78] Remove experimental cg async reduce since it is buggy

---
 include/cuco/detail/__config                        | 5 -----
 include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 5 -----
 2 files changed, 10 deletions(-)

diff --git a/include/cuco/detail/__config b/include/cuco/detail/__config
index fd3b6fce4..812a4e631 100644
--- a/include/cuco/detail/__config
+++ b/include/cuco/detail/__config
@@ -51,11 +51,6 @@
 #define CUCO_HAS_INT128
 #endif
 
-#if (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 8)
-#define CUCO_HAS_CG_EXPERIMENTAL_REDUCE_UPDATE_ASYNC
-#define _CG_ABI_EXPERIMENTAL
-#endif
-
 #if (__CUDACC_VER_MAJOR__ >= 12)
 #define CUCO_HAS_CG_REDUCE_UPDATE_ASYNC
 #endif
\ No newline at end of file
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index d6f362c5f..c25d68c8e 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -148,11 +148,6 @@ class hyperloglog_ref {
       warp, block_sum, thread_sum, cooperative_groups::plus<fp_type>());
     cooperative_groups::reduce_update_async(
       warp, block_zeroes, thread_zeroes, cooperative_groups::plus<int>());
-#elif defined(CUCO_HAS_CG_EXPERIMENTAL_REDUCE_UPDATE_ASYNC)
-    cooperative_groups::experimental::reduce_update_async(
-      warp, block_sum, thread_sum, cooperative_groups::plus<fp_type>());
-    cooperative_groups::experimental::reduce_update_async(
-      warp, block_zeroes, thread_zeroes, cooperative_groups::plus<int>());
 #else
     auto const warp_sum =
       cooperative_groups::reduce(warp, thread_sum, cooperative_groups::plus<fp_type>());

From 3506ecbf2673d3d012e8c0872efd8110faa37486 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 2 Feb 2024 00:15:40 +0000
Subject: [PATCH 19/78] Fix bit-shifting bug that led to high error rates

---
 include/cuco/detail/hyperloglog/hyperloglog.cuh     | 3 ++-
 include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index a08cfa942..a9288bb80 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -278,10 +278,11 @@ class hyperloglog {
     using fp_type = typename ref_type<>::fp_type;
     fp_type sum   = 0;
     int zeroes    = 0;
+
     // geometric mean computation + count registers with 0s
     for (std::size_t i = 0; i < registers.size(); ++i) {
       auto const reg = registers[i];
-      sum += fp_type{1} / static_cast<fp_type>(1 << reg);
+      sum += fp_type{1} / static_cast<fp_type>(1ull << reg);
       zeroes += reg == 0;
     }
 
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index c25d68c8e..5fe8d5c3e 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -89,7 +89,7 @@ class hyperloglog_ref {
   __device__ void add(T const& item) noexcept
   {
     // static_assert NumBuckets is not too big
-    auto constexpr register_mask = (1 << Precision) - 1;
+    auto constexpr register_mask = (1ull << Precision) - 1;
     auto const h                 = this->hash_(item);
     auto const reg               = h & register_mask;
     auto const zeroes            = cuda::std::countl_zero(h | register_mask) + 1;  // __clz

From 919d0abe8f5b48408c19d34a4d442899fd2aa9c0 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 2 Feb 2024 00:26:14 +0000
Subject: [PATCH 20/78] Storage cleanups

---
 .../cuco/detail/hyperloglog/hyperloglog.cuh   |  2 +-
 include/cuco/detail/hyperloglog/storage.cuh   | 65 ++-----------------
 2 files changed, 8 insertions(+), 59 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index a9288bb80..f95f7859f 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -280,7 +280,7 @@ class hyperloglog {
     int zeroes    = 0;
 
     // geometric mean computation + count registers with 0s
-    for (std::size_t i = 0; i < registers.size(); ++i) {
+    for (int i = 0; i < registers.size(); ++i) {
       auto const reg = registers[i];
       sum += fp_type{1} / static_cast<fp_type>(1ull << reg);
       zeroes += reg == 0;
diff --git a/include/cuco/detail/hyperloglog/storage.cuh b/include/cuco/detail/hyperloglog/storage.cuh
index fe0a4ff7a..5ca525c69 100644
--- a/include/cuco/detail/hyperloglog/storage.cuh
+++ b/include/cuco/detail/hyperloglog/storage.cuh
@@ -35,6 +35,8 @@ namespace cuco::detail {
 template <int32_t Precision>
 class hyperloglog_dense_registers {
  public:
+  // We use `int` here since this is the smallest type that supports native `atomicMax` on GPUs
+  using register_type = int;  ///< Register array storage
   /**
    * @brief Clears the storage.
    *
@@ -48,14 +50,6 @@ class hyperloglog_dense_registers {
     for (int i = group.thread_rank(); i < this->registers_.size(); i += group.size()) {
       this->registers_[i] = 0;
     }
-
-    // TODO remove test code
-    // int4 constexpr empty{0, 0, 0, 0};
-    // auto vec4 = reinterpret_cast<int4*>(this->storage_.data());
-    // // #pragma unroll 2
-    // for (int i = group.thread_rank(); i < (this->storage_.size() / 4); i += group.size()) {
-    //   vec4[i] = empty;
-    // }
   }
 
   /**
@@ -66,10 +60,7 @@ class hyperloglog_dense_registers {
    *
    * @return Reference to the requested element
    */
-  __host__ __device__ constexpr int& operator[](std::size_t i) noexcept
-  {
-    return this->registers_[i];
-  }
+  __host__ __device__ constexpr int& operator[](int i) noexcept { return this->registers_[i]; }
 
   /**
    * @brief Returns the element at specified location `i`. No bounds checking is performed.
@@ -78,20 +69,14 @@ class hyperloglog_dense_registers {
    *
    * @return Requested element
    */
-  __host__ __device__ constexpr int operator[](std::size_t i) const noexcept
-  {
-    return this->registers_[i];
-  }
+  __host__ __device__ constexpr int operator[](int i) const noexcept { return this->registers_[i]; }
 
   /**
    * @brief Returns the number of elements in the container.
    *
    * @return The number of elements in the container
    */
-  __host__ __device__ constexpr std::size_t size() const noexcept
-  {
-    return this->registers_.size();
-  }
+  __host__ __device__ constexpr int size() const noexcept { return this->registers_.size(); }
 
   /**
    * @brief Atomically updates the register at position `i` with `max(reg[i], value)`.
@@ -102,7 +87,7 @@ class hyperloglog_dense_registers {
    * @param value New value
    */
   template <cuda::thread_scope Scope>
-  __device__ constexpr void update_max(std::size_t i, int value) noexcept
+  __device__ constexpr void update_max(int i, register_type value) noexcept
   {
     if constexpr (Scope == cuda::thread_scope_thread) {
       this->registers_[i] = max(this->registers_[i], value);
@@ -133,45 +118,9 @@ class hyperloglog_dense_registers {
     for (int i = group.thread_rank(); i < this->registers_.size(); i += group.size()) {
       this->update_max<Scope>(i, other.registers_[i]);
     }
-
-    // TODO remove test code
-    /*
-    auto vec4 = reinterpret_cast<int4 const*>(other.storage_.data());
-    // #pragma unroll 2
-    for (int i = group.thread_rank(); i < (this->storage_.size() / 4); i += group.size()) {
-      auto const items = vec4[i];
-      if constexpr (Scope == cuda::thread_scope_thread) {
-        auto max_vec4  = reinterpret_cast<int4*>(this->storage_.data());
-        auto max_items = max_vec4[i];
-        max_items.x    = max(max_items.x, items.x);
-        max_items.y    = max(max_items.y, items.y);
-        max_items.z    = max(max_items.z, items.z);
-        max_items.w    = max(max_items.w, items.w);
-        max_vec4[i]    = max_items;
-      } else if constexpr (Scope == cuda::thread_scope_block) {
-        atomicMax_block(this->storage_.data() + (i * 4 + 0), items.x);
-        atomicMax_block(this->storage_.data() + (i * 4 + 1), items.y);
-        atomicMax_block(this->storage_.data() + (i * 4 + 2), items.z);
-        atomicMax_block(this->storage_.data() + (i * 4 + 3), items.w);
-      } else if constexpr (Scope == cuda::thread_scope_device) {
-        atomicMax(this->storage_.data() + (i * 4 + 0), items.x);
-        atomicMax(this->storage_.data() + (i * 4 + 1), items.y);
-        atomicMax(this->storage_.data() + (i * 4 + 2), items.z);
-        atomicMax(this->storage_.data() + (i * 4 + 3), items.w);
-      } else if constexpr (Scope == cuda::thread_scope_system) {
-        atomicMax_system(this->storage_.data() + (i * 4 + 0), items.x);
-        atomicMax_system(this->storage_.data() + (i * 4 + 1), items.y);
-        atomicMax_system(this->storage_.data() + (i * 4 + 2), items.z);
-        atomicMax_system(this->storage_.data() + (i * 4 + 3), items.w);
-      } else {
-        static_assert(cuco::dependent_false<decltype(Scope)>, "Unsupported thread scope");
-      }
-    }
-    */
   }
 
  private:
-  alignas(sizeof(int) *
-          4) cuda::std::array<int, 1ull << Precision> registers_;  ///< Register array storage
+  cuda::std::array<register_type, 1ull << Precision> registers_;  ///< Register array storage
 };
 }  // namespace cuco::detail

From 52f6e09e10d4e92c9fd028ab5c741da37ad30d6b Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 2 Feb 2024 00:44:46 +0000
Subject: [PATCH 21/78] Update readme

---
 README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/README.md b/README.md
index 30937c0b6..069a70897 100644
--- a/README.md
+++ b/README.md
@@ -232,4 +232,12 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection
 #### Examples:
 - [Host-bulk APIs (TODO)]()
 
+### `distinct_count_estimator`
+
+`cuco::distinct_count_estimator` implements the famous [HyperLogLog++ algorithm](https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf) for approximating the count of distinct items in a multiset/stream.
+
+#### Examples:
+- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/EG7cMssxo))
+- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/va8eE9dqb))
+
 

From 0a0119d9cc6e42ee0885b8683285ad3d4621c11a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20J=C3=BCnger?=
 <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 2 Feb 2024 01:45:52 +0100
Subject: [PATCH 22/78] Fix typo

Co-authored-by: Yunsong Wang <yunsongw@nvidia.com>
---
 tests/distinct_count_estimator/unique_sequence_test.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/distinct_count_estimator/unique_sequence_test.cu b/tests/distinct_count_estimator/unique_sequence_test.cu
index a4d07ba4d..23c86321d 100644
--- a/tests/distinct_count_estimator/unique_sequence_test.cu
+++ b/tests/distinct_count_estimator/unique_sequence_test.cu
@@ -93,7 +93,7 @@ TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence",
   estimator.add(items.begin(), items.begin() + num_items / 2);
   REQUIRE(estimator.estimate() == estimate);
 
-  // Clearing the estimator shoult reset the estimate
+  // Clearing the estimator should reset the estimate
   estimator.clear();
   REQUIRE(estimator.estimate() == 0);
 

From ab50bed27ac1331caccc913bba897ca7294b768f Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 2 Feb 2024 00:48:02 +0000
Subject: [PATCH 23/78] Fix typo

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 069a70897..5283fcf3e 100644
--- a/README.md
+++ b/README.md
@@ -238,6 +238,6 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection
 
 #### Examples:
 - [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/EG7cMssxo))
-- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/va8eE9dqb))
+- [Device-ref APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/va8eE9dqb))
 
 

From 68d2df07658c41a872e8a5ad928966c46c73532d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20J=C3=BCnger?=
 <2955913+sleeepyjack@users.noreply.github.com>
Date: Sat, 3 Feb 2024 02:24:58 +0100
Subject: [PATCH 24/78] Apply suggestions from code review

Co-authored-by: Yunsong Wang <yunsongw@nvidia.com>
---
 README.md                                               | 2 +-
 benchmarks/distinct_count_estimator_bench.cu            | 3 ++-
 examples/distinct_count_estimator/device_ref_example.cu | 2 +-
 include/cuco/detail/hyperloglog/finalizer.cuh           | 3 ++-
 include/cuco/detail/hyperloglog/hyperloglog.cuh         | 3 +--
 include/cuco/detail/hyperloglog/hyperloglog_ref.cuh     | 3 ++-
 6 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 5283fcf3e..48b598da9 100644
--- a/README.md
+++ b/README.md
@@ -234,7 +234,7 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection
 
 ### `distinct_count_estimator`
 
-`cuco::distinct_count_estimator` implements the famous [HyperLogLog++ algorithm](https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf) for approximating the count of distinct items in a multiset/stream.
+`cuco::distinct_count_estimator` implements the well-established [HyperLogLog++ algorithm](https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf) for approximating the count of distinct items in a multiset/stream.
 
 #### Examples:
 - [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/EG7cMssxo))
diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu
index 7ceb305b4..12504f120 100644
--- a/benchmarks/distinct_count_estimator_bench.cu
+++ b/benchmarks/distinct_count_estimator_bench.cu
@@ -26,9 +26,10 @@
 #include <thrust/device_vector.h>
 #include <thrust/iterator/transform_iterator.h>
 
-#include <cstddef>
 #include <cuda/functional>
 
+#include <cstddef>
+
 using namespace cuco::benchmark;
 using namespace cuco::utility;
 
diff --git a/examples/distinct_count_estimator/device_ref_example.cu b/examples/distinct_count_estimator/device_ref_example.cu
index 8634e6b12..2701e34b7 100644
--- a/examples/distinct_count_estimator/device_ref_example.cu
+++ b/examples/distinct_count_estimator/device_ref_example.cu
@@ -22,7 +22,7 @@
 #include <iostream>
 
 /**
- * @file device_reference_example.cu
+ * @file device_ref_example.cu
  * @brief Demonstrates usage of `cuco::distinct_count_estimator` device-side APIs.
  *
  * This example demonstrates how the non-owning reference type `cuco::distinct_count_estimator_ref`
diff --git a/include/cuco/detail/hyperloglog/finalizer.cuh b/include/cuco/detail/hyperloglog/finalizer.cuh
index 18af4ca1b..2129783fd 100644
--- a/include/cuco/detail/hyperloglog/finalizer.cuh
+++ b/include/cuco/detail/hyperloglog/finalizer.cuh
@@ -17,10 +17,11 @@
 
 #include <cuco/detail/hyperloglog/tuning.cuh>
 
-#include <cstddef>
 #include <cuda/std/cmath>
 #include <cuda/std/limits>
 
+#include <cstddef>
+
 namespace cuco::hyperloglog_ns::detail {
 
 /**
diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index f95f7859f..6d4f9ef58 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -75,7 +75,7 @@ class hyperloglog {
       storage_deleter_{storage_allocator_},
       storage_{storage_allocator_.allocate(1ull), storage_deleter_}
   {
-    this->clear_async(stream);  // TODO async or sync?
+    this->clear_async(stream);
   }
 
   ~hyperloglog() = default;
@@ -84,7 +84,6 @@ class hyperloglog {
   hyperloglog& operator=(hyperloglog const&) = delete;
   hyperloglog(hyperloglog&&)                 = default;  ///< Move constructor
 
-  // TODO this is somehow required to pass the Doxygen check.
   /**
    * @brief Copy-assignment operator.
    *
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index 5fe8d5c3e..a045ca20c 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -22,12 +22,13 @@
 #include <cuco/utility/cuda_thread_scope.cuh>
 #include <cuco/utility/traits.hpp>
 
-#include <cstddef>
 #include <cuda/std/bit>
 
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
 
+#include <cstddef>
+
 namespace cuco::detail {
 /**
  * @brief A GPU-accelerated utility for approximating the number of distinct items in a multiset.

From 93f68a2515214d9b59e58cb773fa3ae3af2c8bf7 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Tue, 6 Feb 2024 15:26:14 +0000
Subject: [PATCH 25/78] Use CUDART_VERSION instead of __CUDACC_VER_MAJOR__

---
 include/cuco/detail/__config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/cuco/detail/__config b/include/cuco/detail/__config
index 812a4e631..ba300d4dc 100644
--- a/include/cuco/detail/__config
+++ b/include/cuco/detail/__config
@@ -51,6 +51,6 @@
 #define CUCO_HAS_INT128
 #endif
 
-#if (__CUDACC_VER_MAJOR__ >= 12)
+#if defined(CUDART_VERSION) && (CUDART_VERSION >= 12000)
 #define CUCO_HAS_CG_REDUCE_UPDATE_ASYNC
 #endif
\ No newline at end of file

From 03a85728c49d0df7f76366fa6a556264cd6f16b3 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Tue, 6 Feb 2024 16:38:44 +0000
Subject: [PATCH 26/78] Apply suggestions from code review

---
 .../cuco/detail/hyperloglog/hyperloglog.cuh   | 49 ++++++++-----------
 .../detail/hyperloglog/hyperloglog_ref.cuh    |  4 +-
 2 files changed, 22 insertions(+), 31 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index 6d4f9ef58..94850c0c7 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -16,10 +16,13 @@
 #pragma once
 
 #include <cuco/cuda_stream_ref.hpp>
+#include <cuco/detail/error.hpp>
 #include <cuco/detail/hyperloglog/finalizer.cuh>
 #include <cuco/detail/hyperloglog/hyperloglog_ref.cuh>
 #include <cuco/detail/hyperloglog/kernels.cuh>
 #include <cuco/detail/hyperloglog/storage.cuh>
+#include <cuco/detail/storage/storage_base.cuh>
+#include <cuco/detail/utils.hpp>
 #include <cuco/hash_functions.cuh>
 #include <cuco/utility/allocator.hpp>
 #include <cuco/utility/cuda_thread_scope.cuh>
@@ -54,11 +57,12 @@ class hyperloglog {
   using ref_type = hyperloglog_ref<T, Precision, NewScope, Hash>;  ///< Non-owning reference
                                                                    ///< type
 
-  using allocator_type         = Allocator;                          ///< Allocator type
-  using value_type             = typename ref_type<>::value_type;    ///< Type of items to count
-  using storage_type           = typename ref_type<>::storage_type;  ///< Storage type
-  using storage_allocator_type = typename std::allocator_traits<Allocator>::template rebind_alloc<
-    storage_type>;  ///< Storage allocator type
+  using value_type   = typename ref_type<>::value_type;    ///< Type of items to count
+  using storage_type = typename ref_type<>::storage_type;  ///< Storage type
+  using hash_type    = typename ref_type<>::hash_type;     ///< Hash function type
+  using allocator_type =
+    typename std::allocator_traits<Allocator>::template rebind_alloc<storage_type>;  ///< Allocator
+                                                                                     ///< type
 
   /**
    * @brief Constructs a `hyperloglog` host object.
@@ -71,9 +75,9 @@ class hyperloglog {
    */
   constexpr hyperloglog(Hash const& hash, Allocator const& alloc, cuco::cuda_stream_ref stream)
     : hash_{hash},
-      storage_allocator_{alloc},
-      storage_deleter_{storage_allocator_},
-      storage_{storage_allocator_.allocate(1ull), storage_deleter_}
+      allocator_{alloc},
+      deleter_{1ull, allocator_},
+      storage_{allocator_.allocate(1ull), deleter_}
   {
     this->clear_async(stream);
   }
@@ -128,9 +132,9 @@ class hyperloglog {
    * @param stream CUDA stream this operation is executed in
    */
   template <class InputIt>
-  void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream) noexcept
+  void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream)
   {
-    auto const num_items = cuco::detail::distance(first, last);  // TODO include
+    auto const num_items = cuco::detail::distance(first, last);
     if (num_items == 0) { return; }
 
     // TODO fallback to local memory registers in case they don't fit in shmem
@@ -141,9 +145,8 @@ class hyperloglog {
     // We make use of the occupancy calculator here to get the minimum number of blocks which still
     // saturate the GPU. This reduces the atomic contention on the final register array during the
     // merge phase.
-    // TODO check cuda error or will it sync the stream??
-    cudaOccupancyMaxPotentialBlockSize(
-      &grid_size, &block_size, &cuco::hyperloglog_ns::detail::add_shmem<InputIt, ref_type<>>);
+    CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(
+      &grid_size, &block_size, &cuco::hyperloglog_ns::detail::add_shmem<InputIt, ref_type<>>));
 
     cuco::hyperloglog_ns::detail::add_shmem<<<grid_size, block_size, 0, stream>>>(
       first, num_items, this->ref());
@@ -314,22 +317,10 @@ class hyperloglog {
   [[nodiscard]] auto hash() const noexcept { return this->hash_; }
 
  private:
-  struct storage_deleter {
-    using pointer = typename storage_allocator_type::value_type*;
-
-    storage_deleter(storage_allocator_type& a) : allocator{a} {}
-
-    storage_deleter(storage_deleter const&) = default;
-
-    void operator()(pointer ptr) { allocator.deallocate(ptr, 1); }
-
-    storage_allocator_type& allocator;
-  };
-
-  Hash hash_;                                               ///< Hash function used to hash items
-  storage_allocator_type storage_allocator_;                ///< Storage allocator
-  storage_deleter storage_deleter_;                         ///< Storage deleter
-  std::unique_ptr<storage_type, storage_deleter> storage_;  ///< Storage
+  hash_type hash_;                                       ///< Hash function used to hash items
+  allocator_type allocator_;                             ///< Storage allocator
+  custom_deleter<std::size_t, allocator_type> deleter_;  ///< Storage deleter
+  std::unique_ptr<storage_type, custom_deleter<std::size_t, allocator_type>> storage_;  ///< Storage
 
   // Needs to be friends with other instantiations of this class template to have access to their
   // storage
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index a045ca20c..c9ace51c3 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -53,6 +53,7 @@ class hyperloglog_ref {
 
   using value_type   = T;                                       ///< Type of items to count
   using storage_type = hyperloglog_dense_registers<Precision>;  ///< Storage type
+  using hash_type    = Hash;                                    ///< Hash function type
 
   template <cuda::thread_scope NewScope>
   using with_scope = hyperloglog_ref<T, Precision, NewScope, Hash>;  ///< Ref type with different
@@ -173,8 +174,7 @@ class hyperloglog_ref {
   }
 
  private:
-  Hash hash_;  ///< Hash function used to hash items
-  // TODO is a reference the right choice here??
+  hash_type hash_;         ///< Hash function used to hash items
   storage_type& storage_;  ///< Reference to storage object
 
   template <class T_, int32_t Precision_, cuda::thread_scope Scope_, class Hash_>

From 33f7bafbdfc130b8d459b85325f818ab417b64e8 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 16 Feb 2024 01:02:54 +0000
Subject: [PATCH 27/78] Enable Precision>18; fix some bugs, extend tests.

---
 include/cuco/detail/hyperloglog/finalizer.cuh | 17 ++++-----
 .../cuco/detail/hyperloglog/hyperloglog.cuh   | 28 ++++++++++-----
 .../detail/hyperloglog/hyperloglog_ref.cuh    | 11 +++---
 include/cuco/detail/hyperloglog/kernels.cuh   | 22 +++++++++---
 .../unique_sequence_test.cu                   | 36 +++++++------------
 5 files changed, 63 insertions(+), 51 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/finalizer.cuh b/include/cuco/detail/hyperloglog/finalizer.cuh
index 2129783fd..8b221f6ba 100644
--- a/include/cuco/detail/hyperloglog/finalizer.cuh
+++ b/include/cuco/detail/hyperloglog/finalizer.cuh
@@ -56,15 +56,15 @@ class finalizer {
     if (v > 0) {
       // Use linear counting for small cardinality estimates.
       double const h = m * log(static_cast<double>(m) / v);
-      // HLL++ is defined only when p < 19, otherwise we need to fallback to HLL.
       // The threshold `2.5 * m` is from the original HLL algorithm.
-      if ((Precision < 19 and h <= threshold<Precision>()) or e <= 2.5 * m) {
-        e = h;
-      } else {
-        e = bias_corrected_estimate(e);
+      if (e <= 2.5 * m) { return cuda::std::round(h); }
+
+      if constexpr (Precision < 19) {
+        e = (h <= threshold<Precision>()) ? h : bias_corrected_estimate(e);
       }
     } else {
-      e = bias_corrected_estimate(e);
+      // HLL++ is defined only when p < 19, otherwise we need to fallback to HLL.
+      if constexpr (Precision < 19) { e = bias_corrected_estimate(e); }
     }
 
     return cuda::std::round(e);
@@ -89,10 +89,7 @@ class finalizer {
 
   __host__ __device__ static double constexpr bias_corrected_estimate(double e) noexcept
   {
-    if constexpr (Precision < 19) {
-      if (e < 5.0 * m) { return e - bias(e); }
-    }
-    return e;
+    return (e < 5.0 * m) ? e - bias(e) : e;
   }
 
   __host__ __device__ static double constexpr bias(double e) noexcept
diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index 94850c0c7..986166836 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -137,19 +137,31 @@ class hyperloglog {
     auto const num_items = cuco::detail::distance(first, last);
     if (num_items == 0) { return; }
 
-    // TODO fallback to local memory registers in case they don't fit in shmem
-
-    int grid_size  = 0;
-    int block_size = 0;
+    int grid_size         = 0;
+    int block_size        = 0;
+    int const shmem_bytes = sizeof(storage_type);
 
     // We make use of the occupancy calculator here to get the minimum number of blocks which still
     // saturate the GPU. This reduces the atomic contention on the final register array during the
     // merge phase.
     CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(
-      &grid_size, &block_size, &cuco::hyperloglog_ns::detail::add_shmem<InputIt, ref_type<>>));
-
-    cuco::hyperloglog_ns::detail::add_shmem<<<grid_size, block_size, 0, stream>>>(
-      first, num_items, this->ref());
+      &grid_size,
+      &block_size,
+      &cuco::hyperloglog_ns::detail::add_shmem<InputIt, ref_type<>>,
+      shmem_bytes));
+
+    if (grid_size != 0) {  // use shmem codepath
+      cuco::hyperloglog_ns::detail::add_shmem<<<grid_size, block_size, shmem_bytes, stream>>>(
+        first, num_items, this->ref());
+    } else {  // use gmem codepath since there is not enough shmem available
+      block_size = 0;
+      CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(
+        &grid_size, &block_size, &cuco::hyperloglog_ns::detail::add_gmem<InputIt, ref_type<>>));
+      CUCO_EXPECTS(grid_size != 0, "Invalid kernel launch configuration");
+
+      cuco::hyperloglog_ns::detail::add_gmem<<<grid_size, block_size, 0, stream>>>(
+        first, num_items, this->ref());
+    }
   }
 
   /**
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index c9ace51c3..e2d57c65d 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -23,6 +23,7 @@
 #include <cuco/utility/traits.hpp>
 
 #include <cuda/std/bit>
+#include <cuda/std/utility>
 
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
@@ -90,11 +91,11 @@ class hyperloglog_ref {
    */
   __device__ void add(T const& item) noexcept
   {
-    // static_assert NumBuckets is not too big
-    auto constexpr register_mask = (1ull << Precision) - 1;
-    auto const h                 = this->hash_(item);
-    auto const reg               = h & register_mask;
-    auto const zeroes            = cuda::std::countl_zero(h | register_mask) + 1;  // __clz
+    using hash_value_type = decltype(cuda::std::declval<hash_type>()(cuda::std::declval<T>()));
+    hash_value_type constexpr register_mask = (1ull << Precision) - 1;
+    auto const h                            = this->hash_(item);
+    auto const reg                          = h & register_mask;
+    auto const zeroes = cuda::std::countl_zero(h | register_mask) + 1;  // __clz
 
     this->storage_.update_max<thread_scope>(reg, zeroes);
   }
diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh
index 07f16b097..653caac95 100644
--- a/include/cuco/detail/hyperloglog/kernels.cuh
+++ b/include/cuco/detail/hyperloglog/kernels.cuh
@@ -35,18 +35,20 @@ CUCO_KERNEL void clear(RefType ref)
 template <class InputIt, class RefType>
 CUCO_KERNEL void add_shmem(InputIt first, cuco::detail::index_type n, RefType ref)
 {
-  using local_ref_type = typename RefType::with_scope<cuda::thread_scope_block>;
+  using local_ref_type     = typename RefType::with_scope<cuda::thread_scope_block>;
+  using local_storage_type = typename local_ref_type::storage_type;
 
-  __shared__ typename local_ref_type::storage_type local_storage;
+  alignas(16) extern __shared__ char shmem[];
+  local_storage_type* local_storage = reinterpret_cast<local_storage_type*>(shmem);
 
   auto const loop_stride = cuco::detail::grid_stride();
   auto idx               = cuco::detail::global_thread_id();
   auto const block       = cooperative_groups::this_thread_block();
 
-  if (block.thread_rank() == 0) { new (&local_storage) typename local_ref_type::storage_type{}; }
+  if (block.thread_rank() == 0) { new (local_storage) local_storage_type{}; }
   block.sync();
 
-  local_ref_type local_ref(local_storage, {});
+  local_ref_type local_ref(*local_storage, {});
   local_ref.clear(block);
   block.sync();
 
@@ -59,6 +61,18 @@ CUCO_KERNEL void add_shmem(InputIt first, cuco::detail::index_type n, RefType re
   ref.merge(block, local_ref);
 }
 
+template <class InputIt, class RefType>
+CUCO_KERNEL void add_gmem(InputIt first, cuco::detail::index_type n, RefType ref)
+{
+  auto const loop_stride = cuco::detail::grid_stride();
+  auto idx               = cuco::detail::global_thread_id();
+
+  while (idx < n) {
+    ref.add(*(first + idx));
+    idx += loop_stride;
+  }
+}
+
 template <class OtherRefType, class RefType>
 CUCO_KERNEL void merge(OtherRefType other_ref, RefType ref)
 {
diff --git a/tests/distinct_count_estimator/unique_sequence_test.cu b/tests/distinct_count_estimator/unique_sequence_test.cu
index 23c86321d..9ebbc6291 100644
--- a/tests/distinct_count_estimator/unique_sequence_test.cu
+++ b/tests/distinct_count_estimator/unique_sequence_test.cu
@@ -32,40 +32,28 @@
 TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence",
                        "",
                        ((typename T, int32_t Precision, typename Hash), T, Precision, Hash),
-                       (int32_t, 9, cuco::xxhash_32<int32_t>),
-                       (int32_t, 10, cuco::xxhash_32<int32_t>),
-                       (int32_t, 11, cuco::xxhash_32<int32_t>),
-                       (int32_t, 12, cuco::xxhash_32<int32_t>),
-                       (int32_t, 13, cuco::xxhash_32<int32_t>),
                        (int32_t, 9, cuco::xxhash_64<int32_t>),
-                       (int32_t, 10, cuco::xxhash_64<int32_t>),
                        (int32_t, 11, cuco::xxhash_64<int32_t>),
-                       (int32_t, 12, cuco::xxhash_64<int32_t>),
                        (int32_t, 13, cuco::xxhash_64<int32_t>),
-                       (int64_t, 9, cuco::xxhash_32<int64_t>),
-                       (int64_t, 10, cuco::xxhash_32<int64_t>),
-                       (int64_t, 11, cuco::xxhash_32<int64_t>),
-                       (int64_t, 12, cuco::xxhash_32<int64_t>),
-                       (int64_t, 13, cuco::xxhash_32<int64_t>),
+                       (int32_t, 16, cuco::xxhash_64<int32_t>),
+                       (int32_t, 18, cuco::xxhash_64<int32_t>),
+                       (int32_t, 20, cuco::xxhash_64<int32_t>),
                        (int64_t, 9, cuco::xxhash_64<int64_t>),
-                       (int64_t, 10, cuco::xxhash_64<int64_t>),
                        (int64_t, 11, cuco::xxhash_64<int64_t>),
-                       (int64_t, 12, cuco::xxhash_64<int64_t>),
                        (int64_t, 13, cuco::xxhash_64<int64_t>),
-                       (__int128_t, 9, cuco::xxhash_32<__int128_t>),
-                       (__int128_t, 10, cuco::xxhash_32<__int128_t>),
-                       (__int128_t, 11, cuco::xxhash_32<__int128_t>),
-                       (__int128_t, 12, cuco::xxhash_32<__int128_t>),
-                       (__int128_t, 13, cuco::xxhash_32<__int128_t>),
+                       (int64_t, 16, cuco::xxhash_64<int64_t>),
+                       (int64_t, 18, cuco::xxhash_64<int64_t>),
+                       (int64_t, 20, cuco::xxhash_64<int64_t>),
                        (__int128_t, 9, cuco::xxhash_64<__int128_t>),
-                       (__int128_t, 10, cuco::xxhash_64<__int128_t>),
                        (__int128_t, 11, cuco::xxhash_64<__int128_t>),
-                       (__int128_t, 12, cuco::xxhash_64<__int128_t>),
-                       (__int128_t, 13, cuco::xxhash_64<__int128_t>))
+                       (__int128_t, 13, cuco::xxhash_64<__int128_t>),
+                       (__int128_t, 16, cuco::xxhash_64<__int128_t>),
+                       (__int128_t, 18, cuco::xxhash_64<__int128_t>),
+                       (__int128_t, 20, cuco::xxhash_64<__int128_t>))
 {
   // This factor determines the error threshold for passing the test
   // TODO might be too high
-  double constexpr tolerance_factor = 3.0;
+  double constexpr tolerance_factor = 2.5;
   // RSD for a given precision is given by the following formula
   double const relative_standard_deviation =
     1.04 / std::sqrt(static_cast<double>(1ull << Precision));
@@ -80,7 +68,7 @@ TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence",
   thrust::sequence(items.begin(), items.end(), 0);
 
   // Initialize the estimator
-  cuco::distinct_count_estimator<T> estimator;
+  cuco::distinct_count_estimator<T, Precision, cuda::thread_scope_device, Hash> estimator;
 
   REQUIRE(estimator.estimate() == 0);
 

From b1253bfa7b0860b85dddaac0e1d82d6b63696b94 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 13 Mar 2024 12:46:13 +0000
Subject: [PATCH 28/78] Remove storage class and move host implementations to
 ref class

---
 .../device_ref_example.cu                     |   9 +-
 .../distinct_count_estimator.inl              |  29 +-
 .../distinct_count_estimator_ref.inl          |  89 +++++-
 .../cuco/detail/hyperloglog/hyperloglog.cuh   | 159 +++-------
 .../detail/hyperloglog/hyperloglog_ref.cuh    | 276 +++++++++++++++++-
 include/cuco/detail/hyperloglog/kernels.cuh   |  14 +-
 include/cuco/distinct_count_estimator.cuh     |  29 +-
 include/cuco/distinct_count_estimator_ref.cuh | 126 +++++++-
 8 files changed, 583 insertions(+), 148 deletions(-)

diff --git a/examples/distinct_count_estimator/device_ref_example.cu b/examples/distinct_count_estimator/device_ref_example.cu
index 2701e34b7..845634388 100644
--- a/examples/distinct_count_estimator/device_ref_example.cu
+++ b/examples/distinct_count_estimator/device_ref_example.cu
@@ -37,18 +37,15 @@ __global__ void piggyback_kernel(RefType ref, InputIt first, std::size_t n)
   using local_ref_type = typename RefType::with_scope<cuda::thread_scope_block>;
 
   // Shared memory storage for the block-local estimator
-  __shared__ typename local_ref_type::storage_type local_storage;
+  alignas(local_ref_type::sketch_alignment())
+    __shared__ std::byte local_sketch[local_ref_type::sketch_bytes()];
 
   auto const loop_stride = gridDim.x * blockDim.x;
   auto idx               = blockDim.x * blockIdx.x + threadIdx.x;
   auto const block       = cooperative_groups::this_thread_block();
 
-  // Initialize the local storage object
-  if (block.thread_rank() == 0) { new (&local_storage) typename local_ref_type::storage_type{}; }
-  block.sync();
-
   // Create the local estimator with the shared memory storage
-  local_ref_type local_ref(local_storage, {});
+  local_ref_type local_ref(cuda::std::span{local_sketch, local_ref_type::sketch_bytes()}, {});
 
   // Initialize the local estimator
   local_ref.clear(block);
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
index 79488e0e1..df68a0593 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
@@ -98,6 +98,33 @@ template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, clas
 typename distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::ref_type<>
 distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::ref() const noexcept
 {
-  return ref_type<>{this->impl_->storage_ref(), this->impl_->hash()};
+  return {this->sketch(), this->hash()};
 }
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+auto distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::hash() const noexcept
+{
+  return this->impl_->hash();
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+auto distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::sketch() const noexcept
+{
+  return this->impl_->sketch();
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+constexpr size_t
+distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::sketch_bytes() noexcept
+{
+  return impl_type::sketch_bytes();
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+constexpr size_t
+distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::sketch_alignment() noexcept
+{
+  return impl_type::sketch_alignment();  // forwards to the implementation's alignment query
+}
+
 }  // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
index 3b940edfd..50bea1675 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
@@ -17,9 +17,10 @@
 namespace cuco {
 
 template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+template <class U, std::size_t N>
 __host__ __device__ constexpr distinct_count_estimator_ref<T, Precision, Scope, Hash>::
-  distinct_count_estimator_ref(storage_type& storage, Hash const& hash) noexcept
-  : impl_{storage, hash}
+  distinct_count_estimator_ref(cuda::std::span<U, N> sketch_span, Hash const& hash) noexcept
+  : impl_{sketch_span, hash}
 {
 }
 
@@ -31,12 +32,42 @@ __device__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::clear(
   this->impl_.clear(group);
 }
 
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+__host__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::clear_async(
+  cuco::cuda_stream_ref stream) noexcept
+{
+  this->impl_.clear_async(stream);
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+__host__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::clear(
+  cuco::cuda_stream_ref stream)
+{
+  this->impl_.clear(stream);
+}
+
 template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
 __device__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::add(T const& item) noexcept
 {
   this->impl_.add(item);
 }
 
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+template <class InputIt>
+__host__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::add_async(
+  InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+{
+  this->impl_.add_async(first, last, stream);
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+template <class InputIt>
+__host__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::add(
+  InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+{
+  this->impl_.add(first, last, stream);
+}
+
 template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
 template <class CG, cuda::thread_scope OtherScope>
 __device__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::merge(
@@ -46,10 +77,64 @@ __device__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::merge(
   this->impl_.merge(group, other.impl_);
 }
 
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+template <cuda::thread_scope OtherScope>
+__host__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::merge_async(
+  distinct_count_estimator_ref<T, Precision, OtherScope, Hash> const& other,
+  cuco::cuda_stream_ref stream) noexcept
+{
+  this->impl_.merge_async(other.impl_, stream);  // pass the wrapped impl, like device `merge`
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+template <cuda::thread_scope OtherScope>
+__host__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::merge(
+  distinct_count_estimator_ref<T, Precision, OtherScope, Hash> const& other,
+  cuco::cuda_stream_ref stream)
+{
+  this->impl_.merge(other.impl_, stream);  // pass the wrapped impl, like device `merge`
+}
+
 template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
 __device__ std::size_t distinct_count_estimator_ref<T, Precision, Scope, Hash>::estimate(
   cooperative_groups::thread_block const& group) const noexcept
 {
   return this->impl_.estimate(group);
 }
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+__host__ std::size_t distinct_count_estimator_ref<T, Precision, Scope, Hash>::estimate(
+  cuco::cuda_stream_ref stream) const
+{
+  return this->impl_.estimate(stream);
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+__host__ __device__ auto distinct_count_estimator_ref<T, Precision, Scope, Hash>::hash()
+  const noexcept
+{
+  return this->impl_.hash();
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+__host__ __device__ auto distinct_count_estimator_ref<T, Precision, Scope, Hash>::sketch()
+  const noexcept
+{
+  return this->impl_.sketch();
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+__host__ __device__ constexpr std::size_t
+distinct_count_estimator_ref<T, Precision, Scope, Hash>::sketch_bytes() noexcept
+{
+  return impl_type::sketch_bytes();
+}
+
+template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+__host__ __device__ constexpr std::size_t
+distinct_count_estimator_ref<T, Precision, Scope, Hash>::sketch_alignment() noexcept
+{
+  return impl_type::sketch_alignment();
+}
+
 }  // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index 986166836..56e13da66 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -17,14 +17,9 @@
 
 #include <cuco/cuda_stream_ref.hpp>
 #include <cuco/detail/error.hpp>
-#include <cuco/detail/hyperloglog/finalizer.cuh>
 #include <cuco/detail/hyperloglog/hyperloglog_ref.cuh>
-#include <cuco/detail/hyperloglog/kernels.cuh>
-#include <cuco/detail/hyperloglog/storage.cuh>
 #include <cuco/detail/storage/storage_base.cuh>
-#include <cuco/detail/utils.hpp>
 #include <cuco/hash_functions.cuh>
-#include <cuco/utility/allocator.hpp>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
 #include <cstddef>
@@ -57,12 +52,11 @@ class hyperloglog {
   using ref_type = hyperloglog_ref<T, Precision, NewScope, Hash>;  ///< Non-owning reference
                                                                    ///< type
 
-  using value_type   = typename ref_type<>::value_type;    ///< Type of items to count
-  using storage_type = typename ref_type<>::storage_type;  ///< Storage type
-  using hash_type    = typename ref_type<>::hash_type;     ///< Hash function type
+  using value_type = typename ref_type<>::value_type;  ///< Type of items to count
+  using hash_type  = typename ref_type<>::hash_type;   ///< Hash function type
   using allocator_type =
-    typename std::allocator_traits<Allocator>::template rebind_alloc<storage_type>;  ///< Allocator
-                                                                                     ///< type
+    typename std::allocator_traits<Allocator>::template rebind_alloc<std::byte>;  ///< Allocator
+                                                                                  ///< type
 
   /**
    * @brief Constructs a `hyperloglog` host object.
@@ -74,12 +68,12 @@ class hyperloglog {
    * @param stream CUDA stream used to initialize the object
    */
   constexpr hyperloglog(Hash const& hash, Allocator const& alloc, cuco::cuda_stream_ref stream)
-    : hash_{hash},
-      allocator_{alloc},
-      deleter_{1ull, allocator_},
-      storage_{allocator_.allocate(1ull), deleter_}
+    : allocator_{alloc},
+      deleter_{this->sketch_bytes(), this->allocator_},
+      sketch_{this->allocator_.allocate(this->sketch_bytes()), this->deleter_},
+      ref_{cuda::std::span{this->sketch_.get(), this->sketch_bytes()}, hash}
   {
-    this->clear_async(stream);
+    this->ref_.clear_async(stream);
   }
 
   ~hyperloglog() = default;
@@ -100,11 +94,7 @@ class hyperloglog {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  void clear_async(cuco::cuda_stream_ref stream) noexcept
-  {
-    auto constexpr block_size = 1024;
-    cuco::hyperloglog_ns::detail::clear<<<1, block_size, 0, stream>>>(this->ref());
-  }
+  void clear_async(cuco::cuda_stream_ref stream) noexcept { this->ref_.clear_async(stream); }
 
   /**
    * @brief Resets the estimator, i.e., clears the current count estimate.
@@ -114,11 +104,7 @@ class hyperloglog {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  void clear(cuco::cuda_stream_ref stream)
-  {
-    this->clear_async(stream);
-    stream.synchronize();
-  }
+  void clear(cuco::cuda_stream_ref stream) { this->ref_.clear(stream); }
 
   /**
    * @brief Asynchronously adds to be counted items to the estimator.
@@ -134,34 +120,7 @@ class hyperloglog {
   template <class InputIt>
   void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream)
   {
-    auto const num_items = cuco::detail::distance(first, last);
-    if (num_items == 0) { return; }
-
-    int grid_size         = 0;
-    int block_size        = 0;
-    int const shmem_bytes = sizeof(storage_type);
-
-    // We make use of the occupancy calculator here to get the minimum number of blocks which still
-    // saturate the GPU. This reduces the atomic contention on the final register array during the
-    // merge phase.
-    CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(
-      &grid_size,
-      &block_size,
-      &cuco::hyperloglog_ns::detail::add_shmem<InputIt, ref_type<>>,
-      shmem_bytes));
-
-    if (grid_size != 0) {  // use shmem codepath
-      cuco::hyperloglog_ns::detail::add_shmem<<<grid_size, block_size, shmem_bytes, stream>>>(
-        first, num_items, this->ref());
-    } else {  // use gmem codepath since there is not enough shmem available
-      block_size = 0;
-      CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(
-        &grid_size, &block_size, &cuco::hyperloglog_ns::detail::add_gmem<InputIt, ref_type<>>));
-      CUCO_EXPECTS(grid_size != 0, "Invalid kernel launch configuration");
-
-      cuco::hyperloglog_ns::detail::add_gmem<<<grid_size, block_size, 0, stream>>>(
-        first, num_items, this->ref());
-    }
+    this->ref_.add_async(first, last, stream);
   }
 
   /**
@@ -181,8 +140,7 @@ class hyperloglog {
   template <class InputIt>
   void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream)
   {
-    this->add_async(first, last, stream);
-    stream.synchronize();
+    this->ref_.add(first, last, stream);
   }
 
   /**
@@ -198,7 +156,7 @@ class hyperloglog {
   void merge_async(hyperloglog<T, Precision, OtherScope, Hash, OtherAllocator> const& other,
                    cuco::cuda_stream_ref stream) noexcept
   {
-    this->merge_async(other.ref(), stream);
+    this->ref_.merge_async(other.ref(), stream);
   }
 
   /**
@@ -217,8 +175,7 @@ class hyperloglog {
   void merge(hyperloglog<T, Precision, OtherScope, Hash, OtherAllocator> const& other,
              cuco::cuda_stream_ref stream)
   {
-    this->merge_async(other, stream);
-    stream.synchronize();
+    this->ref_.merge(other.ref(), stream);
   }
 
   /**
@@ -232,8 +189,7 @@ class hyperloglog {
   template <cuda::thread_scope OtherScope>
   void merge_async(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream) noexcept
   {
-    auto constexpr block_size = 1024;
-    cuco::hyperloglog_ns::detail::merge<<<1, block_size, 0, stream>>>(other, this->ref());
+    this->ref_.merge_async(other, stream);
   }
 
   /**
@@ -250,8 +206,7 @@ class hyperloglog {
   template <cuda::thread_scope OtherScope>
   void merge(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream)
   {
-    this->merge_async(other, stream);
-    stream.synchronize();
+    this->ref_.merge(other, stream);
   }
 
   /**
@@ -265,43 +220,7 @@ class hyperloglog {
    */
   [[nodiscard]] std::size_t estimate(cuco::cuda_stream_ref stream) const
   {
-    // TODO remove test code
-    // std::size_t* result;
-    // cudaMallocHost(&result, sizeof(std::size_t));
-
-    // int grid_size  = 0;
-    // int block_size = 0;
-    // // TODO check cuda error?
-    // cudaOccupancyMaxPotentialBlockSize(
-    //   &grid_size, &block_size, &cuco::hyperloglog_ns::detail::estimate<ref_type<>>);
-
-    // cuco::hyperloglog_ns::detail::estimate<<<grid_size, block_size, 0, stream>>>(
-    //   result, this->ref());
-    // stream.synchronize();
-
-    // return *result;
-
-    // TODO this function currently copies the registers to the host and then finalizes the result;
-    // move computation to device? Edit: host computation is faster -.-
-    storage_type registers;
-    // TODO check if storage is host accessible
-    CUCO_CUDA_TRY(cudaMemcpyAsync(
-      &registers, this->storage_.get(), sizeof(storage_type), cudaMemcpyDeviceToHost, stream));
-    stream.synchronize();
-
-    using fp_type = typename ref_type<>::fp_type;
-    fp_type sum   = 0;
-    int zeroes    = 0;
-
-    // geometric mean computation + count registers with 0s
-    for (int i = 0; i < registers.size(); ++i) {
-      auto const reg = registers[i];
-      sum += fp_type{1} / static_cast<fp_type>(1ull << reg);
-      zeroes += reg == 0;
-    }
-
-    // pass intermediate result to finalizer for bias correction, etc.
-    return cuco::hyperloglog_ns::detail::finalizer<Precision>::finalize(sum, zeroes);
+    return this->ref_.estimate(stream);
   }
 
   /**
@@ -309,30 +228,48 @@ class hyperloglog {
    *
    * @return Device ref object of the current `distinct_count_estimator` host object
    */
-  [[nodiscard]] ref_type<> ref() const noexcept
-  {
-    return ref_type<>{*(this->storage_.get()), this->hash_};
-  }
+  [[nodiscard]] ref_type<> ref() const noexcept { return this->ref_; }
 
   /**
-   * @brief Get storage ref.
+   * @brief Get hash function.
    *
-   * @return Reference to storage
+   * @return The hash function
    */
-  [[nodiscard]] storage_type& storage_ref() const noexcept { return *(this->storage_.get()); }
+  [[nodiscard]] auto hash() const noexcept { return this->ref_.hash(); }
 
   /**
-   * @brief Get hash function.
+   * @brief Gets the span of the sketch.
    *
-   * @return The hash function
+   * @return The cuda::std::span of the sketch
+   */
+  [[nodiscard]] auto sketch() const noexcept { return this->ref_.sketch(); }
+
+  /**
+   * @brief Gets the number of bytes required for the sketch storage.
+   *
+   * @return The number of bytes required for the sketch
    */
-  [[nodiscard]] auto hash() const noexcept { return this->hash_; }
+  [[nodiscard]] static constexpr std::size_t sketch_bytes() noexcept
+  {
+    return ref_type<>::sketch_bytes();
+  }
+
+  /**
+   * @brief Gets the alignment required for the sketch storage.
+   *
+   * @return The required alignment
+   */
+  [[nodiscard]] static constexpr std::size_t sketch_alignment() noexcept
+  {
+    return ref_type<>::sketch_alignment();
+  }
 
  private:
-  hash_type hash_;                                       ///< Hash function used to hash items
   allocator_type allocator_;                             ///< Storage allocator
   custom_deleter<std::size_t, allocator_type> deleter_;  ///< Storage deleter
-  std::unique_ptr<storage_type, custom_deleter<std::size_t, allocator_type>> storage_;  ///< Storage
+  std::unique_ptr<std::byte, custom_deleter<std::size_t, allocator_type>>
+    sketch_;        ///< Sketch storage
+  ref_type<> ref_;  ///< Non-owning ref to the sketch; all operations forward to it
 
   // Needs to be friends with other instantiations of this class template to have access to their
   // storage
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index e2d57c65d..46e61966a 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -15,14 +15,20 @@
  */
 #pragma once
 
+#include <cuco/cuda_stream_ref.hpp>
 #include <cuco/detail/__config>
+#include <cuco/detail/error.hpp>
 #include <cuco/detail/hyperloglog/finalizer.cuh>
-#include <cuco/detail/hyperloglog/storage.cuh>
+#include <cuco/detail/hyperloglog/kernels.cuh>
+#include <cuco/detail/utils.hpp>
 #include <cuco/hash_functions.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
 #include <cuco/utility/traits.hpp>
 
+#include <thrust/host_vector.h>
+
 #include <cuda/std/bit>
+#include <cuda/std/span>
 #include <cuda/std/utility>
 
 #include <cooperative_groups.h>
@@ -31,6 +37,7 @@
 #include <cstddef>
 
 namespace cuco::detail {
+
 /**
  * @brief A GPU-accelerated utility for approximating the number of distinct items in a multiset.
  *
@@ -47,14 +54,15 @@ namespace cuco::detail {
  */
 template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
 class hyperloglog_ref {
+  using register_type = int;  ///< Register array storage
+  // We use `int` here since this is the smallest type that supports native `atomicMax` on GPUs
+  using fp_type = float;  ///< Floating point type used for reduction
  public:
-  using fp_type                      = float;      ///< Floating point type used for reduction
   static constexpr auto thread_scope = Scope;      ///< CUDA thread scope
   static constexpr auto precision    = Precision;  ///< Precision
 
-  using value_type   = T;                                       ///< Type of items to count
-  using storage_type = hyperloglog_dense_registers<Precision>;  ///< Storage type
-  using hash_type    = Hash;                                    ///< Hash function type
+  using value_type = T;     ///< Type of items to count
+  using hash_type  = Hash;  ///< Hash function type
 
   template <cuda::thread_scope NewScope>
   using with_scope = hyperloglog_ref<T, Precision, NewScope, Hash>;  ///< Ref type with different
@@ -63,12 +71,17 @@ class hyperloglog_ref {
   /**
    * @brief Constructs a non-owning `hyperloglog_ref` object.
    *
-   * @param storage Reference to storage object of type `storage_type`
+   * @param sketch_span Reference to sketch storage
    * @param hash The hash function used to hash items
    */
-  __host__ __device__ constexpr hyperloglog_ref(storage_type& storage, Hash const& hash) noexcept
-    : hash_{hash}, storage_{storage}
+  template <class U, std::size_t N>
+  __host__ __device__ constexpr hyperloglog_ref(cuda::std::span<U, N> sketch_span,
+                                                Hash const& hash) noexcept
+    : hash_{hash},
+      sketch_{reinterpret_cast<register_type*>(sketch_span.data()),
+              this->sketch_bytes() / sizeof(register_type)}
   {
+    // TODO check size and alignment
   }
 
   /**
@@ -81,7 +94,34 @@ class hyperloglog_ref {
   template <class CG>
   __device__ void clear(CG const& group) noexcept
   {
-    this->storage_.clear(group);
+    for (int i = group.thread_rank(); i < this->sketch_.size(); i += group.size()) {
+      this->sketch_[i] = 0;
+    }
+  }
+
+  /**
+   * @brief Resets the estimator, i.e., clears the current count estimate.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `clear_async`.
+   *
+   * @param stream CUDA stream this operation is executed in
+   */
+  __host__ void clear(cuco::cuda_stream_ref stream)
+  {
+    this->clear_async(stream);
+    stream.synchronize();
+  }
+
+  /**
+   * @brief Asynchronously resets the estimator, i.e., clears the current count estimate.
+   *
+   * @param stream CUDA stream this operation is executed in
+   */
+  __host__ void clear_async(cuco::cuda_stream_ref stream) noexcept
+  {
+    auto constexpr block_size = 1024;
+    cuco::hyperloglog_ns::detail::clear<<<1, block_size, 0, stream>>>(*this);
   }
 
   /**
@@ -97,7 +137,83 @@ class hyperloglog_ref {
     auto const reg                          = h & register_mask;
     auto const zeroes = cuda::std::countl_zero(h | register_mask) + 1;  // __clz
 
-    this->storage_.update_max<thread_scope>(reg, zeroes);
+    this->update_max(reg, zeroes);
+  }
+
+  /**
+   * @brief Asynchronously adds to be counted items to the estimator.
+   *
+   * @tparam InputIt Device accessible random access input iterator where
+   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
+   * T></tt> is `true`
+   *
+   * @param first Beginning of the sequence of items
+   * @param last End of the sequence of items
+   * @param stream CUDA stream this operation is executed in
+   */
+  template <class InputIt>
+  __host__ void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+  {
+    auto const num_items = cuco::detail::distance(first, last);
+    if (num_items == 0) { return; }
+
+    int grid_size         = 0;
+    int block_size        = 0;
+    int const shmem_bytes = sketch_bytes();
+
+    // TODO specialize for is_continuous_iterator -> use memcpy_async
+
+    // try expanding shmem partition beyond 48KB if necessary
+    bool const fits_shmem =
+      cudaSuccess ==
+      cudaFuncSetAttribute(cuco::hyperloglog_ns::detail::add_shmem<InputIt, hyperloglog_ref>,
+                           cudaFuncAttributeMaxDynamicSharedMemorySize,
+                           shmem_bytes);
+
+    // We make use of the occupancy calculator to get the minimum number of blocks which still
+    // saturates the GPU. This reduces the shmem initialization overhead and atomic contention on
+    // the final register array during the merge phase.
+    if (fits_shmem) {  // use shmem codepath
+      CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(
+        &grid_size,
+        &block_size,
+        &cuco::hyperloglog_ns::detail::add_shmem<InputIt, hyperloglog_ref>,
+        shmem_bytes));
+
+      cuco::hyperloglog_ns::detail::add_shmem<<<grid_size, block_size, shmem_bytes, stream>>>(
+        first, num_items, *this);
+    } else {  // use gmem codepath since there is not enough shmem available
+      block_size = 0;
+      CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(
+        &grid_size,
+        &block_size,
+        &cuco::hyperloglog_ns::detail::add_gmem<InputIt, hyperloglog_ref>));
+      CUCO_EXPECTS(grid_size != 0, "Invalid kernel launch configuration");
+
+      cuco::hyperloglog_ns::detail::add_gmem<<<grid_size, block_size, 0, stream>>>(
+        first, num_items, *this);
+    }
+  }
+
+  /**
+   * @brief Adds to be counted items to the estimator.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `add_async`.
+   *
+   * @tparam InputIt Device accessible random access input iterator where
+   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
+   * T></tt> is `true`
+   *
+   * @param first Beginning of the sequence of items
+   * @param last End of the sequence of items
+   * @param stream CUDA stream this operation is executed in
+   */
+  template <class InputIt>
+  __host__ void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+  {
+    this->add_async(first, last, stream);
+    stream.synchronize();
   }
 
   /**
@@ -113,7 +229,44 @@ class hyperloglog_ref {
   __device__ void merge(CG const& group,
                         hyperloglog_ref<T, Precision, OtherScope, Hash> const& other) noexcept
   {
-    this->storage_.merge<thread_scope>(group, other.storage_);
+    for (int i = group.thread_rank(); i < this->sketch_.size(); i += group.size()) {
+      this->update_max(i, other.sketch_[i]);
+    }
+  }
+
+  /**
+   * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator.
+   *
+   * @tparam OtherScope Thread scope of `other` estimator
+   *
+   * @param other Other estimator reference to be merged into `*this`
+   * @param stream CUDA stream this operation is executed in
+   */
+  template <cuda::thread_scope OtherScope>
+  __host__ void merge_async(hyperloglog_ref<T, Precision, OtherScope, Hash> const& other,
+                            cuco::cuda_stream_ref stream) noexcept
+  {
+    auto constexpr block_size = 1024;
+    cuco::hyperloglog_ns::detail::merge<<<1, block_size, 0, stream>>>(other, *this);
+  }
+
+  /**
+   * @brief Merges the result of `other` estimator reference into `*this` estimator.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `merge_async`.
+   *
+   * @tparam OtherScope Thread scope of `other` estimator
+   *
+   * @param other Other estimator reference to be merged into `*this`
+   * @param stream CUDA stream this operation is executed in
+   */
+  template <cuda::thread_scope OtherScope>
+  __host__ void merge(hyperloglog_ref<T, Precision, OtherScope, Hash> const& other,
+                      cuco::cuda_stream_ref stream)
+  {
+    this->merge_async(other, stream);
+    stream.synchronize();
   }
 
   /**
@@ -138,8 +291,8 @@ class hyperloglog_ref {
 
     fp_type thread_sum = 0;
     int thread_zeroes  = 0;
-    for (int i = group.thread_rank(); i < this->storage_.size(); i += group.size()) {
-      auto const reg = this->storage_[i];
+    for (int i = group.thread_rank(); i < this->sketch_.size(); i += group.size()) {
+      auto const reg = this->sketch_[i];
       thread_sum += fp_type{1} / static_cast<fp_type>(1 << reg);
       thread_zeroes += reg == 0;
     }
@@ -174,9 +327,102 @@ class hyperloglog_ref {
     return estimate;
   }
 
+  /**
+   * @brief Compute the estimated distinct items count.
+   *
+   * @note This function synchronizes the given stream.
+   *
+   * @param stream CUDA stream this operation is executed in
+   *
+   * @return Approximate distinct items count
+   */
+  [[nodiscard]] __host__ std::size_t estimate(cuco::cuda_stream_ref stream) const
+  {
+    auto const num_regs = 1ull << Precision;
+    thrust::host_vector<register_type> host_sketch(num_regs);
+
+    // TODO check if storage is host accessible
+    CUCO_CUDA_TRY(cudaMemcpyAsync(host_sketch.data(),
+                                  this->sketch_.data(),
+                                  sizeof(register_type) * num_regs,
+                                  cudaMemcpyDeviceToHost,
+                                  stream));
+    stream.synchronize();
+
+    fp_type sum = 0;
+    int zeroes  = 0;
+
+    // geometric mean computation + count registers with 0s
+    for (auto const reg : host_sketch) {
+      sum += fp_type{1} / static_cast<fp_type>(1ull << reg);
+      zeroes += reg == 0;
+    }
+
+    // pass intermediate result to finalizer for bias correction, etc.
+    return cuco::hyperloglog_ns::detail::finalizer<Precision>::finalize(sum, zeroes);
+  }
+
+  /**
+   * @brief Gets the hash function.
+   *
+   * @return The hash function
+   */
+  [[nodiscard]] __host__ __device__ auto hash() const noexcept { return this->hash_; }
+
+  /**
+   * @brief Gets the span of the sketch.
+   *
+   * @return The cuda::std::span of the sketch
+   */
+  [[nodiscard]] __host__ __device__ auto sketch() const noexcept { return this->sketch_; }
+
+  /**
+   * @brief Gets the number of bytes required for the sketch storage.
+   *
+   * @return The number of bytes required for the sketch
+   */
+  [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes() noexcept
+  {
+    return (1ull << Precision) * sizeof(register_type);
+  }
+
+  /**
+   * @brief Gets the alignment required for the sketch storage.
+   *
+   * @return The required alignment
+   */
+  [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_alignment() noexcept
+  {
+    return alignof(register_type);
+  }
+
  private:
-  hash_type hash_;         ///< Hash function used to hash items
-  storage_type& storage_;  ///< Reference to storage object
+  /**
+   * @brief Atomically updates the register at position `i` with `max(reg[i], value)`.
+   *
+   * @note The update primitive is selected by the class template parameter `Scope`.
+   *
+   * @param i Register index
+   * @param value New value
+   */
+  __device__ constexpr void update_max(int i, register_type value) noexcept
+  {
+    if constexpr (Scope == cuda::thread_scope_thread) {
+      this->sketch_[i] = max(this->sketch_[i], value);
+    } else if constexpr (Scope == cuda::thread_scope_block) {
+      atomicMax_block(&(this->sketch_[i]), value);
+    } else if constexpr (Scope == cuda::thread_scope_device) {
+      atomicMax(&(this->sketch_[i]), value);
+    } else if constexpr (Scope == cuda::thread_scope_system) {
+      atomicMax_system(&(this->sketch_[i]), value);
+    } else {
+      static_assert(cuco::dependent_false<decltype(Scope)>, "Unsupported thread scope");
+    }
+  }
+
+  hash_type hash_;  ///< Hash function used to hash items
+  cuda::std::span<register_type, sketch_bytes() / sizeof(register_type)>
+    sketch_;  ///< HLL sketch storage
 
   template <class T_, int32_t Precision_, cuda::thread_scope Scope_, class Hash_>
   friend class hyperloglog_ref;
diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh
index 653caac95..8b2ab73e9 100644
--- a/include/cuco/detail/hyperloglog/kernels.cuh
+++ b/include/cuco/detail/hyperloglog/kernels.cuh
@@ -18,6 +18,8 @@
 #include <cuco/detail/utility/cuda.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
+#include <cuda/std/span>
+
 #include <cstddef>
 
 #include <cooperative_groups.h>
@@ -35,20 +37,16 @@ CUCO_KERNEL void clear(RefType ref)
 template <class InputIt, class RefType>
 CUCO_KERNEL void add_shmem(InputIt first, cuco::detail::index_type n, RefType ref)
 {
-  using local_ref_type     = typename RefType::with_scope<cuda::thread_scope_block>;
-  using local_storage_type = typename local_ref_type::storage_type;
+  using local_ref_type = typename RefType::with_scope<cuda::thread_scope_block>;
 
-  alignas(16) extern __shared__ char shmem[];
-  local_storage_type* local_storage = reinterpret_cast<local_storage_type*>(shmem);
+  // TODO assert alignment
+  extern __shared__ std::byte local_sketch[];
 
   auto const loop_stride = cuco::detail::grid_stride();
   auto idx               = cuco::detail::global_thread_id();
   auto const block       = cooperative_groups::this_thread_block();
 
-  if (block.thread_rank() == 0) { new (local_storage) local_storage_type{}; }
-  block.sync();
-
-  local_ref_type local_ref(*local_storage, {});
+  local_ref_type local_ref(cuda::std::span{local_sketch, ref.sketch_bytes()}, {});
   local_ref.clear(block);
   block.sync();
 
diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh
index 5a9a16c85..38f9cbd16 100644
--- a/include/cuco/distinct_count_estimator.cuh
+++ b/include/cuco/distinct_count_estimator.cuh
@@ -61,7 +61,6 @@ class distinct_count_estimator {
 
   using value_type     = typename impl_type::value_type;      ///< Type of items to count
   using allocator_type = typename impl_type::allocator_type;  ///< Allocator type
-  using storage_type   = typename impl_type::storage_type;    ///< Storage type
 
   // TODO enable CTAD
   /**
@@ -212,6 +211,34 @@ class distinct_count_estimator {
    */
   [[nodiscard]] ref_type<> ref() const noexcept;
 
+  /**
+   * @brief Get hash function.
+   *
+   * @return The hash function
+   */
+  [[nodiscard]] auto hash() const noexcept;
+
+  /**
+   * @brief Gets the span of the sketch.
+   *
+   * @return The cuda::std::span of the sketch
+   */
+  [[nodiscard]] auto sketch() const noexcept;
+
+  /**
+   * @brief Gets the number of bytes required for the sketch storage.
+   *
+   * @return The number of bytes required for the sketch
+   */
+  [[nodiscard]] static constexpr std::size_t sketch_bytes() noexcept;
+
+  /**
+   * @brief Gets the alignment required for the sketch storage.
+   *
+   * @return The required alignment
+   */
+  [[nodiscard]] static constexpr std::size_t sketch_alignment() noexcept;
+
  private:
   std::unique_ptr<impl_type> impl_;  ///< Implementation object
 };
diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh
index d656d6e17..905a6d379 100644
--- a/include/cuco/distinct_count_estimator_ref.cuh
+++ b/include/cuco/distinct_count_estimator_ref.cuh
@@ -15,12 +15,15 @@
  */
 #pragma once
 
+#include <cuco/cuda_stream_ref.hpp>
 #include <cuco/detail/hyperloglog/hyperloglog_ref.cuh>
 #include <cuco/hash_functions.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
 #include <cooperative_groups.h>
 
+#include <cstddef>
+
 namespace cuco {
 /**
  * @brief A GPU-accelerated utility for approximating the number of distinct items in a multiset.
@@ -44,8 +47,7 @@ class distinct_count_estimator_ref {
   static constexpr auto thread_scope = impl_type::thread_scope;  ///< CUDA thread scope
   static constexpr auto precision    = impl_type::precision;     ///< Precision
 
-  using value_type   = typename impl_type::value_type;    ///< Type of items to count
-  using storage_type = typename impl_type::storage_type;  ///< Storage type
+  using value_type = typename impl_type::value_type;  ///< Type of items to count
 
   template <cuda::thread_scope NewScope>
   using with_scope =
@@ -56,10 +58,11 @@ class distinct_count_estimator_ref {
   /**
    * @brief Constructs a non-owning `distinct_count_estimator_ref` object.
    *
-   * @param storage Reference to storage object of type `storage_type`
+   * @param sketch_span Reference to sketch storage
    * @param hash The hash function used to hash items
    */
-  __host__ __device__ constexpr distinct_count_estimator_ref(storage_type& storage,
+  template <class U, std::size_t N>
+  __host__ __device__ constexpr distinct_count_estimator_ref(cuda::std::span<U, N> sketch_span,
                                                              Hash const& hash = {}) noexcept;
 
   /**
@@ -72,6 +75,23 @@ class distinct_count_estimator_ref {
   template <class CG>
   __device__ void clear(CG const& group) noexcept;
 
+  /**
+   * @brief Asynchronously resets the estimator, i.e., clears the current count estimate.
+   *
+   * @param stream CUDA stream this operation is executed in
+   */
+  __host__ void clear_async(cuco::cuda_stream_ref stream = {}) noexcept;
+
+  /**
+   * @brief Resets the estimator, i.e., clears the current count estimate.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `clear_async`.
+   *
+   * @param stream CUDA stream this operation is executed in
+   */
+  __host__ void clear(cuco::cuda_stream_ref stream = {});
+
   /**
    * @brief Adds an item to the estimator.
    *
@@ -79,6 +99,37 @@ class distinct_count_estimator_ref {
    */
   __device__ void add(T const& item) noexcept;
 
+  /**
+   * @brief Asynchronously adds to be counted items to the estimator.
+   *
+   * @tparam InputIt Device accessible random access input iterator where
+   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
+   * T></tt> is `true`
+   *
+   * @param first Beginning of the sequence of items
+   * @param last End of the sequence of items
+   * @param stream CUDA stream this operation is executed in
+   */
+  template <class InputIt>
+  __host__ void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {});
+
+  /**
+   * @brief Adds to be counted items to the estimator.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `add_async`.
+   *
+   * @tparam InputIt Device accessible random access input iterator where
+   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
+   * T></tt> is `true`
+   *
+   * @param first Beginning of the sequence of items
+   * @param last End of the sequence of items
+   * @param stream CUDA stream this operation is executed in
+   */
+  template <class InputIt>
+  __host__ void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {});
+
   /**
    * @brief Merges the result of `other` estimator reference into `*this` estimator reference.
    *
@@ -93,6 +144,34 @@ class distinct_count_estimator_ref {
     CG const& group,
     distinct_count_estimator_ref<T, Precision, OtherScope, Hash> const& other) noexcept;
 
+  /**
+   * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator.
+   *
+   * @tparam OtherScope Thread scope of `other` estimator
+   *
+   * @param other Other estimator reference to be merged into `*this`
+   * @param stream CUDA stream this operation is executed in
+   */
+  template <cuda::thread_scope OtherScope>
+  __host__ void merge_async(
+    distinct_count_estimator_ref<T, Precision, OtherScope, Hash> const& other,
+    cuco::cuda_stream_ref stream = {}) noexcept;
+
+  /**
+   * @brief Merges the result of `other` estimator reference into `*this` estimator.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `merge_async`.
+   *
+   * @tparam OtherScope Thread scope of `other` estimator
+   *
+   * @param other Other estimator reference to be merged into `*this`
+   * @param stream CUDA stream this operation is executed in
+   */
+  template <cuda::thread_scope OtherScope>
+  __host__ void merge(distinct_count_estimator_ref<T, Precision, OtherScope, Hash> const& other,
+                      cuco::cuda_stream_ref stream = {});
+
   /**
    * @brief Compute the estimated distinct items count.
    *
@@ -103,6 +182,45 @@ class distinct_count_estimator_ref {
   [[nodiscard]] __device__ std::size_t estimate(
     cooperative_groups::thread_block const& group) const noexcept;
 
+  /**
+   * @brief Compute the estimated distinct items count.
+   *
+   * @note This function synchronizes the given stream.
+   *
+   * @param stream CUDA stream this operation is executed in
+   *
+   * @return Approximate distinct items count
+   */
+  [[nodiscard]] __host__ std::size_t estimate(cuco::cuda_stream_ref stream = {}) const;
+
+  /**
+   * @brief Gets the hash function.
+   *
+   * @return The hash function
+   */
+  [[nodiscard]] __host__ __device__ auto hash() const noexcept;
+
+  /**
+   * @brief Gets the span of the sketch.
+   *
+   * @return The cuda::std::span of the sketch
+   */
+  [[nodiscard]] __host__ __device__ auto sketch() const noexcept;
+
+  /**
+   * @brief Gets the number of bytes required for the sketch storage.
+   *
+   * @return The number of bytes required for the sketch
+   */
+  [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes() noexcept;
+
+  /**
+   * @brief Gets the alignment required for the sketch storage.
+   *
+   * @return The required alignment
+   */
+  [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_alignment() noexcept;
+
  private:
   impl_type impl_;  ///< Implementation object
 

From 22c083d3d851694dc70035b8cc27e4ce7dd31c8f Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 14 Mar 2024 22:12:37 +0000
Subject: [PATCH 29/78] Remove storage class

---
 include/cuco/detail/hyperloglog/storage.cuh | 126 --------------------
 1 file changed, 126 deletions(-)
 delete mode 100644 include/cuco/detail/hyperloglog/storage.cuh

diff --git a/include/cuco/detail/hyperloglog/storage.cuh b/include/cuco/detail/hyperloglog/storage.cuh
deleted file mode 100644
index 5ca525c69..000000000
--- a/include/cuco/detail/hyperloglog/storage.cuh
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <cuco/utility/cuda_thread_scope.cuh>
-#include <cuco/utility/traits.hpp>
-
-#include <cstddef>
-#include <cuda/std/array>
-
-namespace cuco::detail {
-
-/**
- * @brief Storage class for `hyperloglog` and `hyperloglog_ref`.
- *
- * @note This class implements the dense storage layout from the HyperLogLog++ paper, but uses
- * 4bytes per register instead of only 6bits. This is required since we need to update registers
- * atomically.
- *
- * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy
- */
-template <int32_t Precision>
-class hyperloglog_dense_registers {
- public:
-  // We use `int` here since this is the smallest type that supports native `atomicMax` on GPUs
-  using register_type = int;  ///< Register array storage
-  /**
-   * @brief Clears the storage.
-   *
-   * @tparam CG CUDA Cooperative Group type
-   *
-   * @param group CUDA Cooperative group this operation is executed in
-   */
-  template <class CG>
-  __device__ void constexpr clear(CG const& group) noexcept
-  {
-    for (int i = group.thread_rank(); i < this->registers_.size(); i += group.size()) {
-      this->registers_[i] = 0;
-    }
-  }
-
-  /**
-   * @brief Returns a reference to the element at specified location `i`. No bounds checking is
-   * performed.
-   *
-   * @param i Position of the element to return
-   *
-   * @return Reference to the requested element
-   */
-  __host__ __device__ constexpr int& operator[](int i) noexcept { return this->registers_[i]; }
-
-  /**
-   * @brief Returns the element at specified location `i`. No bounds checking is performed.
-   *
-   * @param i Position of the element to return
-   *
-   * @return Requested element
-   */
-  __host__ __device__ constexpr int operator[](int i) const noexcept { return this->registers_[i]; }
-
-  /**
-   * @brief Returns the number of elements in the container.
-   *
-   * @return The number of elements in the container
-   */
-  __host__ __device__ constexpr int size() const noexcept { return this->registers_.size(); }
-
-  /**
-   * @brief Atomically updates the register at position `i` with `max(reg[i], value)`.
-   *
-   * @tparam Scope CUDA thread scope
-   *
-   * @param i Register index
-   * @param value New value
-   */
-  template <cuda::thread_scope Scope>
-  __device__ constexpr void update_max(int i, register_type value) noexcept
-  {
-    if constexpr (Scope == cuda::thread_scope_thread) {
-      this->registers_[i] = max(this->registers_[i], value);
-    } else if constexpr (Scope == cuda::thread_scope_block) {
-      atomicMax_block(&(this->registers_[i]), value);
-    } else if constexpr (Scope == cuda::thread_scope_device) {
-      atomicMax(&(this->registers_[i]), value);
-    } else if constexpr (Scope == cuda::thread_scope_system) {
-      atomicMax_system(&(this->registers_[i]), value);
-    } else {
-      static_assert(cuco::dependent_false<decltype(Scope)>, "Unsupported thread scope");
-    }
-  }
-
-  /**
-   * @brief Combines the contents of `other` storage into `*this` storage.
-   *
-   * @tparam Scope CUDA thread scope
-   * @tparam CG CUDA Cooperative Group type
-   *
-   * @param group CUDA Cooperative group this operation is executed in
-   * @param other Other storage
-   */
-  template <cuda::thread_scope Scope, class CG>
-  __device__ void constexpr merge(CG const& group,
-                                  hyperloglog_dense_registers const& other) noexcept
-  {
-    for (int i = group.thread_rank(); i < this->registers_.size(); i += group.size()) {
-      this->update_max<Scope>(i, other.registers_[i]);
-    }
-  }
-
- private:
-  cuda::std::array<register_type, 1ull << Precision> registers_;  ///< Register array storage
-};
-}  // namespace cuco::detail

From 56cdc6bce550b7a8d14acd017f300689b2e4c96d Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 15 Mar 2024 00:09:38 +0000
Subject: [PATCH 30/78] Add vectorized add kernel

---
 .../detail/hyperloglog/hyperloglog_ref.cuh    | 110 +++++++++++++-----
 include/cuco/detail/hyperloglog/kernels.cuh   |  37 ++++++
 2 files changed, 115 insertions(+), 32 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index 46e61966a..d658e748e 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -26,6 +26,7 @@
 #include <cuco/utility/traits.hpp>
 
 #include <thrust/host_vector.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
 
 #include <cuda/std/bit>
 #include <cuda/std/span>
@@ -160,38 +161,63 @@ class hyperloglog_ref {
     int grid_size         = 0;
     int block_size        = 0;
     int const shmem_bytes = sketch_bytes();
+    void const* kernel    = nullptr;
+
+    // In case the input iterator represents a contiguous memory segment we can employ efficient
+    // vectorized loads
+    if constexpr (thrust::is_contiguous_iterator_v<InputIt>) {
+      auto const ptr = thrust::raw_pointer_cast(&first[0]);
+      auto const alignment =
+        1 << cuda::std::countr_zero(reinterpret_cast<cuda::std::uintptr_t>(ptr) | 16);
+      auto const vector_size = alignment / sizeof(value_type);
+
+      switch (vector_size) {
+        case 2:
+          kernel = reinterpret_cast<void const*>(
+            cuco::hyperloglog_ns::detail::add_shmem_vectorized<2, hyperloglog_ref>);
+          break;
+        case 4:
+          kernel = reinterpret_cast<void const*>(
+            cuco::hyperloglog_ns::detail::add_shmem_vectorized<4, hyperloglog_ref>);
+          break;
+        case 8:
+          kernel = reinterpret_cast<void const*>(
+            cuco::hyperloglog_ns::detail::add_shmem_vectorized<8, hyperloglog_ref>);
+          break;
+      };
+    }
 
-    // TODO specialize for is_continuous_iterator -> use memcpy_async
-
-    // try expanding shmem partition beyond 48KB if necessary
-    bool const fits_shmem =
-      cudaSuccess ==
-      cudaFuncSetAttribute(cuco::hyperloglog_ns::detail::add_shmem<InputIt, hyperloglog_ref>,
-                           cudaFuncAttributeMaxDynamicSharedMemorySize,
-                           shmem_bytes);
-
-    // We make use of the occupancy calculator to get the minimum number of blocks which still
-    // saturates the GPU. This reduces the shmem initialization overhead and atomic contention on
-    // the final register array during the merge phase.
-    if (fits_shmem) {  // use shmem codepath
-      CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(
-        &grid_size,
-        &block_size,
-        &cuco::hyperloglog_ns::detail::add_shmem<InputIt, hyperloglog_ref>,
-        shmem_bytes));
-
-      cuco::hyperloglog_ns::detail::add_shmem<<<grid_size, block_size, shmem_bytes, stream>>>(
-        first, num_items, *this);
-    } else {  // use gmem codepath since there is not enough shmem available
-      block_size = 0;
-      CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(
-        &grid_size,
-        &block_size,
-        &cuco::hyperloglog_ns::detail::add_gmem<InputIt, hyperloglog_ref>));
-      CUCO_EXPECTS(grid_size != 0, "Invalid kernel launch configuration");
-
-      cuco::hyperloglog_ns::detail::add_gmem<<<grid_size, block_size, 0, stream>>>(
-        first, num_items, *this);
+    if (kernel != nullptr and this->try_reserve_shmem(kernel, shmem_bytes)) {
+      // We make use of the occupancy calculator to get the minimum number of blocks which still
+      // saturates the GPU. This reduces the shmem initialization overhead and atomic contention on
+      // the final register array during the merge phase.
+      CUCO_CUDA_TRY(
+        cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, shmem_bytes));
+
+      auto const ptr      = thrust::raw_pointer_cast(&first[0]);
+      void* kernel_args[] = {
+        (void*)(&ptr),  // TODO can't use reinterpret_cast since it can't cast away const
+        (void*)(&num_items),
+        reinterpret_cast<void*>(this)};
+      CUCO_CUDA_TRY(
+        cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream));
+    } else {
+      kernel = (void const*)cuco::hyperloglog_ns::detail::add_shmem<InputIt, hyperloglog_ref>;
+      void* kernel_args[] = {(void*)(&first), (void*)(&num_items), reinterpret_cast<void*>(this)};
+      if (this->try_reserve_shmem(kernel, shmem_bytes)) {
+        CUCO_CUDA_TRY(
+          cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, shmem_bytes));
+
+        CUCO_CUDA_TRY(
+          cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream));
+      } else {
+        // Computes sketch directly in global memory. (Fallback path in case there is not enough
+        // shared memory avalable)
+        kernel = (void const*)cuco::hyperloglog_ns::detail::add_gmem<InputIt, hyperloglog_ref>;
+
+        CUCO_CUDA_TRY(
+          cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream));
+      }
     }
   }
 
@@ -235,7 +261,8 @@ class hyperloglog_ref {
   }
 
   /**
-   * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator.
+   * @brief Asynchronously merges the result of `other` estimator reference into `*this`
+   * estimator.
    *
    * @tparam OtherScope Thread scope of `other` estimator
    *
@@ -420,6 +447,25 @@ class hyperloglog_ref {
     }
   }
 
+  /**
+   * @brief Try expanding the shmem partition for a given kernel beyond 48KB is necessary.
+   *
+   * @tparam Kernel Type of kernel function
+   *
+   * @param kernel Kernel function whose `cudaFuncAttributeMaxDynamicSharedMemorySize` is set
+   * @param shmem_bytes Number of requested dynamic shared memory bytes
+   *
+   * @returns True iff `cudaFuncSetAttribute` is successful, i.e., returns `cudaSuccess`
+   */
+  template <typename Kernel>
+  [[nodiscard]] __host__ constexpr bool try_reserve_shmem(Kernel kernel,
+                                                          int shmem_bytes) const noexcept
+  {
+    return cudaSuccess == cudaFuncSetAttribute(reinterpret_cast<void const*>(kernel),
+                                               cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                               shmem_bytes);
+  }
+
   hash_type hash_;  ///< Hash function used to hash items
   cuda::std::span<register_type, sketch_bytes() / sizeof(register_type)>
     sketch_;  ///< HLL sketch storage
diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh
index 8b2ab73e9..4da78d020 100644
--- a/include/cuco/detail/hyperloglog/kernels.cuh
+++ b/include/cuco/detail/hyperloglog/kernels.cuh
@@ -18,6 +18,7 @@
 #include <cuco/detail/utility/cuda.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
+#include <cuda/std/array>
 #include <cuda/std/span>
 
 #include <cstddef>
@@ -34,6 +35,42 @@ CUCO_KERNEL void clear(RefType ref)
   if (block.group_index().x == 0) { ref.clear(block); }
 }
 
+// Adds `n` items from a `sizeof(vector_type)`-aligned array using `VectorSize`-wide loads. Items
+// are added to a block-local sketch in dynamic shared memory that is finally merged into `ref`.
+template <int32_t VectorSize, class RefType>
+CUCO_KERNEL void add_shmem_vectorized(typename RefType::value_type const* first,
+                                      cuco::detail::index_type n,
+                                      RefType ref)
+{
+  using value_type     = typename RefType::value_type;
+  using vector_type    = cuda::std::array<value_type, VectorSize>;
+  using local_ref_type = typename RefType::template with_scope<cuda::thread_scope_block>;
+
+  // TODO assert alignment
+  extern __shared__ std::byte local_sketch[];
+  auto const loop_stride = cuco::detail::grid_stride();
+  auto idx               = cuco::detail::global_thread_id();
+  auto const block       = cooperative_groups::this_thread_block();
+  auto const num_vectors = n / VectorSize;  // number of full vectors in the input
+
+  local_ref_type local_ref(cuda::std::span{local_sketch, ref.sketch_bytes()}, {});
+  local_ref.clear(block);
+  block.sync();
+
+  for (; idx < num_vectors; idx += loop_stride) {
+    auto const vec = *reinterpret_cast<vector_type const*>(
+      __builtin_assume_aligned(first + idx * VectorSize, sizeof(vector_type)));
+    for (auto const& item : vec) { local_ref.add(item); }
+  }
+  // Tail: the `n % VectorSize` items behind the last full vector start at offset `num_vectors *
+  // VectorSize`; `first[idx]` is generally inside the already-processed vectorized range.
+  for (auto i = idx - num_vectors; i < n % VectorSize; i += loop_stride) {
+    local_ref.add(first[num_vectors * VectorSize + i]);
+  }
+  block.sync();
+  ref.merge(block, local_ref);
+}
+
 template <class InputIt, class RefType>
 CUCO_KERNEL void add_shmem(InputIt first, cuco::detail::index_type n, RefType ref)
 {

From 9b4b61294a3ecc904854f739bb8b05c07724e719 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 15 Mar 2024 12:55:15 +0000
Subject: [PATCH 31/78] Add missing kernel config

---
 include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index d658e748e..35ffce286 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -202,7 +202,8 @@ class hyperloglog_ref {
       CUCO_CUDA_TRY(
         cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream));
     } else {
-      kernel = (void const*)cuco::hyperloglog_ns::detail::add_shmem<InputIt, hyperloglog_ref>;
+      kernel = reinterpret_cast<void const*>(
+        cuco::hyperloglog_ns::detail::add_shmem<InputIt, hyperloglog_ref>);
       void* kernel_args[] = {(void*)(&first), (void*)(&num_items), reinterpret_cast<void*>(this)};
       if (this->try_reserve_shmem(kernel, shmem_bytes)) {
         CUCO_CUDA_TRY(
@@ -213,10 +214,12 @@ class hyperloglog_ref {
       } else {
         // Computes sketch directly in global memory. (Fallback path in case there is not enough
         // shared memory avalable)
-        kernel = (void const*)cuco::hyperloglog_ns::detail::add_gmem<InputIt, hyperloglog_ref>;
+        kernel = reinterpret_cast<void const*>(
+          cuco::hyperloglog_ns::detail::add_gmem<InputIt, hyperloglog_ref>);
 
-        CUCO_CUDA_TRY(
-          cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream));
+        CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, 0));
+
+        CUCO_CUDA_TRY(cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, 0, stream));
       }
     }
   }
@@ -448,7 +451,7 @@ class hyperloglog_ref {
   }
 
   /**
-   * @brief Try expanding the shmem partition for a given kernel beyond 48KB is necessary.
+   * @brief Try expanding the shmem partition for a given kernel beyond 48KB if necessary.
    *
    * @tparam Kernel Type of kernel function
    *

From 30bd79ddbd919855261fddb1c2b9765c0e585156 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 15 Mar 2024 13:35:30 +0000
Subject: [PATCH 32/78] Make tuning arrs accessible in non-constexpr context

---
 include/cuco/detail/hyperloglog/finalizer.cuh |  22 +-
 include/cuco/detail/hyperloglog/tuning.cuh    | 226 +++++++-----------
 2 files changed, 97 insertions(+), 151 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/finalizer.cuh b/include/cuco/detail/hyperloglog/finalizer.cuh
index 8b221f6ba..60a7ffcab 100644
--- a/include/cuco/detail/hyperloglog/finalizer.cuh
+++ b/include/cuco/detail/hyperloglog/finalizer.cuh
@@ -60,7 +60,7 @@ class finalizer {
       if (e <= 2.5 * m) { return cuda::std::round(h); }
 
       if constexpr (Precision < 19) {
-        e = (h <= threshold<Precision>()) ? h : bias_corrected_estimate(e);
+        e = (h <= threshold(Precision)) ? h : bias_corrected_estimate(e);
       }
     } else {
       // HLL++ is defined only when p < 19, otherwise we need to fallback to HLL.
@@ -95,7 +95,7 @@ class finalizer {
   __host__ __device__ static double constexpr bias(double e) noexcept
   {
     auto const anchor_index = interpolation_anchor_index(e);
-    int const n             = raw_estimate_data<Precision>().size();
+    int const n             = raw_estimate_data_size(Precision);
 
     auto low  = cuda::std::max(anchor_index - k + 1, 0);
     auto high = cuda::std::min(low + k, n);
@@ -106,8 +106,8 @@ class finalizer {
       high += 1;
     }
 
-    auto const& biases = bias_data<Precision>();
-    double bias_sum    = 0.0;
+    auto biases     = bias_data(Precision);
+    double bias_sum = 0.0;
     for (int i = low; i < high; ++i) {
       bias_sum += biases[i];
     }
@@ -117,15 +117,16 @@ class finalizer {
 
   __host__ __device__ static double distance(double e, int i) noexcept
   {
-    auto const diff = e - raw_estimate_data<Precision>()[i];
+    auto const diff = e - raw_estimate_data(Precision)[i];
     return diff * diff;
   }
 
   __host__ __device__ static int interpolation_anchor_index(double e) noexcept
   {
-    auto const& estimates = raw_estimate_data<Precision>();
-    int left              = 0;
-    int right             = static_cast<int>(estimates.size()) - 1;
+    auto estimates = raw_estimate_data(Precision);
+    int const n    = raw_estimate_data_size(Precision);
+    int left       = 0;
+    int right      = static_cast<int>(n) - 1;
     int mid;
     int candidate_index = 0;  // Index of the closest element found
 
@@ -146,9 +147,8 @@ class finalizer {
     // 'left - 1' to find the closest one, taking care of boundary conditions.
 
     // Distance from 'e' to the element at 'left', if within bounds
-    double const dist_lhs = left < static_cast<int>(estimates.size())
-                              ? cuda::std::abs(estimates[left] - e)
-                              : cuda::std::numeric_limits<double>::max();
+    double const dist_lhs = left < static_cast<int>(n) ? cuda::std::abs(estimates[left] - e)
+                                                       : cuda::std::numeric_limits<double>::max();
     // Distance from 'e' to the element at 'left - 1', if within bounds
     double const dist_rhs = left - 1 >= 0 ? cuda::std::abs(estimates[left - 1] - e)
                                           : cuda::std::numeric_limits<double>::max();
diff --git a/include/cuco/detail/hyperloglog/tuning.cuh b/include/cuco/detail/hyperloglog/tuning.cuh
index 05cacb067..a816ffa89 100644
--- a/include/cuco/detail/hyperloglog/tuning.cuh
+++ b/include/cuco/detail/hyperloglog/tuning.cuh
@@ -26,53 +26,11 @@ namespace cuco::hyperloglog_ns::detail {
 #endif
 
 // clang-format off
-template <int32_t Precision>
-__host__ __device__ constexpr auto threshold() noexcept;
+CUCO_HLL_TUNING_ARR_DECL threshold_data{10.0, 20.0, 40.0, 80.0, 220.0, 400.0, 900.0, 1800.0, 3100.0, 6500.0, 15500.0, 20000.0, 50000.0, 120000.0, 350000.0};
 
-template <>
-__host__ __device__ constexpr auto threshold<4>() noexcept { return 10.0; };
-
-template <>
-__host__ __device__ constexpr auto threshold<5>() noexcept { return 20.0; };
-
-template <>
-__host__ __device__ constexpr auto threshold<6>() noexcept { return 40.0; };
-
-template <>
-__host__ __device__ constexpr auto threshold<7>() noexcept { return 80.0; };
-
-template <>
-__host__ __device__ constexpr auto threshold<8>() noexcept { return 220.0; };
-
-template <>
-__host__ __device__ constexpr auto threshold<9>() noexcept { return 400.0; };
-
-template <>
-__host__ __device__ constexpr auto threshold<10>() noexcept { return 900.0; };
-
-template <>
-__host__ __device__ constexpr auto threshold<11>() noexcept { return 1800.0; };
-
-template <>
-__host__ __device__ constexpr auto threshold<12>() noexcept { return 3100.0; };
-
-template <>
-__host__ __device__ constexpr auto threshold<13>() noexcept { return 6500.0; };
-
-template <>
-__host__ __device__ constexpr auto threshold<14>() noexcept { return 15500.0; };
-
-template <>
-__host__ __device__ constexpr auto threshold<15>() noexcept { return 20000.0; };
-
-template <>
-__host__ __device__ constexpr auto threshold<16>() noexcept { return 50000.0; };
-
-template <>
-__host__ __device__ constexpr auto threshold<17>() noexcept { return 120000.0; };
-
-template <>
-__host__ __device__ constexpr auto threshold<18>() noexcept { return 350000.0; };
+__host__ __device__ constexpr auto threshold(int32_t precision) noexcept {
+  return threshold_data[precision - 4];  // table covers precisions 4..18 (15 entries); no bounds check
+}
 
 // HLL++ uses an interpolation method over the raw estimated cardinality to select the optimal bias.
 // Parameters/interpolation points taken from
@@ -94,53 +52,47 @@ CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p17{94542.0, 96125.811, 97728.019, 99
 CUCO_HLL_TUNING_ARR_DECL raw_estimate_data_p18{189084.0, 192250.913, 195456.774, 198696.946, 201977.762, 205294.444, 208651.754, 212042.099, 215472.269, 218941.91, 222443.912, 225996.845, 229568.199, 233193.568, 236844.457, 240543.233, 244279.475, 248044.27, 251854.588, 255693.2, 259583.619, 263494.621, 267445.385, 271454.061, 275468.769, 279549.456, 283646.446, 287788.198, 291966.099, 296181.164, 300431.469, 304718.618, 309024.004, 313393.508, 317760.803, 322209.731, 326675.061, 331160.627, 335654.47, 340241.442, 344841.833, 349467.132, 354130.629, 358819.432, 363574.626, 368296.587, 373118.482, 377914.93, 382782.301, 387680.669, 392601.981, 397544.323, 402529.115, 407546.018, 412593.658, 417638.657, 422762.865, 427886.169, 433017.167, 438213.273, 443441.254, 448692.421, 453937.533, 459239.049, 464529.569, 469910.083, 475274.03, 480684.473, 486070.26, 491515.237, 496995.651, 502476.617, 507973.609, 513497.19, 519083.233, 524726.509, 530305.505, 535945.728, 541584.404, 547274.055, 552967.236, 558667.862, 564360.216, 570128.148, 575965.08, 581701.952, 587532.523, 593361.144, 599246.128, 605033.418, 610958.779, 616837.117, 622772.818, 628672.04, 634675.369, 640574.831, 646585.739, 652574.547, 658611.217, 664642.684, 670713.914, 676737.681, 682797.313, 688837.897, 694917.874, 701009.882, 707173.648, 713257.254, 719415.392, 725636.761, 731710.697, 737906.209, 744103.074, 750313.39, 756504.185, 762712.579, 768876.985, 775167.859, 781359.0, 787615.959, 793863.597, 800245.477, 806464.582, 812785.294, 819005.925, 825403.057, 831676.197, 837936.284, 844266.968, 850642.711, 856959.756, 863322.774, 869699.931, 876102.478, 882355.787, 888694.463, 895159.952, 901536.143, 907872.631, 914293.672, 920615.14, 927130.974, 933409.404, 939922.178, 946331.47, 952745.93, 959209.264, 965590.224, 972077.284, 978501.961, 984953.19, 991413.271, 997817.479, 1004222.658, 1010725.676, 1017177.138, 1023612.529, 1030098.236, 1036493.719, 1043112.207, 1049537.036, 1056008.096, 1062476.184, 
1068942.337, 1075524.95, 1081932.864, 1088426.025, 1094776.005, 1101327.448, 1107901.673, 1114423.639, 1120884.602, 1127324.923, 1133794.24, 1140328.886, 1146849.376, 1153346.682, 1159836.502, 1166478.703, 1172953.304, 1179391.502, 1185950.982, 1192544.052, 1198913.41, 1205430.994, 1212015.525, 1218674.042, 1225121.683, 1231551.101, 1238126.379, 1244673.795, 1251260.649, 1257697.86, 1264320.983, 1270736.319, 1277274.694, 1283804.95, 1290211.514, 1296858.568, 1303455.691};
 
 // helpers for selecting the corresponding arrays for a given precision
-template <int32_t Precision>
-__host__ __device__ auto const& raw_estimate_data() noexcept;
-
-template <>
-__host__ __device__ auto const& raw_estimate_data<4>() noexcept { return raw_estimate_data_p4; };
-
-template <>
-__host__ __device__ auto const& raw_estimate_data<5>() noexcept { return raw_estimate_data_p5; };
-
-template <>
-__host__ __device__ auto const& raw_estimate_data<6>() noexcept { return raw_estimate_data_p6; };
-
-template <>
-__host__ __device__ auto const& raw_estimate_data<7>() noexcept { return raw_estimate_data_p7; };
-
-template <>
-__host__ __device__ auto const& raw_estimate_data<8>() noexcept { return raw_estimate_data_p8; };
-
-template <>
-__host__ __device__ auto const& raw_estimate_data<9>() noexcept { return raw_estimate_data_p9; };
-
-template <>
-__host__ __device__ auto const& raw_estimate_data<10>() noexcept { return raw_estimate_data_p10; };
-
-template <>
-__host__ __device__ auto const& raw_estimate_data<11>() noexcept { return raw_estimate_data_p11; };
-
-template <>
-__host__ __device__ auto const& raw_estimate_data<12>() noexcept { return raw_estimate_data_p12; };
-
-template <>
-__host__ __device__ auto const& raw_estimate_data<13>() noexcept { return raw_estimate_data_p13; };
-
-template <>
-__host__ __device__ auto const& raw_estimate_data<14>() noexcept { return raw_estimate_data_p14; };
-
-template <>
-__host__ __device__ auto const& raw_estimate_data<15>() noexcept { return raw_estimate_data_p15; };
-
-template <>
-__host__ __device__ auto const& raw_estimate_data<16>() noexcept { return raw_estimate_data_p16; };
-
-template <>
-__host__ __device__ auto const& raw_estimate_data<17>() noexcept { return raw_estimate_data_p17; };
-
-template <>
-__host__ __device__ auto const& raw_estimate_data<18>() noexcept { return raw_estimate_data_p18; };
+__host__ __device__ constexpr double const* raw_estimate_data(int32_t precision) noexcept {
+  switch (precision) {
+    case 4:  return raw_estimate_data_p4.data();
+    case 5:  return raw_estimate_data_p5.data();
+    case 6:  return raw_estimate_data_p6.data();
+    case 7:  return raw_estimate_data_p7.data();
+    case 8:  return raw_estimate_data_p8.data();
+    case 9:  return raw_estimate_data_p9.data();
+    case 10: return raw_estimate_data_p10.data();
+    case 11: return raw_estimate_data_p11.data();
+    case 12: return raw_estimate_data_p12.data();
+    case 13: return raw_estimate_data_p13.data();
+    case 14: return raw_estimate_data_p14.data();
+    case 15: return raw_estimate_data_p15.data();
+    case 16: return raw_estimate_data_p16.data();
+    case 17: return raw_estimate_data_p17.data();
+    case 18: return raw_estimate_data_p18.data();
+    default: return nullptr;
+  }
+}
+
+__host__ __device__ constexpr size_t raw_estimate_data_size(int32_t precision) noexcept {
+  switch (precision) {
+    case 4:  return raw_estimate_data_p4.size();
+    case 5:  return raw_estimate_data_p5.size();
+    case 6:  return raw_estimate_data_p6.size();
+    case 7:  return raw_estimate_data_p7.size();
+    case 8:  return raw_estimate_data_p8.size();
+    case 9:  return raw_estimate_data_p9.size();
+    case 10: return raw_estimate_data_p10.size();
+    case 11: return raw_estimate_data_p11.size();
+    case 12: return raw_estimate_data_p12.size();
+    case 13: return raw_estimate_data_p13.size();
+    case 14: return raw_estimate_data_p14.size();
+    case 15: return raw_estimate_data_p15.size();
+    case 16: return raw_estimate_data_p16.size();
+    case 17: return raw_estimate_data_p17.size();
+    case 18: return raw_estimate_data_p18.size();
+    default: return 0;
+  }
+}
 
 CUCO_HLL_TUNING_ARR_DECL bias_data_p4{10.0, 9.717, 9.207, 8.7896, 8.2882, 7.8204, 7.3772, 6.9342, 6.5202, 6.161, 5.7722, 5.4636, 5.0396, 4.6766, 4.3566, 4.0454, 3.7936, 3.4856, 3.2666, 2.9946, 2.766, 2.4692, 2.3638, 2.0764, 1.7864, 1.7602, 1.4814, 1.433, 1.2926, 1.0664, 0.999600000000001, 0.7956, 0.5366, 0.589399999999998, 0.573799999999999, 0.269799999999996, 0.368200000000002, 0.0544000000000011, 0.234200000000001, 0.0108000000000033, -0.203400000000002, -0.0701999999999998, -0.129600000000003, -0.364199999999997, -0.480600000000003, -0.226999999999997, -0.322800000000001, -0.382599999999996, -0.511200000000002, -0.669600000000003, -0.749400000000001, -0.500399999999999, -0.617600000000003, -0.6922, -0.601599999999998, -0.416200000000003, -0.338200000000001, -0.782600000000002, -0.648600000000002, -0.919800000000002, -0.851799999999997, -0.962400000000002, -0.6402, -1.1922, -1.0256, -1.086, -1.21899999999999, -0.819400000000002, -0.940600000000003, -1.1554, -1.2072, -1.1752, -1.16759999999999, -1.14019999999999, -1.3754, -1.29859999999999, -1.607, -1.3292, -1.7606};
 CUCO_HLL_TUNING_ARR_DECL bias_data_p5{22.0, 21.1194, 20.8208, 20.2318, 19.77, 19.2436, 18.7774, 18.2848, 17.8224, 17.3742, 16.9336, 16.503, 16.0494, 15.6292, 15.2124, 14.798, 14.367, 13.9728, 13.5944, 13.217, 12.8438, 12.3696, 12.0956, 11.7044, 11.324, 11.0668, 10.6698, 10.3644, 10.049, 9.6918, 9.4146, 9.082, 8.687, 8.5398, 8.2462, 7.857, 7.6606, 7.4168, 7.1248, 6.9222, 6.6804, 6.447, 6.3454, 5.9594, 5.7636, 5.5776, 5.331, 5.19, 4.9676, 4.7564, 4.5314, 4.4442, 4.3708, 3.9774, 3.9624, 3.8796, 3.755, 3.472, 3.2076, 3.1024, 2.8908, 2.7338, 2.7728, 2.629, 2.413, 2.3266, 2.1524, 2.2642, 2.1806, 2.0566, 1.9192, 1.7598, 1.3516, 1.5802, 1.43859999999999, 1.49160000000001, 1.1524, 1.1892, 0.841399999999993, 0.879800000000003, 0.837599999999995, 0.469800000000006, 0.765600000000006, 0.331000000000003, 0.591399999999993, 0.601200000000006, 0.701599999999999, 0.558199999999999, 0.339399999999998, 0.354399999999998, 0.491200000000006, 0.308000000000007, 0.355199999999996, -0.0254000000000048, 0.205200000000005, -0.272999999999996, 0.132199999999997, 0.394400000000005, -0.241200000000006, 0.242000000000004, 0.191400000000002, 0.253799999999998, -0.122399999999999, -0.370800000000003, 0.193200000000004, -0.0848000000000013, 0.0867999999999967, -0.327200000000005, -0.285600000000002, 0.311400000000006, -0.128399999999999, -0.754999999999995, -0.209199999999996, -0.293599999999998, -0.364000000000004, -0.253600000000006, -0.821200000000005, -0.253600000000006, -0.510400000000004, -0.383399999999995, -0.491799999999998, -0.220200000000006, -0.0972000000000008, -0.557400000000001, -0.114599999999996, -0.295000000000002, -0.534800000000004, 0.346399999999988, -0.65379999999999, 0.0398000000000138, 0.0341999999999985, -0.995800000000003, -0.523400000000009, -0.489000000000004, -0.274799999999999, -0.574999999999989, -0.482799999999997, 0.0571999999999946, -0.330600000000004, -0.628800000000012, -0.140199999999993, -0.540600000000012, -0.445999999999998, -0.599400000000003, 
-0.262599999999992, 0.163399999999996, -0.100599999999986, -0.39500000000001, -1.06960000000001, -0.836399999999998, -0.753199999999993, -0.412399999999991, -0.790400000000005, -0.29679999999999, -0.28540000000001, -0.193000000000012, -0.0772000000000048, -0.962799999999987, -0.414800000000014};
@@ -158,53 +110,47 @@ CUCO_HLL_TUNING_ARR_DECL bias_data_p16{47270.0, 46423.3584, 45585.7074, 44757.15
 CUCO_HLL_TUNING_ARR_DECL bias_data_p17{94541.0, 92848.811, 91174.019, 89517.558, 87879.9705, 86262.7565, 84663.5125, 83083.7435, 81521.7865, 79977.272, 78455.9465, 76950.219, 75465.432, 73994.152, 72546.71, 71115.2345, 69705.6765, 68314.937, 66944.2705, 65591.255, 64252.9485, 62938.016, 61636.8225, 60355.592, 59092.789, 57850.568, 56624.518, 55417.343, 54231.1415, 53067.387, 51903.526, 50774.649, 49657.6415, 48561.05, 47475.7575, 46410.159, 45364.852, 44327.053, 43318.4005, 42325.6165, 41348.4595, 40383.6265, 39436.77, 38509.502, 37594.035, 36695.939, 35818.6895, 34955.691, 34115.8095, 33293.949, 32465.0775, 31657.6715, 30877.2585, 30093.78, 29351.3695, 28594.1365, 27872.115, 27168.7465, 26477.076, 25774.541, 25106.5375, 24452.5135, 23815.5125, 23174.0655, 22555.2685, 21960.2065, 21376.3555, 20785.1925, 20211.517, 19657.0725, 19141.6865, 18579.737, 18081.3955, 17578.995, 17073.44, 16608.335, 16119.911, 15651.266, 15194.583, 14749.0495, 14343.4835, 13925.639, 13504.509, 13099.3885, 12691.2855, 12328.018, 11969.0345, 11596.5145, 11245.6355, 10917.6575, 10580.9785, 10277.8605, 9926.58100000001, 9605.538, 9300.42950000003, 8989.97850000003, 8728.73249999998, 8448.3235, 8175.31050000002, 7898.98700000002, 7629.79100000003, 7413.76199999999, 7149.92300000001, 6921.12650000001, 6677.1545, 6443.28000000003, 6278.23450000002, 6014.20049999998, 5791.20299999998, 5605.78450000001, 5438.48800000001, 5234.2255, 5059.6825, 4887.43349999998, 4682.935, 4496.31099999999, 4322.52250000002, 4191.42499999999, 4021.24200000003, 3900.64799999999, 3762.84250000003, 3609.98050000001, 3502.29599999997, 3363.84250000003, 3206.54849999998, 3079.70000000001, 2971.42300000001, 2867.80349999998, 2727.08100000001, 2630.74900000001, 2496.6165, 2440.902, 2356.19150000002, 2235.58199999999, 2120.54149999999, 2012.25449999998, 1933.35600000003, 1820.93099999998, 1761.54800000001, 1663.09350000002, 1578.84600000002, 1509.48149999999, 1427.3345, 1379.56150000001, 1306.68099999998, 1212.63449999999, 
1084.17300000001, 1124.16450000001, 1060.69949999999, 1007.48849999998, 941.194499999983, 879.880500000028, 836.007500000007, 782.802000000025, 748.385499999975, 647.991500000004, 626.730500000005, 570.776000000013, 484.000500000024, 513.98550000001, 418.985499999952, 386.996999999974, 370.026500000036, 355.496999999974, 356.731499999994, 255.92200000002, 259.094000000041, 205.434499999974, 165.374500000034, 197.347500000033, 95.718499999959, 67.6165000000037, 54.6970000000438, 31.7395000000251, -15.8784999999916, 8.42500000004657, -26.3754999999655, -118.425500000012, -66.6629999999423, -42.9745000000112, -107.364999999991, -189.839000000036, -162.611499999999, -164.964999999967, -189.079999999958, -223.931499999948, -235.329999999958, -269.639500000048, -249.087999999989, -206.475499999942, -283.04449999996, -290.667000000016, -304.561499999953, -336.784499999951, -380.386500000022, -283.280499999993, -364.533000000054, -389.059499999974, -364.454000000027, -415.748000000021, -417.155000000028};
 CUCO_HLL_TUNING_ARR_DECL bias_data_p18{189083.0, 185696.913, 182348.774, 179035.946, 175762.762, 172526.444, 169329.754, 166166.099, 163043.269, 159958.91, 156907.912, 153906.845, 150924.199, 147996.568, 145093.457, 142239.233, 139421.475, 136632.27, 133889.588, 131174.2, 128511.619, 125868.621, 123265.385, 120721.061, 118181.769, 115709.456, 113252.446, 110840.198, 108465.099, 106126.164, 103823.469, 101556.618, 99308.004, 97124.508, 94937.803, 92833.731, 90745.061, 88677.627, 86617.47, 84650.442, 82697.833, 80769.132, 78879.629, 77014.432, 75215.626, 73384.587, 71652.482, 69895.93, 68209.301, 66553.669, 64921.981, 63310.323, 61742.115, 60205.018, 58698.658, 57190.657, 55760.865, 54331.169, 52908.167, 51550.273, 50225.254, 48922.421, 47614.533, 46362.049, 45098.569, 43926.083, 42736.03, 41593.473, 40425.26, 39316.237, 38243.651, 37170.617, 36114.609, 35084.19, 34117.233, 33206.509, 32231.505, 31318.728, 30403.404, 29540.0550000001, 28679.236, 27825.862, 26965.216, 26179.148, 25462.08, 24645.952, 23922.523, 23198.144, 22529.128, 21762.4179999999, 21134.779, 20459.117, 19840.818, 19187.04, 18636.3689999999, 17982.831, 17439.7389999999, 16874.547, 16358.2169999999, 15835.684, 15352.914, 14823.681, 14329.313, 13816.897, 13342.874, 12880.882, 12491.648, 12021.254, 11625.392, 11293.7610000001, 10813.697, 10456.209, 10099.074, 9755.39000000001, 9393.18500000006, 9047.57900000003, 8657.98499999999, 8395.85900000005, 8033.0, 7736.95900000003, 7430.59699999995, 7258.47699999996, 6924.58200000005, 6691.29399999999, 6357.92500000005, 6202.05700000003, 5921.19700000004, 5628.28399999999, 5404.96799999999, 5226.71100000001, 4990.75600000005, 4799.77399999998, 4622.93099999998, 4472.478, 4171.78700000001, 3957.46299999999, 3868.95200000005, 3691.14300000004, 3474.63100000005, 3341.67200000002, 3109.14000000001, 3071.97400000005, 2796.40399999998, 2756.17799999996, 2611.46999999997, 2471.93000000005, 2382.26399999997, 2209.22400000005, 2142.28399999999, 2013.96100000001, 
1911.18999999994, 1818.27099999995, 1668.47900000005, 1519.65800000005, 1469.67599999998, 1367.13800000004, 1248.52899999998, 1181.23600000003, 1022.71900000004, 1088.20700000005, 959.03600000008, 876.095999999903, 791.183999999892, 703.337000000058, 731.949999999953, 586.86400000006, 526.024999999907, 323.004999999888, 320.448000000091, 340.672999999952, 309.638999999966, 216.601999999955, 102.922999999952, 19.2399999999907, -0.114000000059605, -32.6240000000689, -89.3179999999702, -153.497999999905, -64.2970000000205, -143.695999999996, -259.497999999905, -253.017999999924, -213.948000000091, -397.590000000084, -434.006000000052, -403.475000000093, -297.958000000101, -404.317000000039, -528.898999999976, -506.621000000043, -513.205000000075, -479.351000000024, -596.139999999898, -527.016999999993, -664.681000000099, -680.306000000099, -704.050000000047, -850.486000000034, -757.43200000003, -713.308999999892};
 
-template <int32_t Precision>
-__host__ __device__ auto const& bias_data() noexcept;
-
-template <>
-__host__ __device__ auto const& bias_data<4>() noexcept { return bias_data_p4; };
-
-template <>
-__host__ __device__ auto const& bias_data<5>() noexcept { return bias_data_p5; };
-
-template <>
-__host__ __device__ auto const& bias_data<6>() noexcept { return bias_data_p6; };
-
-template <>
-__host__ __device__ auto const& bias_data<7>() noexcept { return bias_data_p7; };
-
-template <>
-__host__ __device__ auto const& bias_data<8>() noexcept { return bias_data_p8; };
-
-template <>
-__host__ __device__ auto const& bias_data<9>() noexcept { return bias_data_p9; };
-
-template <>
-__host__ __device__ auto const& bias_data<10>() noexcept { return bias_data_p10; };
-
-template <>
-__host__ __device__ auto const& bias_data<11>() noexcept { return bias_data_p11; };
-
-template <>
-__host__ __device__ auto const& bias_data<12>() noexcept { return bias_data_p12; };
-
-template <>
-__host__ __device__ auto const& bias_data<13>() noexcept { return bias_data_p13; };
-
-template <>
-__host__ __device__ auto const& bias_data<14>() noexcept { return bias_data_p14; };
-
-template <>
-__host__ __device__ auto const& bias_data<15>() noexcept { return bias_data_p15; };
-
-template <>
-__host__ __device__ auto const& bias_data<16>() noexcept { return bias_data_p16; };
-
-template <>
-__host__ __device__ auto const& bias_data<17>() noexcept { return bias_data_p17; };
-
-template <>
-__host__ __device__ auto const& bias_data<18>() noexcept { return bias_data_p18; };
+__host__ __device__ constexpr double const* bias_data(int32_t precision) noexcept {
+  switch (precision) {
+    case 4:  return bias_data_p4.data();
+    case 5:  return bias_data_p5.data();
+    case 6:  return bias_data_p6.data();
+    case 7:  return bias_data_p7.data();
+    case 8:  return bias_data_p8.data();
+    case 9:  return bias_data_p9.data();
+    case 10: return bias_data_p10.data();
+    case 11: return bias_data_p11.data();
+    case 12: return bias_data_p12.data();
+    case 13: return bias_data_p13.data();
+    case 14: return bias_data_p14.data();
+    case 15: return bias_data_p15.data();
+    case 16: return bias_data_p16.data();
+    case 17: return bias_data_p17.data();
+    case 18: return bias_data_p18.data();
+    default: return nullptr;
+  }
+}
+
+__host__ __device__ constexpr size_t bias_data_size(int32_t precision) noexcept {
+  switch (precision) {
+    case 4:  return bias_data_p4.size();
+    case 5:  return bias_data_p5.size();
+    case 6:  return bias_data_p6.size();
+    case 7:  return bias_data_p7.size();
+    case 8:  return bias_data_p8.size();
+    case 9:  return bias_data_p9.size();
+    case 10: return bias_data_p10.size();
+    case 11: return bias_data_p11.size();
+    case 12: return bias_data_p12.size();
+    case 13: return bias_data_p13.size();
+    case 14: return bias_data_p14.size();
+    case 15: return bias_data_p15.size();
+    case 16: return bias_data_p16.size();
+    case 17: return bias_data_p17.size();
+    case 18: return bias_data_p18.size();
+    default: return 0;
+  }
+}
 // clang-format on
 
 }  // namespace cuco::hyperloglog_ns::detail
\ No newline at end of file

From e93c248439be182e7756280efd77694c3c5e6210 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 15 Mar 2024 13:50:37 +0000
Subject: [PATCH 33/78] Allow wider vector sizes

---
 include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index 35ffce286..60552bf00 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -166,9 +166,10 @@ class hyperloglog_ref {
     // In case the input iterator represents a contiguous memory segment we can employ efficient
     // vectorized loads
     if constexpr (thrust::is_contiguous_iterator_v<InputIt>) {
-      auto const ptr = thrust::raw_pointer_cast(&first[0]);
+      auto const ptr                  = thrust::raw_pointer_cast(&first[0]);
+      auto constexpr max_vector_bytes = 32;
       auto const alignment =
-        1 << cuda::std::countr_zero(reinterpret_cast<cuda::std::uintptr_t>(ptr) | 16);
+        1 << cuda::std::countr_zero(reinterpret_cast<cuda::std::uintptr_t>(ptr) | max_vector_bytes);
       auto const vector_size = alignment / sizeof(value_type);
 
       switch (vector_size) {
@@ -184,6 +185,10 @@ class hyperloglog_ref {
           kernel = reinterpret_cast<void const*>(
             cuco::hyperloglog_ns::detail::add_shmem_vectorized<8, hyperloglog_ref>);
           break;
+        case 16:
+          kernel = reinterpret_cast<void const*>(
+            cuco::hyperloglog_ns::detail::add_shmem_vectorized<16, hyperloglog_ref>);
+          break;
       };
     }
 

From 204b8e25aa6cf09ee12e5ea357f7b799096b6287 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 15 Mar 2024 14:56:28 +0000
Subject: [PATCH 34/78] Fix processing of remaining items

---
 include/cuco/detail/hyperloglog/kernels.cuh | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh
index 4da78d020..c7e03491e 100644
--- a/include/cuco/detail/hyperloglog/kernels.cuh
+++ b/include/cuco/detail/hyperloglog/kernels.cuh
@@ -49,12 +49,14 @@ CUCO_KERNEL void add_shmem_vectorized(typename RefType::value_type const* first,
 
   auto const loop_stride = cuco::detail::grid_stride();
   auto idx               = cuco::detail::global_thread_id();
+  auto const grid        = cooperative_groups::this_grid();
   auto const block       = cooperative_groups::this_thread_block();
 
   local_ref_type local_ref(cuda::std::span{local_sketch, ref.sketch_bytes()}, {});
   local_ref.clear(block);
   block.sync();
 
+  // each thread processes VectorSize-many items per iteration
   vector_type vec;
   while (idx < n / VectorSize) {
     vec = *reinterpret_cast<vector_type*>(
@@ -64,8 +66,13 @@ CUCO_KERNEL void add_shmem_vectorized(typename RefType::value_type const* first,
     }
     idx += loop_stride;
   }
-  auto const remainder = n % VectorSize;
-  if (idx >= n / VectorSize and idx < n / VectorSize + remainder) { local_ref.add(*(first + idx)); }
+  // a single thread processes the remaining items
+  cooperative_groups::invoke_one(grid, [&]() {
+    auto const remainder = n % VectorSize;
+    for (int i = 0; i < remainder; ++i) {
+      local_ref.add(*(first + n - i - 1));
+    }
+  });
   block.sync();
 
   ref.merge(block, local_ref);

From fe1cf5a9a4e04205a66681efbe328570bd54bbcb Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 15 Mar 2024 15:02:49 +0000
Subject: [PATCH 35/78] Guard invoke_one with macro

---
 include/cuco/detail/__config                | 4 ++++
 include/cuco/detail/hyperloglog/kernels.cuh | 9 +++++++++
 2 files changed, 13 insertions(+)

diff --git a/include/cuco/detail/__config b/include/cuco/detail/__config
index e0ac92a23..6d4bf7339 100644
--- a/include/cuco/detail/__config
+++ b/include/cuco/detail/__config
@@ -39,6 +39,10 @@
 #define CUCO_HAS_CUDA_BARRIER
 #endif
 
+#if defined(CUDART_VERSION) && (CUDART_VERSION >= 12010)
+#define CUCO_HAS_CG_INVOKE_ONE
+#endif
+
 #if (CUCO_CUDA_MINIMUM_ARCH >= 700)
 #define CUCO_HAS_INDEPENDENT_THREADS
 #endif
diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh
index c7e03491e..ba4ceb506 100644
--- a/include/cuco/detail/hyperloglog/kernels.cuh
+++ b/include/cuco/detail/hyperloglog/kernels.cuh
@@ -67,12 +67,21 @@ CUCO_KERNEL void add_shmem_vectorized(typename RefType::value_type const* first,
     idx += loop_stride;
   }
   // a single thread processes the remaining items
+#if defined(CUCO_HAS_CG_INVOKE_ONE)
   cooperative_groups::invoke_one(grid, [&]() {
     auto const remainder = n % VectorSize;
     for (int i = 0; i < remainder; ++i) {
       local_ref.add(*(first + n - i - 1));
     }
   });
+#else
+  if (grid.thread_rank() == 0) {
+    auto const remainder = n % VectorSize;
+    for (int i = 0; i < remainder; ++i) {
+      local_ref.add(*(first + n - i - 1));
+    }
+  }
+#endif
   block.sync();
 
   ref.merge(block, local_ref);

From ae9e77c826923aafe68caa67ff658a026ac01507 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Mon, 18 Mar 2024 23:05:34 +0000
Subject: [PATCH 36/78] Specify sketch size/precision at runtime

---
 benchmarks/distinct_count_estimator_bench.cu  | 24 +++--
 .../device_ref_example.cu                     | 10 +-
 .../distinct_count_estimator.inl              | 93 ++++++++++--------
 .../distinct_count_estimator_ref.inl          | 96 ++++++++++---------
 include/cuco/detail/hyperloglog/finalizer.cuh | 78 ++++++++-------
 .../cuco/detail/hyperloglog/hyperloglog.cuh   | 47 +++++----
 .../detail/hyperloglog/hyperloglog_ref.cuh    | 83 +++++++++-------
 include/cuco/distinct_count_estimator.cuh     | 40 ++++----
 include/cuco/distinct_count_estimator_ref.cuh | 47 ++++-----
 .../unique_sequence_test.cu                   | 25 ++---
 10 files changed, 293 insertions(+), 250 deletions(-)

diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu
index 12504f120..9b3ba02c0 100644
--- a/benchmarks/distinct_count_estimator_bench.cu
+++ b/benchmarks/distinct_count_estimator_bench.cu
@@ -56,7 +56,8 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list<Esti
 {
   using T = typename Estimator::value_type;
 
-  auto const num_items = state.get_int64("NumInputs");
+  auto const num_items      = state.get_int64("NumInputs");
+  auto const sketch_size_kb = state.get_int64("SketchSizeKB");
 
   thrust::device_vector<T> items(num_items);
 
@@ -66,7 +67,7 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list<Esti
   state.add_element_count(num_items);
   state.add_global_memory_reads<T>(num_items, "InputSize");
 
-  Estimator estimator;
+  Estimator estimator(sketch_size_kb);
   estimator.add(items.begin(), items.end());
 
   double estimated_cardinality  = estimator.estimate();
@@ -99,7 +100,8 @@ void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list<Esti
 {
   using T = typename Estimator::value_type;
 
-  auto const num_items = state.get_int64("NumInputs");
+  auto const num_items      = state.get_int64("NumInputs");
+  auto const sketch_size_kb = state.get_int64("SketchSizeKB");
 
   thrust::device_vector<T> items(num_items);
 
@@ -109,7 +111,7 @@ void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list<Esti
   state.add_element_count(num_items);
   state.add_global_memory_reads<T>(num_items, "InputSize");
 
-  Estimator estimator;
+  Estimator estimator(sketch_size_kb);
   state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
     estimator.clear_async({launch.get_stream()});
 
@@ -119,21 +121,16 @@ void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list<Esti
   });
 }
 
-using ESTIMATOR_RANGE = nvbench::type_list<cuco::distinct_count_estimator<nvbench::int32_t, 10>,
-                                           cuco::distinct_count_estimator<nvbench::int32_t, 11>,
-                                           cuco::distinct_count_estimator<nvbench::int32_t, 12>,
-                                           cuco::distinct_count_estimator<nvbench::int64_t, 10>,
-                                           cuco::distinct_count_estimator<nvbench::int64_t, 11>,
-                                           cuco::distinct_count_estimator<nvbench::int64_t, 12>,
-                                           cuco::distinct_count_estimator<__int128_t, 10>,
-                                           cuco::distinct_count_estimator<__int128_t, 11>,
-                                           cuco::distinct_count_estimator<__int128_t, 12>>;
+using ESTIMATOR_RANGE = nvbench::type_list<cuco::distinct_count_estimator<nvbench::int32_t>,
+                                           cuco::distinct_count_estimator<nvbench::int64_t>,
+                                           cuco::distinct_count_estimator<__int128_t>>;
 
 NVBENCH_BENCH_TYPES(distinct_count_estimator_e2e,
                     NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list<distribution::unique>))
   .set_name("distinct_count_estimator_e2e")
   .set_type_axes_names({"Estimator", "Distribution"})
   .add_int64_power_of_two_axis("NumInputs", {28, 29, 30})
+  .add_int64_axis("SketchSizeKB", {8, 16, 32})
   .set_max_noise(defaults::MAX_NOISE);
 
 NVBENCH_BENCH_TYPES(distinct_count_estimator_add,
@@ -141,4 +138,5 @@ NVBENCH_BENCH_TYPES(distinct_count_estimator_add,
   .set_name("distinct_count_estimator::add_async")
   .set_type_axes_names({"Estimator", "Distribution"})
   .add_int64_power_of_two_axis("NumInputs", {28, 29, 30})
+  .add_int64_axis("SketchSizeKB", {8, 16, 32})
   .set_max_noise(defaults::MAX_NOISE);
\ No newline at end of file
diff --git a/examples/distinct_count_estimator/device_ref_example.cu b/examples/distinct_count_estimator/device_ref_example.cu
index 845634388..c8716e421 100644
--- a/examples/distinct_count_estimator/device_ref_example.cu
+++ b/examples/distinct_count_estimator/device_ref_example.cu
@@ -37,15 +37,14 @@ __global__ void piggyback_kernel(RefType ref, InputIt first, std::size_t n)
   using local_ref_type = typename RefType::with_scope<cuda::thread_scope_block>;
 
   // Shared memory storage for the block-local estimator
-  alignas(local_ref_type::sketch_alignment())
-    __shared__ std::byte local_sketch[local_ref_type::sketch_bytes()];
+  extern __shared__ std::byte local_sketch[];
 
   auto const loop_stride = gridDim.x * blockDim.x;
   auto idx               = blockDim.x * blockIdx.x + threadIdx.x;
   auto const block       = cooperative_groups::this_thread_block();
 
   // Create the local estimator with the shared memory storage
-  local_ref_type local_ref(cuda::std::span{local_sketch, local_ref_type::sketch_bytes()}, {});
+  local_ref_type local_ref(cuda::std::span{local_sketch, ref.sketch_bytes()});
 
   // Initialize the local estimator
   local_ref.clear(block);
@@ -103,8 +102,11 @@ int main(void)
   // Clear the estimator so it can be reused
   estimator.clear();
 
+  // Number of dynamic shared memory bytes required to store a CTA-local sketch
+  auto const sketch_bytes = estimator.sketch_bytes();
+
   // Call the custom kernel and pass a non-owning reference to the estimator to the GPU
-  piggyback_kernel<<<10, 512>>>(estimator.ref(), items.begin(), num_items);
+  piggyback_kernel<<<10, 512, sketch_bytes>>>(estimator.ref(), items.begin(), num_items);
 
   // Calculate the cardinality estimate from the custom kernel
   std::size_t const estimated_cardinality_custom = estimator.estimate();
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
index df68a0593..54806aba6 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
@@ -16,115 +16,124 @@
 
 namespace cuco {
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
-constexpr distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::distinct_count_estimator(
-  Hash const& hash, Allocator const& alloc, cuco::cuda_stream_ref stream)
-  : impl_{std::make_unique<impl_type>(hash, alloc, stream)}
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+constexpr distinct_count_estimator<T, Scope, Hash, Allocator>::distinct_count_estimator(
+  std::size_t max_sketch_size_kb,
+  Hash const& hash,
+  Allocator const& alloc,
+  cuco::cuda_stream_ref stream)
+  : impl_{std::make_unique<impl_type>(max_sketch_size_kb, hash, alloc, stream)}
 {
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
-void distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::clear_async(
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+void distinct_count_estimator<T, Scope, Hash, Allocator>::clear_async(
   cuco::cuda_stream_ref stream) noexcept
 {
   this->impl_->clear_async(stream);
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
-void distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::clear(
-  cuco::cuda_stream_ref stream)
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+void distinct_count_estimator<T, Scope, Hash, Allocator>::clear(cuco::cuda_stream_ref stream)
 {
   this->impl_->clear(stream);
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <class InputIt>
-void distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::add_async(
+void distinct_count_estimator<T, Scope, Hash, Allocator>::add_async(
   InputIt first, InputIt last, cuco::cuda_stream_ref stream) noexcept
 {
   this->impl_->add_async(first, last, stream);
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <class InputIt>
-void distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::add(
-  InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+void distinct_count_estimator<T, Scope, Hash, Allocator>::add(InputIt first,
+                                                              InputIt last,
+                                                              cuco::cuda_stream_ref stream)
 {
   this->impl_->add(first, last, stream);
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <cuda::thread_scope OtherScope, class OtherAllocator>
-void distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::merge_async(
-  distinct_count_estimator<T, Precision, OtherScope, Hash, OtherAllocator> const& other,
+void distinct_count_estimator<T, Scope, Hash, Allocator>::merge_async(
+  distinct_count_estimator<T, OtherScope, Hash, OtherAllocator> const& other,
   cuco::cuda_stream_ref stream) noexcept
 {
   this->impl_->merge_async(other, stream);
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <cuda::thread_scope OtherScope, class OtherAllocator>
-void distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::merge(
-  distinct_count_estimator<T, Precision, OtherScope, Hash, OtherAllocator> const& other,
+void distinct_count_estimator<T, Scope, Hash, Allocator>::merge(
+  distinct_count_estimator<T, OtherScope, Hash, OtherAllocator> const& other,
   cuco::cuda_stream_ref stream)
 {
   this->impl_->merge(other, stream);
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <cuda::thread_scope OtherScope>
-void distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::merge_async(
+void distinct_count_estimator<T, Scope, Hash, Allocator>::merge_async(
   ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream) noexcept
 {
   this->impl_->merge_async(other, stream);
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <cuda::thread_scope OtherScope>
-void distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::merge(
-  ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream)
+void distinct_count_estimator<T, Scope, Hash, Allocator>::merge(ref_type<OtherScope> const& other,
+                                                                cuco::cuda_stream_ref stream)
 {
   this->impl_->merge(other, stream);
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
-std::size_t distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::estimate(
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+std::size_t distinct_count_estimator<T, Scope, Hash, Allocator>::estimate(
   cuco::cuda_stream_ref stream) const
 {
   return this->impl_->estimate(stream);
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
-typename distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::ref_type<>
-distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::ref() const noexcept
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+typename distinct_count_estimator<T, Scope, Hash, Allocator>::ref_type<>
+distinct_count_estimator<T, Scope, Hash, Allocator>::ref() const noexcept
 {
   return {this->sketch(), this->hash()};
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
-auto distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::hash() const noexcept
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+auto distinct_count_estimator<T, Scope, Hash, Allocator>::hash() const noexcept
 {
   return this->impl_->hash();
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
-auto distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::sketch() const noexcept
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+cuda::std::span<std::byte> distinct_count_estimator<T, Scope, Hash, Allocator>::sketch()
+  const noexcept
 {
   return this->impl_->sketch();
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
-constexpr size_t
-distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::sketch_bytes() noexcept
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+constexpr size_t distinct_count_estimator<T, Scope, Hash, Allocator>::sketch_bytes() const noexcept
+{
+  return this->impl_->sketch_bytes();
+}
+
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+constexpr size_t distinct_count_estimator<T, Scope, Hash, Allocator>::sketch_bytes(
+  size_t max_sketch_size_kb) noexcept
 {
-  return impl_type::sketch_bytes();
+  return impl_type::sketch_bytes(max_sketch_size_kb);
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
-constexpr size_t
-distinct_count_estimator<T, Precision, Scope, Hash, Allocator>::sketch_alignment() noexcept
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+constexpr size_t distinct_count_estimator<T, Scope, Hash, Allocator>::sketch_alignment() noexcept
 {
-  return impl_type::sketch();
+  return impl_type::sketch_alignment();
 }
 
 }  // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
index 50bea1675..d0cf85475 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
@@ -16,123 +16,127 @@
 
 namespace cuco {
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
-template <class U, std::size_t N>
-__host__ __device__ constexpr distinct_count_estimator_ref<T, Precision, Scope, Hash>::
-  distinct_count_estimator_ref(cuda::std::span<U, N> sketch_span, Hash const& hash) noexcept
+template <class T, cuda::thread_scope Scope, class Hash>
+__host__
+  __device__ constexpr distinct_count_estimator_ref<T, Scope, Hash>::distinct_count_estimator_ref(
+    cuda::std::span<std::byte> sketch_span, Hash const& hash)
   : impl_{sketch_span, hash}
 {
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+template <class T, cuda::thread_scope Scope, class Hash>
 template <class CG>
-__device__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::clear(
-  CG const& group) noexcept
+__device__ void distinct_count_estimator_ref<T, Scope, Hash>::clear(CG const& group) noexcept
 {
   this->impl_.clear(group);
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
-__host__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::clear_async(
+template <class T, cuda::thread_scope Scope, class Hash>
+__host__ void distinct_count_estimator_ref<T, Scope, Hash>::clear_async(
   cuco::cuda_stream_ref stream) noexcept
 {
   this->impl_.clear_async(stream);
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
-__host__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::clear(
-  cuco::cuda_stream_ref stream)
+template <class T, cuda::thread_scope Scope, class Hash>
+__host__ void distinct_count_estimator_ref<T, Scope, Hash>::clear(cuco::cuda_stream_ref stream)
 {
   this->impl_.clear(stream);
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
-__device__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::add(T const& item) noexcept
+template <class T, cuda::thread_scope Scope, class Hash>
+__device__ void distinct_count_estimator_ref<T, Scope, Hash>::add(T const& item) noexcept
 {
   this->impl_.add(item);
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+template <class T, cuda::thread_scope Scope, class Hash>
 template <class InputIt>
-__host__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::add_async(
-  InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+__host__ void distinct_count_estimator_ref<T, Scope, Hash>::add_async(InputIt first,
+                                                                      InputIt last,
+                                                                      cuco::cuda_stream_ref stream)
 {
   this->impl_.add_async(first, last, stream);
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+template <class T, cuda::thread_scope Scope, class Hash>
 template <class InputIt>
-__host__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::add(
-  InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+__host__ void distinct_count_estimator_ref<T, Scope, Hash>::add(InputIt first,
+                                                                InputIt last,
+                                                                cuco::cuda_stream_ref stream)
 {
   this->impl_.add(first, last, stream);
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+template <class T, cuda::thread_scope Scope, class Hash>
 template <class CG, cuda::thread_scope OtherScope>
-__device__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::merge(
-  CG const& group,
-  distinct_count_estimator_ref<T, Precision, OtherScope, Hash> const& other) noexcept
+__device__ void distinct_count_estimator_ref<T, Scope, Hash>::merge(
+  CG const& group, distinct_count_estimator_ref<T, OtherScope, Hash> const& other) noexcept
 {
   this->impl_.merge(group, other.impl_);
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+template <class T, cuda::thread_scope Scope, class Hash>
 template <cuda::thread_scope OtherScope>
-__host__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::merge_async(
-  distinct_count_estimator_ref<T, Precision, OtherScope, Hash> const& other,
+__host__ void distinct_count_estimator_ref<T, Scope, Hash>::merge_async(
+  distinct_count_estimator_ref<T, OtherScope, Hash> const& other,
   cuco::cuda_stream_ref stream) noexcept
 {
   this->impl_.merge_async(other, stream);
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+template <class T, cuda::thread_scope Scope, class Hash>
 template <cuda::thread_scope OtherScope>
-__host__ void distinct_count_estimator_ref<T, Precision, Scope, Hash>::merge(
-  distinct_count_estimator_ref<T, Precision, OtherScope, Hash> const& other,
-  cuco::cuda_stream_ref stream)
+__host__ void distinct_count_estimator_ref<T, Scope, Hash>::merge(
+  distinct_count_estimator_ref<T, OtherScope, Hash> const& other, cuco::cuda_stream_ref stream)
 {
   this->impl_.merge(other, stream);
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
-__device__ std::size_t distinct_count_estimator_ref<T, Precision, Scope, Hash>::estimate(
+template <class T, cuda::thread_scope Scope, class Hash>
+__device__ std::size_t distinct_count_estimator_ref<T, Scope, Hash>::estimate(
   cooperative_groups::thread_block const& group) const noexcept
 {
   return this->impl_.estimate(group);
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
-__host__ std::size_t distinct_count_estimator_ref<T, Precision, Scope, Hash>::estimate(
+template <class T, cuda::thread_scope Scope, class Hash>
+__host__ std::size_t distinct_count_estimator_ref<T, Scope, Hash>::estimate(
   cuco::cuda_stream_ref stream) const
 {
   return this->impl_.estimate(stream);
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
-__host__ __device__ auto distinct_count_estimator_ref<T, Precision, Scope, Hash>::hash()
-  const noexcept
+template <class T, cuda::thread_scope Scope, class Hash>
+__host__ __device__ auto distinct_count_estimator_ref<T, Scope, Hash>::hash() const noexcept
 {
   return this->impl_.hash();
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
-__host__ __device__ auto distinct_count_estimator_ref<T, Precision, Scope, Hash>::sketch()
-  const noexcept
+template <class T, cuda::thread_scope Scope, class Hash>
+__host__ __device__ cuda::std::span<std::byte>
+distinct_count_estimator_ref<T, Scope, Hash>::sketch() const noexcept
 {
   return this->impl_.sketch();
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+template <class T, cuda::thread_scope Scope, class Hash>
 __host__ __device__ constexpr std::size_t
-distinct_count_estimator_ref<T, Precision, Scope, Hash>::sketch_bytes() noexcept
+distinct_count_estimator_ref<T, Scope, Hash>::sketch_bytes() const noexcept
 {
-  return impl_type::sketch_bytes();
+  return this->impl_.sketch_bytes();
 }
 
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+template <class T, cuda::thread_scope Scope, class Hash>
 __host__ __device__ constexpr std::size_t
-distinct_count_estimator_ref<T, Precision, Scope, Hash>::sketch_alignment() noexcept
+distinct_count_estimator_ref<T, Scope, Hash>::sketch_bytes(std::size_t max_sketch_size_kb) noexcept
+{
+  return impl_type::sketch_bytes(max_sketch_size_kb);
+}
+
+template <class T, cuda::thread_scope Scope, class Hash>
+__host__ __device__ constexpr std::size_t
+distinct_count_estimator_ref<T, Scope, Hash>::sketch_alignment() noexcept
 {
   return impl_type::sketch_alignment();
 }
diff --git a/include/cuco/detail/hyperloglog/finalizer.cuh b/include/cuco/detail/hyperloglog/finalizer.cuh
index 60a7ffcab..f40a0e751 100644
--- a/include/cuco/detail/hyperloglog/finalizer.cuh
+++ b/include/cuco/detail/hyperloglog/finalizer.cuh
@@ -30,17 +30,24 @@ namespace cuco::hyperloglog_ns::detail {
  * @note Variable names correspond to the definitions given in the HLL++ paper:
  * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf
  *
- * @tparam Precision Tuning parameter to trade accuracy for runtime/memory footprint
  */
-template <int32_t Precision>
 class finalizer {
   // Note: Most of the types in this implementation are explicit instead of relying on `auto` to
   // avoid confusion with the reference implementation.
 
-  // this minimum number of registers is required by HLL++
-  static_assert(Precision >= 4, "Precision must be greater or equal to 4");
-
  public:
+  /**
+   * @brief Contructs an HLL finalizer object.
+   *
+   * @throws Iff precision vale is not supported
+   *
+   * @param precision HLL precision parameter
+   */
+  __host__ __device__ constexpr finalizer(int precision) : precision_{precision}, m_{1 << precision}
+  {
+    // TODO check if precision >= 4
+  }
+
   /**
    * @brief Compute the bias-corrected cardinality estimate.
    *
@@ -49,53 +56,50 @@ class finalizer {
    *
    * @return Bias-corrected cardinality estimate
    */
-  __host__ __device__ static std::size_t constexpr finalize(double z, int v) noexcept
+  __host__ __device__ constexpr std::size_t operator()(double z, int v) const noexcept
   {
     auto e = alpha_mm() / z;
 
     if (v > 0) {
       // Use linear counting for small cardinality estimates.
-      double const h = m * log(static_cast<double>(m) / v);
+      double const h = this->m_ * log(static_cast<double>(this->m_) / v);
       // The threshold `2.5 * m` is from the original HLL algorithm.
-      if (e <= 2.5 * m) { return cuda::std::round(h); }
+      if (e <= 2.5 * this->m_) { return cuda::std::round(h); }
 
-      if constexpr (Precision < 19) {
-        e = (h <= threshold(Precision)) ? h : bias_corrected_estimate(e);
+      if (this->precision_ < 19) {
+        e = (h <= threshold(this->precision_)) ? h : bias_corrected_estimate(e);
       }
     } else {
       // HLL++ is defined only when p < 19, otherwise we need to fallback to HLL.
-      if constexpr (Precision < 19) { e = bias_corrected_estimate(e); }
+      if (this->precision_ < 19) { e = bias_corrected_estimate(e); }
     }
 
     return cuda::std::round(e);
   }
 
  private:
-  static auto constexpr m = (1 << Precision);  ///< Number of registers
-  static auto constexpr k = 6;                 ///< Number of interpolation points to consider
-
-  __host__ __device__ static double constexpr alpha_mm() noexcept
+  __host__ __device__ constexpr double alpha_mm() const noexcept
   {
-    if constexpr (m == 16) {
-      return 0.673 * m * m;
-    } else if constexpr (m == 32) {
-      return 0.697 * m * m;
-    } else if constexpr (m == 64) {
-      return 0.709 * m * m;
+    if (this->m_ == 16) {
+      return 0.673 * this->m_ * this->m_;
+    } else if (this->m_ == 32) {
+      return 0.697 * this->m_ * this->m_;
+    } else if (this->m_ == 64) {
+      return 0.709 * this->m_ * this->m_;
     } else {
-      return (0.7213 / (1.0 + 1.079 / m)) * m * m;
+      return (0.7213 / (1.0 + 1.079 / this->m_)) * this->m_ * this->m_;
     }
   }
 
-  __host__ __device__ static double constexpr bias_corrected_estimate(double e) noexcept
+  __host__ __device__ constexpr double bias_corrected_estimate(double e) const noexcept
   {
-    return (e < 5.0 * m) ? e - bias(e) : e;
+    return (e < 5.0 * this->m_) ? e - bias(e) : e;
   }
 
-  __host__ __device__ static double constexpr bias(double e) noexcept
+  __host__ __device__ constexpr double bias(double e) const noexcept
   {
     auto const anchor_index = interpolation_anchor_index(e);
-    int const n             = raw_estimate_data_size(Precision);
+    int const n             = raw_estimate_data_size(this->precision_);
 
     auto low  = cuda::std::max(anchor_index - k + 1, 0);
     auto high = cuda::std::min(low + k, n);
@@ -106,7 +110,7 @@ class finalizer {
       high += 1;
     }
 
-    auto biases     = bias_data(Precision);
+    auto biases     = bias_data(this->precision_);
     double bias_sum = 0.0;
     for (int i = low; i < high; ++i) {
       bias_sum += biases[i];
@@ -115,19 +119,19 @@ class finalizer {
     return bias_sum / (high - low);
   }
 
-  __host__ __device__ static double distance(double e, int i) noexcept
+  __host__ __device__ constexpr double distance(double e, int i) const noexcept
   {
-    auto const diff = e - raw_estimate_data(Precision)[i];
+    auto const diff = e - raw_estimate_data(this->precision_)[i];
     return diff * diff;
   }
 
-  __host__ __device__ static int interpolation_anchor_index(double e) noexcept
+  __host__ __device__ constexpr int interpolation_anchor_index(double e) const noexcept
   {
-    auto estimates = raw_estimate_data(Precision);
-    int const n    = raw_estimate_data_size(Precision);
-    int left       = 0;
-    int right      = static_cast<int>(n) - 1;
-    int mid;
+    auto estimates      = raw_estimate_data(this->precision_);
+    int const n         = raw_estimate_data_size(this->precision_);
+    int left            = 0;
+    int right           = static_cast<int>(n) - 1;
+    int mid             = -1;
     int candidate_index = 0;  // Index of the closest element found
 
     while (left <= right) {
@@ -157,5 +161,9 @@ class finalizer {
 
     return candidate_index;
   }
+
+  static constexpr auto k = 6;  ///< Number of interpolation points to consider
+  int precision_;
+  int m_;
 };
 }  // namespace cuco::hyperloglog_ns::detail
\ No newline at end of file
diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index 56e13da66..159afeb99 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -32,25 +32,20 @@ namespace cuco::detail {
  *
  * @note This class implements the HyperLogLog/HyperLogLog++ algorithm:
  * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf.
- * @note The `Precision` parameter can be used to trade runtime/memory footprint for better
- * accuracy. A higher value corresponds to a more accurate result, however, setting the precision
- * too high will result in deminishing results.
  *
  * @tparam T Type of items to count
- * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy
  * @tparam Scope The scope in which operations will be performed by individual threads
  * @tparam Hash Hash function used to hash items
  * @tparam Allocator Type of allocator used for device storage
  */
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash, class Allocator>
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 class hyperloglog {
  public:
-  static constexpr auto thread_scope = Scope;      ///< CUDA thread scope
-  static constexpr auto precision    = Precision;  ///< Precision
+  static constexpr auto thread_scope = Scope;  ///< CUDA thread scope
 
   template <cuda::thread_scope NewScope = thread_scope>
-  using ref_type = hyperloglog_ref<T, Precision, NewScope, Hash>;  ///< Non-owning reference
-                                                                   ///< type
+  using ref_type = hyperloglog_ref<T, NewScope, Hash>;  ///< Non-owning reference
+                                                        ///< type
 
   using value_type = typename ref_type<>::value_type;  ///< Type of items to count
   using hash_type  = typename ref_type<>::hash_type;   ///< Hash function type
@@ -63,15 +58,19 @@ class hyperloglog {
    *
    * @note This function synchronizes the given stream.
    *
+   * @param max_sketch_size_kb Maximum sketch size in KB
    * @param hash The hash function used to hash items
    * @param alloc Allocator used for allocating device storage
    * @param stream CUDA stream used to initialize the object
    */
-  constexpr hyperloglog(Hash const& hash, Allocator const& alloc, cuco::cuda_stream_ref stream)
+  constexpr hyperloglog(std::size_t max_sketch_size_kb,
+                        Hash const& hash,
+                        Allocator const& alloc,
+                        cuco::cuda_stream_ref stream)
     : allocator_{alloc},
-      deleter_{this->sketch_bytes(), this->allocator_},
-      sketch_{this->allocator_.allocate(this->sketch_bytes()), this->deleter_},
-      ref_{cuda::std::span{this->sketch_.get(), this->sketch_bytes()}, hash}
+      deleter_{this->sketch_bytes(max_sketch_size_kb), this->allocator_},
+      sketch_{this->allocator_.allocate(this->sketch_bytes(max_sketch_size_kb)), this->deleter_},
+      ref_{cuda::std::span{this->sketch_.get(), this->sketch_bytes(max_sketch_size_kb)}, hash}
   {
     this->ref_.clear_async(stream);
   }
@@ -153,7 +152,7 @@ class hyperloglog {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope, class OtherAllocator>
-  void merge_async(hyperloglog<T, Precision, OtherScope, Hash, OtherAllocator> const& other,
+  void merge_async(hyperloglog<T, OtherScope, Hash, OtherAllocator> const& other,
                    cuco::cuda_stream_ref stream) noexcept
   {
     this->ref_.merge_async(other.ref(), stream);
@@ -172,7 +171,7 @@ class hyperloglog {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope, class OtherAllocator>
-  void merge(hyperloglog<T, Precision, OtherScope, Hash, OtherAllocator> const& other,
+  void merge(hyperloglog<T, OtherScope, Hash, OtherAllocator> const& other,
              cuco::cuda_stream_ref stream)
   {
     this->ref_.merge(other.ref(), stream);
@@ -242,7 +241,7 @@ class hyperloglog {
    *
    * @return The cuda::std::span of the sketch
    */
-  [[nodiscard]] auto sketch() const noexcept { return this->ref_.sketch(); }
+  [[nodiscard]] cuda::std::span<std::byte> sketch() const noexcept { return this->ref_.sketch(); }
 
   /**
    * @brief Gets the number of bytes required for the sketch storage.
@@ -251,7 +250,19 @@ class hyperloglog {
    */
   [[nodiscard]] constexpr std::size_t sketch_bytes() const noexcept
   {
-    return ref_type<>::sketch_bytes();
+    return this->ref_.sketch_bytes();
+  }
+
+  /**
+   * @brief Gets the number of bytes required for the sketch storage.
+   *
+   * @param max_sketch_size_kb Upper bound sketch size in KB
+   *
+   * @return The number of bytes required for the sketch
+   */
+  [[nodiscard]] static constexpr std::size_t sketch_bytes(std::size_t max_sketch_size_kb) noexcept
+  {
+    return ref_type<>::sketch_bytes(max_sketch_size_kb);
   }
 
   /**
@@ -273,7 +284,7 @@ class hyperloglog {
 
   // Needs to be friends with other instantiations of this class template to have access to their
   // storage
-  template <class T_, int32_t Precision_, cuda::thread_scope Scope_, class Hash_, class Allocator_>
+  template <class T_, cuda::thread_scope Scope_, class Hash_, class Allocator_>
   friend class hyperloglog;
 };
 }  // namespace cuco::detail
\ No newline at end of file
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index 60552bf00..5016fb21f 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -44,30 +44,25 @@ namespace cuco::detail {
  *
  * @note This class implements the HyperLogLog/HyperLogLog++ algorithm:
  * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf.
- * @note The `Precision` parameter can be used to trade runtime/memory footprint for better
- * accuracy. A higher value corresponds to a more accurate result, however, setting the precision
- * too high will result in deminishing results.
  *
  * @tparam T Type of items to count
- * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy
  * @tparam Scope The scope in which operations will be performed by individual threads
  * @tparam Hash Hash function used to hash items
  */
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+template <class T, cuda::thread_scope Scope, class Hash>
 class hyperloglog_ref {
   using register_type = int;  ///< Register array storage
   // We use `int` here since this is the smallest type that supports native `atomicMax` on GPUs
   using fp_type = float;  ///< Floating point type used for reduction
  public:
-  static constexpr auto thread_scope = Scope;      ///< CUDA thread scope
-  static constexpr auto precision    = Precision;  ///< Precision
+  static constexpr auto thread_scope = Scope;  ///< CUDA thread scope
 
   using value_type = T;     ///< Type of items to count
   using hash_type  = Hash;  ///< Hash function type
 
   template <cuda::thread_scope NewScope>
-  using with_scope = hyperloglog_ref<T, Precision, NewScope, Hash>;  ///< Ref type with different
-                                                                     ///< thread scope
+  using with_scope = hyperloglog_ref<T, NewScope, Hash>;  ///< Ref type with different
+                                                          ///< thread scope
 
   /**
    * @brief Constructs a non-owning `hyperloglog_ref` object.
@@ -75,10 +70,11 @@ class hyperloglog_ref {
    * @param sketch_span Reference to sketch storage
    * @param hash The hash function used to hash items
    */
-  template <class U, std::size_t N>
-  __host__ __device__ constexpr hyperloglog_ref(cuda::std::span<U, N> sketch_span,
-                                                Hash const& hash) noexcept
+  __host__ __device__ constexpr hyperloglog_ref(cuda::std::span<std::byte> sketch_span,
+                                                Hash const& hash)
     : hash_{hash},
+      precision_{cuda::std::countr_zero(this->sketch_bytes(sketch_span.size() / 1024) /
+                                        sizeof(register_type))},
       sketch_{reinterpret_cast<register_type*>(sketch_span.data()),
               this->sketch_bytes() / sizeof(register_type)}
   {
@@ -133,10 +129,10 @@ class hyperloglog_ref {
   __device__ void add(T const& item) noexcept
   {
     using hash_value_type = decltype(cuda::std::declval<hash_type>()(cuda::std::declval<T>()));
-    hash_value_type constexpr register_mask = (1ull << Precision) - 1;
-    auto const h                            = this->hash_(item);
-    auto const reg                          = h & register_mask;
-    auto const zeroes = cuda::std::countl_zero(h | register_mask) + 1;  // __clz
+    hash_value_type const register_mask = (1ull << this->precision_) - 1;
+    auto const h                        = this->hash_(item);
+    auto const reg                      = h & register_mask;
+    auto const zeroes                   = cuda::std::countl_zero(h | register_mask) + 1;  // __clz
 
     this->update_max(reg, zeroes);
   }
@@ -260,9 +256,12 @@ class hyperloglog_ref {
    * @param other Other estimator reference to be merged into `*this`
    */
   template <class CG, cuda::thread_scope OtherScope>
-  __device__ void merge(CG const& group,
-                        hyperloglog_ref<T, Precision, OtherScope, Hash> const& other) noexcept
+  __device__ void merge(CG const& group, hyperloglog_ref<T, OtherScope, Hash> const& other) noexcept
   {
+    if (other.precision_ != this->precision_) {
+      __trap();  // TODO check if this hurts performance
+    }
+
     for (int i = group.thread_rank(); i < this->sketch_.size(); i += group.size()) {
       this->update_max(i, other.sketch_[i]);
     }
@@ -278,7 +277,7 @@ class hyperloglog_ref {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  __host__ void merge_async(hyperloglog_ref<T, Precision, OtherScope, Hash> const& other,
+  __host__ void merge_async(hyperloglog_ref<T, OtherScope, Hash> const& other,
                             cuco::cuda_stream_ref stream) noexcept
   {
     auto constexpr block_size = 1024;
@@ -297,7 +296,7 @@ class hyperloglog_ref {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  __host__ void merge(hyperloglog_ref<T, Precision, OtherScope, Hash> const& other,
+  __host__ void merge(hyperloglog_ref<T, OtherScope, Hash> const& other,
                       cuco::cuda_stream_ref stream)
   {
     this->merge_async(other, stream);
@@ -353,9 +352,10 @@ class hyperloglog_ref {
     group.sync();
 
     if (group.thread_rank() == 0) {
-      auto const z = block_sum.load(cuda::std::memory_order_relaxed);
-      auto const v = block_zeroes.load(cuda::std::memory_order_relaxed);
-      estimate     = cuco::hyperloglog_ns::detail::finalizer<Precision>::finalize(z, v);
+      auto const z        = block_sum.load(cuda::std::memory_order_relaxed);
+      auto const v        = block_zeroes.load(cuda::std::memory_order_relaxed);
+      auto const finalize = cuco::hyperloglog_ns::detail::finalizer(this->precision_);
+      estimate            = finalize(z, v);
     }
     group.sync();
 
@@ -373,7 +373,7 @@ class hyperloglog_ref {
    */
   [[nodiscard]] __host__ std::size_t estimate(cuco::cuda_stream_ref stream) const
   {
-    auto const num_regs = 1ull << Precision;
+    auto const num_regs = 1ull << this->precision_;
     thrust::host_vector<register_type> host_sketch(num_regs);
 
     // TODO check if storage is host accessible
@@ -393,8 +393,10 @@ class hyperloglog_ref {
       zeroes += reg == 0;
     }
 
+    auto const finalize = cuco::hyperloglog_ns::detail::finalizer(this->precision_);
+
     // pass intermediate result to finalizer for bias correction, etc.
-    return cuco::hyperloglog_ns::detail::finalizer<Precision>::finalize(sum, zeroes);
+    return finalize(sum, zeroes);
   }
 
   /**
@@ -409,16 +411,33 @@ class hyperloglog_ref {
    *
    * @return The cuda::std::span of the sketch
    */
-  [[nodiscard]] __host__ __device__ auto sketch() const noexcept { return this->sketch_; }
+  [[nodiscard]] __host__ __device__ cuda::std::span<std::byte> sketch() const noexcept
+  {
+    return cuda::std::span<std::byte>(reinterpret_cast<std::byte*>(this->sketch_.data()),
+                                      this->sketch_bytes());
+  }
+
+  /**
+   * @brief Gets the number of bytes required for the sketch storage.
+   *
+   * @return The number of bytes required for the sketch
+   */
+  [[nodiscard]] __host__ __device__ std::size_t sketch_bytes() const noexcept
+  {
+    return (1ull << this->precision_) * sizeof(register_type);
+  }
 
   /**
    * @brief Gets the number of bytes required for the sketch storage.
    *
+   * @param max_sketch_size_kb Upper bound sketch size in KB
+   *
    * @return The number of bytes required for the sketch
    */
-  [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes() noexcept
+  [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes(
+    std::size_t max_sketch_size_kb) noexcept
   {
-    return (1ull << Precision) * sizeof(register_type);
+    return cuda::std::bit_floor(max_sketch_size_kb * 1024);
   }
 
   /**
@@ -474,11 +493,11 @@ class hyperloglog_ref {
                                                shmem_bytes);
   }
 
-  hash_type hash_;  ///< Hash function used to hash items
-  cuda::std::span<register_type, sketch_bytes() / sizeof(register_type)>
-    sketch_;  ///< HLL sketch storage
+  hash_type hash_;                         ///< Hash function used to hash items
+  int32_t precision_;                      ///< HLL precision parameter
+  cuda::std::span<register_type> sketch_;  ///< HLL sketch storage
 
-  template <class T_, int32_t Precision_, cuda::thread_scope Scope_, class Hash_>
+  template <class T_, cuda::thread_scope Scope_, class Hash_>
   friend class hyperloglog_ref;
 };
 }  // namespace cuco::detail
\ No newline at end of file
diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh
index 38f9cbd16..0a2490ad7 100644
--- a/include/cuco/distinct_count_estimator.cuh
+++ b/include/cuco/distinct_count_estimator.cuh
@@ -32,9 +32,6 @@ namespace cuco {
  *
  * @note This implementation is based on the HyperLogLog++ algorithm:
  * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf.
- * @note The `Precision` parameter can be used to trade runtime/memory footprint for better
- * accuracy. A higher value corresponds to a more accurate result, however, setting the precision
- * too high will result in deminishing returns.
  *
  * @tparam T Type of items to count
  * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy
@@ -43,21 +40,18 @@ namespace cuco {
  * @tparam Allocator Type of allocator used for device storage
  */
 template <class T,
-          int32_t Precision        = 11,
           cuda::thread_scope Scope = cuda::thread_scope_device,
           class Hash               = cuco::xxhash_64<T>,
           class Allocator          = cuco::cuda_allocator<std::byte>>
 class distinct_count_estimator {
-  using impl_type = detail::hyperloglog<T, Precision, Scope, Hash, Allocator>;
+  using impl_type = detail::hyperloglog<T, Scope, Hash, Allocator>;
 
  public:
   static constexpr auto thread_scope = impl_type::thread_scope;  ///< CUDA thread scope
-  static constexpr auto precision    = impl_type::precision;     ///< Precision
 
   template <cuda::thread_scope NewScope = thread_scope>
-  using ref_type =
-    cuco::distinct_count_estimator_ref<T, Precision, NewScope, Hash>;  ///< Non-owning reference
-                                                                       ///< type
+  using ref_type = cuco::distinct_count_estimator_ref<T, NewScope, Hash>;  ///< Non-owning reference
+                                                                           ///< type
 
   using value_type     = typename impl_type::value_type;      ///< Type of items to count
   using allocator_type = typename impl_type::allocator_type;  ///< Allocator type
@@ -68,13 +62,15 @@ class distinct_count_estimator {
    *
    * @note This function synchronizes the given stream.
    *
+   * @param max_sketch_size_kb Maximum sketch size in KB
    * @param hash The hash function used to hash items
    * @param alloc Allocator used for allocating device storage
    * @param stream CUDA stream used to initialize the object
    */
-  constexpr distinct_count_estimator(Hash const& hash             = {},
-                                     Allocator const& alloc       = {},
-                                     cuco::cuda_stream_ref stream = {});
+  constexpr distinct_count_estimator(std::size_t max_sketch_size_kb = 32,
+                                     Hash const& hash               = {},
+                                     Allocator const& alloc         = {},
+                                     cuco::cuda_stream_ref stream   = {});
 
   ~distinct_count_estimator() = default;
 
@@ -148,9 +144,8 @@ class distinct_count_estimator {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope, class OtherAllocator>
-  void merge_async(
-    distinct_count_estimator<T, Precision, OtherScope, Hash, OtherAllocator> const& other,
-    cuco::cuda_stream_ref stream = {}) noexcept;
+  void merge_async(distinct_count_estimator<T, OtherScope, Hash, OtherAllocator> const& other,
+                   cuco::cuda_stream_ref stream = {}) noexcept;
 
   /**
    * @brief Merges the result of `other` estimator into `*this` estimator.
@@ -165,7 +160,7 @@ class distinct_count_estimator {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope, class OtherAllocator>
-  void merge(distinct_count_estimator<T, Precision, OtherScope, Hash, OtherAllocator> const& other,
+  void merge(distinct_count_estimator<T, OtherScope, Hash, OtherAllocator> const& other,
              cuco::cuda_stream_ref stream = {});
 
   /**
@@ -223,14 +218,23 @@ class distinct_count_estimator {
    *
    * @return The cuda::std::span of the sketch
    */
-  [[nodiscard]] auto sketch() const noexcept;
+  [[nodiscard]] cuda::std::span<std::byte> sketch() const noexcept;
 
   /**
    * @brief Gets the number of bytes required for the sketch storage.
    *
    * @return The number of bytes required for the sketch
    */
-  [[nodiscard]] static constexpr std::size_t sketch_bytes() noexcept;
+  [[nodiscard]] constexpr std::size_t sketch_bytes() const noexcept;
+
+  /**
+   * @brief Gets the number of bytes required for the sketch storage.
+   *
+   * @param max_sketch_size_kb Upper bound sketch size in KB
+   *
+   * @return The number of bytes required for the sketch
+   */
+  [[nodiscard]] static constexpr std::size_t sketch_bytes(std::size_t max_sketch_size_kb) noexcept;
 
   /**
    * @brief Gets the alignment required for the sketch storage.
diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh
index 905a6d379..65f899723 100644
--- a/include/cuco/distinct_count_estimator_ref.cuh
+++ b/include/cuco/distinct_count_estimator_ref.cuh
@@ -30,29 +30,23 @@ namespace cuco {
  *
  * @note This implementation is based on the HyperLogLog++ algorithm:
  * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf.
- * @note The `Precision` parameter can be used to trade runtime/memory footprint for better
- * accuracy. A higher value corresponds to a more accurate result, however, setting the precision
- * too high will result in deminishing results.
  *
  * @tparam T Type of items to count
- * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy
  * @tparam Scope The scope in which operations will be performed by individual threads
  * @tparam Hash Hash function used to hash items
  */
-template <class T, int32_t Precision, cuda::thread_scope Scope, class Hash>
+template <class T, cuda::thread_scope Scope, class Hash>
 class distinct_count_estimator_ref {
-  using impl_type = detail::hyperloglog_ref<T, Precision, Scope, Hash>;
+  using impl_type = detail::hyperloglog_ref<T, Scope, Hash>;
 
  public:
   static constexpr auto thread_scope = impl_type::thread_scope;  ///< CUDA thread scope
-  static constexpr auto precision    = impl_type::precision;     ///< Precision
 
   using value_type = typename impl_type::value_type;  ///< Type of items to count
 
   template <cuda::thread_scope NewScope>
-  using with_scope =
-    distinct_count_estimator_ref<T, Precision, NewScope, Hash>;  ///< Ref type with different thread
-                                                                 ///< scope
+  using with_scope = distinct_count_estimator_ref<T, NewScope, Hash>;  ///< Ref type with different
+                                                                       ///< thread scope
 
   // TODO let storage_type be inferred?
   /**
@@ -61,9 +55,8 @@ class distinct_count_estimator_ref {
    * @param sketch_span Reference to sketch storage
    * @param hash The hash function used to hash items
    */
-  template <class U, std::size_t N>
-  __host__ __device__ constexpr distinct_count_estimator_ref(cuda::std::span<U, N> sketch_span,
-                                                             Hash const& hash = {}) noexcept;
+  __host__ __device__ constexpr distinct_count_estimator_ref(cuda::std::span<std::byte> sketch_span,
+                                                             Hash const& hash = {});
 
   /**
    * @brief Resets the estimator, i.e., clears the current count estimate.
@@ -140,9 +133,8 @@ class distinct_count_estimator_ref {
    * @param other Other estimator reference to be merged into `*this`
    */
   template <class CG, cuda::thread_scope OtherScope>
-  __device__ void merge(
-    CG const& group,
-    distinct_count_estimator_ref<T, Precision, OtherScope, Hash> const& other) noexcept;
+  __device__ void merge(CG const& group,
+                        distinct_count_estimator_ref<T, OtherScope, Hash> const& other) noexcept;
 
   /**
    * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator.
@@ -153,9 +145,8 @@ class distinct_count_estimator_ref {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  __host__ void merge_async(
-    distinct_count_estimator_ref<T, Precision, OtherScope, Hash> const& other,
-    cuco::cuda_stream_ref stream = {}) noexcept;
+  __host__ void merge_async(distinct_count_estimator_ref<T, OtherScope, Hash> const& other,
+                            cuco::cuda_stream_ref stream = {}) noexcept;
 
   /**
    * @brief Merges the result of `other` estimator reference into `*this` estimator.
@@ -169,7 +160,7 @@ class distinct_count_estimator_ref {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  __host__ void merge(distinct_count_estimator_ref<T, Precision, OtherScope, Hash> const& other,
+  __host__ void merge(distinct_count_estimator_ref<T, OtherScope, Hash> const& other,
                       cuco::cuda_stream_ref stream = {});
 
   /**
@@ -205,14 +196,24 @@ class distinct_count_estimator_ref {
    *
    * @return The cuda::std::span of the sketch
    */
-  [[nodiscard]] __host__ __device__ auto sketch() const noexcept;
+  [[nodiscard]] __host__ __device__ cuda::std::span<std::byte> sketch() const noexcept;
 
   /**
    * @brief Gets the number of bytes required for the sketch storage.
    *
    * @return The number of bytes required for the sketch
    */
-  [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes() noexcept;
+  [[nodiscard]] __host__ __device__ constexpr std::size_t sketch_bytes() const noexcept;
+
+  /**
+   * @brief Gets the number of bytes required for the sketch storage.
+   *
+   * @param max_sketch_size_kb Upper bound sketch size in KB
+   *
+   * @return The number of bytes required for the sketch
+   */
+  [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes(
+    std::size_t max_sketch_size_kb) noexcept;
 
   /**
    * @brief Gets the alignment required for the sketch storage.
@@ -224,7 +225,7 @@ class distinct_count_estimator_ref {
  private:
   impl_type impl_;  ///< Implementation object
 
-  template <class T_, int32_t Precision_, cuda::thread_scope Scope_, class Hash_>
+  template <class T_, cuda::thread_scope Scope_, class Hash_>
   friend class distinct_count_estimator_ref;
 };
 }  // namespace cuco
diff --git a/tests/distinct_count_estimator/unique_sequence_test.cu b/tests/distinct_count_estimator/unique_sequence_test.cu
index 9ebbc6291..fffdd751b 100644
--- a/tests/distinct_count_estimator/unique_sequence_test.cu
+++ b/tests/distinct_count_estimator/unique_sequence_test.cu
@@ -32,24 +32,9 @@
 TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence",
                        "",
                        ((typename T, int32_t Precision, typename Hash), T, Precision, Hash),
-                       (int32_t, 9, cuco::xxhash_64<int32_t>),
-                       (int32_t, 11, cuco::xxhash_64<int32_t>),
-                       (int32_t, 13, cuco::xxhash_64<int32_t>),
-                       (int32_t, 16, cuco::xxhash_64<int32_t>),
-                       (int32_t, 18, cuco::xxhash_64<int32_t>),
-                       (int32_t, 20, cuco::xxhash_64<int32_t>),
-                       (int64_t, 9, cuco::xxhash_64<int64_t>),
-                       (int64_t, 11, cuco::xxhash_64<int64_t>),
-                       (int64_t, 13, cuco::xxhash_64<int64_t>),
-                       (int64_t, 16, cuco::xxhash_64<int64_t>),
-                       (int64_t, 18, cuco::xxhash_64<int64_t>),
-                       (int64_t, 20, cuco::xxhash_64<int64_t>),
-                       (__int128_t, 9, cuco::xxhash_64<__int128_t>),
-                       (__int128_t, 11, cuco::xxhash_64<__int128_t>),
-                       (__int128_t, 13, cuco::xxhash_64<__int128_t>),
-                       (__int128_t, 16, cuco::xxhash_64<__int128_t>),
-                       (__int128_t, 18, cuco::xxhash_64<__int128_t>),
-                       (__int128_t, 20, cuco::xxhash_64<__int128_t>))
+                       (int32_t, cuco::xxhash_64<int32_t>),
+                       (int64_t, cuco::xxhash_64<int64_t>),
+                       (__int128_t, cuco::xxhash_64<__int128_t>))
 {
   // This factor determines the error threshold for passing the test
   // TODO might be too high
@@ -59,6 +44,8 @@ TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence",
     1.04 / std::sqrt(static_cast<double>(1ull << Precision));
 
   auto num_items_pow2 = GENERATE(25, 26, 28);
+  auto sketch_size_kb = GENERATE(2, 8, 32, 256, 1024, 4096);
+  INFO("sketch_size_kb=" << sketch_size_kb);
   INFO("num_items=2^" << num_items_pow2);
   auto num_items = 1ull << num_items_pow2;
 
@@ -68,7 +55,7 @@ TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence",
   thrust::sequence(items.begin(), items.end(), 0);
 
   // Initialize the estimator
-  cuco::distinct_count_estimator<T, Precision, cuda::thread_scope_device, Hash> estimator;
+  cuco::distinct_count_estimator<T, cuda::thread_scope_device, Hash> estimator(sketch_size_kb);
 
   REQUIRE(estimator.estimate() == 0);
 

From 65ff70a5a6d128958e1af2237bf4422d2d4bed42 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Mon, 18 Mar 2024 23:27:50 +0000
Subject: [PATCH 37/78] Pre-compute register mask

---
 include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index 5016fb21f..af450d080 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -54,6 +54,8 @@ class hyperloglog_ref {
   using register_type = int;  ///< Register array storage
   // We use `int` here since this is the smallest type that supports native `atomicMax` on GPUs
   using fp_type = float;  ///< Floating point type used for reduction
+  using hash_value_type =
+    decltype(cuda::std::declval<Hash>()(cuda::std::declval<T>()));  ///< Hash value type
  public:
   static constexpr auto thread_scope = Scope;  ///< CUDA thread scope
 
@@ -75,6 +77,7 @@ class hyperloglog_ref {
     : hash_{hash},
       precision_{cuda::std::countr_zero(this->sketch_bytes(sketch_span.size() / 1024) /
                                         sizeof(register_type))},
+      register_mask_{(1ull << this->precision_) - 1},
       sketch_{reinterpret_cast<register_type*>(sketch_span.data()),
               this->sketch_bytes() / sizeof(register_type)}
   {
@@ -128,11 +131,9 @@ class hyperloglog_ref {
    */
   __device__ void add(T const& item) noexcept
   {
-    using hash_value_type = decltype(cuda::std::declval<hash_type>()(cuda::std::declval<T>()));
-    hash_value_type const register_mask = (1ull << this->precision_) - 1;
-    auto const h                        = this->hash_(item);
-    auto const reg                      = h & register_mask;
-    auto const zeroes                   = cuda::std::countl_zero(h | register_mask) + 1;  // __clz
+    auto const h      = this->hash_(item);
+    auto const reg    = h & this->register_mask_;
+    auto const zeroes = cuda::std::countl_zero(h | this->register_mask_) + 1;  // __clz
 
     this->update_max(reg, zeroes);
   }
@@ -495,6 +496,7 @@ class hyperloglog_ref {
 
   hash_type hash_;                         ///< Hash function used to hash items
   int32_t precision_;                      ///< HLL precision parameter
+  hash_value_type register_mask_;          ///< Mask used to separate register index from count
   cuda::std::span<register_type> sketch_;  ///< HLL sketch storage
 
   template <class T_, cuda::thread_scope Scope_, class Hash_>

From 806879922d320f636c233b00b5e6c762d85100fd Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Tue, 19 Mar 2024 16:12:01 +0000
Subject: [PATCH 38/78] Fix unit test

---
 .../unique_sequence_test.cu                    | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/tests/distinct_count_estimator/unique_sequence_test.cu b/tests/distinct_count_estimator/unique_sequence_test.cu
index fffdd751b..2addf3bee 100644
--- a/tests/distinct_count_estimator/unique_sequence_test.cu
+++ b/tests/distinct_count_estimator/unique_sequence_test.cu
@@ -31,23 +31,25 @@
 
 TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence",
                        "",
-                       ((typename T, int32_t Precision, typename Hash), T, Precision, Hash),
+                       ((typename T, typename Hash), T, Hash),
                        (int32_t, cuco::xxhash_64<int32_t>),
                        (int64_t, cuco::xxhash_64<int64_t>),
                        (__int128_t, cuco::xxhash_64<__int128_t>))
 {
+  auto num_items_pow2 = GENERATE(25, 26, 28);
+  auto hll_precision  = GENERATE(8, 10, 12, 13, 18, 20);
+  auto sketch_size_kb = 4 * (1ull << hll_precision) / 1024;
+  INFO("hll_precision=" << hll_precision);
+  INFO("sketch_size_kb=" << sketch_size_kb);
+  INFO("num_items=2^" << num_items_pow2);
+  auto num_items = 1ull << num_items_pow2;
+
   // This factor determines the error threshold for passing the test
   // TODO might be too high
   double constexpr tolerance_factor = 2.5;
   // RSD for a given precision is given by the following formula
   double const relative_standard_deviation =
-    1.04 / std::sqrt(static_cast<double>(1ull << Precision));
-
-  auto num_items_pow2 = GENERATE(25, 26, 28);
-  auto sketch_size_kb = GENERATE(2, 8, 32, 256, 1024, 4096);
-  INFO("sketch_size_kb=" << sketch_size_kb);
-  INFO("num_items=2^" << num_items_pow2);
-  auto num_items = 1ull << num_items_pow2;
+    1.04 / std::sqrt(static_cast<double>(1ull << hll_precision));
 
   thrust::device_vector<T> items(num_items);
 

From 04c303dcbe435f82237a4d382c8c74d2880735ba Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 20 Mar 2024 13:21:48 +0000
Subject: [PATCH 39/78] Add sketch_size_kb strong type and fix stupid bug where
 I called a static member function with this->

---
 benchmarks/distinct_count_estimator_bench.cu  |  2 +-
 .../host_bulk_example.cu                      |  1 -
 .../distinct_count_estimator.inl              |  4 +-
 .../distinct_count_estimator_ref.inl          |  3 +-
 .../cuco/detail/hyperloglog/hyperloglog.cuh   | 12 ++--
 .../detail/hyperloglog/hyperloglog_ref.cuh    | 10 ++--
 include/cuco/distinct_count_estimator.cuh     | 13 +++--
 include/cuco/distinct_count_estimator_ref.cuh |  7 ++-
 include/cuco/sketch_size.hpp                  | 55 +++++++++++++++++++
 .../unique_sequence_test.cu                   |  3 +-
 10 files changed, 87 insertions(+), 23 deletions(-)
 create mode 100644 include/cuco/sketch_size.hpp

diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu
index 9b3ba02c0..07135a999 100644
--- a/benchmarks/distinct_count_estimator_bench.cu
+++ b/benchmarks/distinct_count_estimator_bench.cu
@@ -67,7 +67,7 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list<Esti
   state.add_element_count(num_items);
   state.add_global_memory_reads<T>(num_items, "InputSize");
 
-  Estimator estimator(sketch_size_kb);
+  Estimator estimator(cuco::sketch_size_kb(sketch_size_kb));
   estimator.add(items.begin(), items.end());
 
   double estimated_cardinality  = estimator.estimate();
diff --git a/examples/distinct_count_estimator/host_bulk_example.cu b/examples/distinct_count_estimator/host_bulk_example.cu
index 9e60ae47b..add3cb626 100644
--- a/examples/distinct_count_estimator/host_bulk_example.cu
+++ b/examples/distinct_count_estimator/host_bulk_example.cu
@@ -25,7 +25,6 @@
  * @file host_bulk_example.cu
  * @brief Demonstrates usage of `cuco::distinct_count_estimator` "bulk" host APIs.
  */
-
 int main(void)
 {
   using T                         = int;
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
index 54806aba6..be3fcfb9e 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
@@ -18,7 +18,7 @@ namespace cuco {
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 constexpr distinct_count_estimator<T, Scope, Hash, Allocator>::distinct_count_estimator(
-  std::size_t max_sketch_size_kb,
+  cuco::sketch_size_kb max_sketch_size_kb,
   Hash const& hash,
   Allocator const& alloc,
   cuco::cuda_stream_ref stream)
@@ -125,7 +125,7 @@ constexpr size_t distinct_count_estimator<T, Scope, Hash, Allocator>::sketch_byt
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 constexpr size_t distinct_count_estimator<T, Scope, Hash, Allocator>::sketch_bytes(
-  size_t max_sketch_size_kb) noexcept
+  cuco::sketch_size_kb max_sketch_size_kb) noexcept
 {
   return impl_type::sketch_bytes(max_sketch_size_kb);
 }
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
index d0cf85475..90f609b5d 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
@@ -129,7 +129,8 @@ distinct_count_estimator_ref<T, Scope, Hash>::sketch_bytes() const noexcept
 
 template <class T, cuda::thread_scope Scope, class Hash>
 __host__ __device__ constexpr std::size_t
-distinct_count_estimator_ref<T, Scope, Hash>::sketch_bytes(std::size_t max_sketch_size_kb) noexcept
+distinct_count_estimator_ref<T, Scope, Hash>::sketch_bytes(
+  cuco::sketch_size_kb max_sketch_size_kb) noexcept
 {
   return impl_type::sketch_bytes(max_sketch_size_kb);
 }
diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index 159afeb99..b8e5c5db2 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -20,6 +20,7 @@
 #include <cuco/detail/hyperloglog/hyperloglog_ref.cuh>
 #include <cuco/detail/storage/storage_base.cuh>
 #include <cuco/hash_functions.cuh>
+#include <cuco/sketch_size.hpp>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
 #include <cstddef>
@@ -63,14 +64,14 @@ class hyperloglog {
    * @param alloc Allocator used for allocating device storage
    * @param stream CUDA stream used to initialize the object
    */
-  constexpr hyperloglog(std::size_t max_sketch_size_kb,
+  constexpr hyperloglog(cuco::sketch_size_kb max_sketch_size_kb,
                         Hash const& hash,
                         Allocator const& alloc,
                         cuco::cuda_stream_ref stream)
     : allocator_{alloc},
-      deleter_{this->sketch_bytes(max_sketch_size_kb), this->allocator_},
-      sketch_{this->allocator_.allocate(this->sketch_bytes(max_sketch_size_kb)), this->deleter_},
-      ref_{cuda::std::span{this->sketch_.get(), this->sketch_bytes(max_sketch_size_kb)}, hash}
+      deleter_{sketch_bytes(max_sketch_size_kb), this->allocator_},
+      sketch_{this->allocator_.allocate(sketch_bytes(max_sketch_size_kb)), this->deleter_},
+      ref_{cuda::std::span{this->sketch_.get(), sketch_bytes(max_sketch_size_kb)}, hash}
   {
     this->ref_.clear_async(stream);
   }
@@ -260,7 +261,8 @@ class hyperloglog {
    *
    * @return The number of bytes required for the sketch
    */
-  [[nodiscard]] static constexpr std::size_t sketch_bytes(std::size_t max_sketch_size_kb) noexcept
+  [[nodiscard]] static constexpr std::size_t sketch_bytes(
+    cuco::sketch_size_kb max_sketch_size_kb) noexcept
   {
     return ref_type<>::sketch_bytes(max_sketch_size_kb);
   }
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index af450d080..75b95bf4a 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -22,6 +22,7 @@
 #include <cuco/detail/hyperloglog/kernels.cuh>
 #include <cuco/detail/utils.hpp>
 #include <cuco/hash_functions.cuh>
+#include <cuco/sketch_size.hpp>
 #include <cuco/utility/cuda_thread_scope.cuh>
 #include <cuco/utility/traits.hpp>
 
@@ -75,8 +76,9 @@ class hyperloglog_ref {
   __host__ __device__ constexpr hyperloglog_ref(cuda::std::span<std::byte> sketch_span,
                                                 Hash const& hash)
     : hash_{hash},
-      precision_{cuda::std::countr_zero(this->sketch_bytes(sketch_span.size() / 1024) /
-                                        sizeof(register_type))},
+      precision_{cuda::std::countr_zero(
+        sketch_bytes(cuco::sketch_size_kb(static_cast<double>(sketch_span.size() / 1024))) /
+        sizeof(register_type))},
       register_mask_{(1ull << this->precision_) - 1},
       sketch_{reinterpret_cast<register_type*>(sketch_span.data()),
               this->sketch_bytes() / sizeof(register_type)}
@@ -436,9 +438,9 @@ class hyperloglog_ref {
    * @return The number of bytes required for the sketch
    */
   [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes(
-    std::size_t max_sketch_size_kb) noexcept
+    cuco::sketch_size_kb max_sketch_size_kb) noexcept
   {
-    return cuda::std::bit_floor(max_sketch_size_kb * 1024);
+    return cuda::std::bit_floor(static_cast<std::size_t>(max_sketch_size_kb * 1024));
   }
 
   /**
diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh
index 0a2490ad7..9860d7246 100644
--- a/include/cuco/distinct_count_estimator.cuh
+++ b/include/cuco/distinct_count_estimator.cuh
@@ -19,6 +19,7 @@
 #include <cuco/detail/hyperloglog/hyperloglog.cuh>
 #include <cuco/distinct_count_estimator_ref.cuh>
 #include <cuco/hash_functions.cuh>
+#include <cuco/sketch_size.hpp>
 #include <cuco/utility/allocator.hpp>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
@@ -34,7 +35,6 @@ namespace cuco {
  * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf.
  *
  * @tparam T Type of items to count
- * @tparam Precision Tuning parameter to trade runtime/memory footprint for better accuracy
  * @tparam Scope The scope in which operations will be performed by individual threads
  * @tparam Hash Hash function used to hash items
  * @tparam Allocator Type of allocator used for device storage
@@ -67,10 +67,10 @@ class distinct_count_estimator {
    * @param alloc Allocator used for allocating device storage
    * @param stream CUDA stream used to initialize the object
    */
-  constexpr distinct_count_estimator(std::size_t max_sketch_size_kb = 32,
-                                     Hash const& hash               = {},
-                                     Allocator const& alloc         = {},
-                                     cuco::cuda_stream_ref stream   = {});
+  constexpr distinct_count_estimator(cuco::sketch_size_kb max_sketch_size_kb = 32_KB,
+                                     Hash const& hash                        = {},
+                                     Allocator const& alloc                  = {},
+                                     cuco::cuda_stream_ref stream            = {});
 
   ~distinct_count_estimator() = default;
 
@@ -234,7 +234,8 @@ class distinct_count_estimator {
    *
    * @return The number of bytes required for the sketch
    */
-  [[nodiscard]] static constexpr std::size_t sketch_bytes(std::size_t max_sketch_size_kb) noexcept;
+  [[nodiscard]] static constexpr std::size_t sketch_bytes(
+    cuco::sketch_size_kb max_sketch_size_kb) noexcept;
 
   /**
    * @brief Gets the alignment required for the sketch storage.
diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh
index 65f899723..feb2ac6d4 100644
--- a/include/cuco/distinct_count_estimator_ref.cuh
+++ b/include/cuco/distinct_count_estimator_ref.cuh
@@ -18,6 +18,7 @@
 #include <cuco/cuda_stream_ref.hpp>
 #include <cuco/detail/hyperloglog/hyperloglog_ref.cuh>
 #include <cuco/hash_functions.cuh>
+#include <cuco/sketch_size.hpp>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
 #include <cooperative_groups.h>
@@ -35,7 +36,9 @@ namespace cuco {
  * @tparam Scope The scope in which operations will be performed by individual threads
  * @tparam Hash Hash function used to hash items
  */
-template <class T, cuda::thread_scope Scope, class Hash>
+template <class T,
+          cuda::thread_scope Scope = cuda::thread_scope_device,
+          class Hash               = cuco::xxhash_64<T>>
 class distinct_count_estimator_ref {
   using impl_type = detail::hyperloglog_ref<T, Scope, Hash>;
 
@@ -213,7 +216,7 @@ class distinct_count_estimator_ref {
    * @return The number of bytes required for the sketch
    */
   [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes(
-    std::size_t max_sketch_size_kb) noexcept;
+    cuco::sketch_size_kb max_sketch_size_kb) noexcept;
 
   /**
    * @brief Gets the alignment required for the sketch storage.
diff --git a/include/cuco/sketch_size.hpp b/include/cuco/sketch_size.hpp
new file mode 100644
index 000000000..f9dce1aed
--- /dev/null
+++ b/include/cuco/sketch_size.hpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+namespace cuco {
+
+/**
+ * @brief Strong type for specifying the sketch size of cuco::distinct_count_estimator(_ref) in KB.
+ *
+ * Values can also be given as literals, e.g., 64.3_KB
+ */
+class sketch_size_kb {
+ public:
+  /**
+   * @brief Constructs a sketch_size_kb object.
+   *
+   * @param value The size of a sketch given in KB
+   */
+  __host__ __device__ explicit constexpr sketch_size_kb(double value) noexcept : value_{value} {}
+
+  /**
+   * @brief Conversion to value type.
+   *
+   * @return Sketch size in KB
+   */
+  __host__ __device__ constexpr operator double() const noexcept { return this->value_; }
+
+ private:
+  double value_;  ///< Sketch size in KB
+};
+}  // namespace cuco
+
+// User-defined literal operators for cuco::sketch_size_kb
+__host__ __device__ constexpr cuco::sketch_size_kb operator""_KB(long double value)
+{
+  return cuco::sketch_size_kb{static_cast<double>(value)};
+}
+
+__host__ __device__ constexpr cuco::sketch_size_kb operator""_KB(unsigned long long int value)
+{
+  return cuco::sketch_size_kb{static_cast<double>(value)};
+}
\ No newline at end of file
diff --git a/tests/distinct_count_estimator/unique_sequence_test.cu b/tests/distinct_count_estimator/unique_sequence_test.cu
index 2addf3bee..7d6321de6 100644
--- a/tests/distinct_count_estimator/unique_sequence_test.cu
+++ b/tests/distinct_count_estimator/unique_sequence_test.cu
@@ -57,7 +57,8 @@ TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence",
   thrust::sequence(items.begin(), items.end(), 0);
 
   // Initialize the estimator
-  cuco::distinct_count_estimator<T, cuda::thread_scope_device, Hash> estimator(sketch_size_kb);
+  cuco::distinct_count_estimator<T, cuda::thread_scope_device, Hash> estimator{
+    cuco::sketch_size_kb(sketch_size_kb)};
 
   REQUIRE(estimator.estimate() == 0);
 

From a7036ae06f0b7cafe4b8cbb253df6f89673f49a1 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 20 Mar 2024 13:29:29 +0000
Subject: [PATCH 40/78] Fix benchmark

---
 benchmarks/distinct_count_estimator_bench.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu
index 07135a999..9bc3fc514 100644
--- a/benchmarks/distinct_count_estimator_bench.cu
+++ b/benchmarks/distinct_count_estimator_bench.cu
@@ -67,7 +67,7 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list<Esti
   state.add_element_count(num_items);
   state.add_global_memory_reads<T>(num_items, "InputSize");
 
-  Estimator estimator(cuco::sketch_size_kb(sketch_size_kb));
+  Estimator estimator{cuco::sketch_size_kb(sketch_size_kb)};
   estimator.add(items.begin(), items.end());
 
   double estimated_cardinality  = estimator.estimate();
@@ -111,7 +111,7 @@ void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list<Esti
   state.add_element_count(num_items);
   state.add_global_memory_reads<T>(num_items, "InputSize");
 
-  Estimator estimator(sketch_size_kb);
+  Estimator estimator{cuco::sketch_size_kb(sketch_size_kb)};
   state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
     estimator.clear_async({launch.get_stream()});
 

From 3e25da738f762231aca8fcbf737b99f8aad6d96a Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 20 Mar 2024 14:17:41 +0000
Subject: [PATCH 41/78] More robust error estimation in benchmark

---
 benchmarks/distinct_count_estimator_bench.cu | 60 +++++++++++++-------
 1 file changed, 40 insertions(+), 20 deletions(-)

diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu
index 9bc3fc514..a75411002 100644
--- a/benchmarks/distinct_count_estimator_bench.cu
+++ b/benchmarks/distinct_count_estimator_bench.cu
@@ -48,6 +48,31 @@ template <typename InputIt>
   return set.size();
 }
 
+template <class Estimator, class Dist>
+[[nodiscard]] double relative_error(nvbench::state& state, std::size_t num_samples = 5)
+{
+  using T = typename Estimator::value_type;
+
+  auto const num_items      = state.get_int64("NumInputs");
+  auto const sketch_size_kb = state.get_int64("SketchSizeKB");
+
+  thrust::device_vector<T> items(num_items);
+
+  key_generator gen;
+  Estimator estimator{cuco::sketch_size_kb(sketch_size_kb)};
+  double error_sum = 0;
+  for (std::size_t i = 0; i < num_samples; ++i) {
+    gen.generate(dist_from_state<Dist>(state), items.begin(), items.end());
+    estimator.add(items.begin(), items.end());
+    double estimated_cardinality = estimator.estimate();
+    double true_cardinality      = exact_distinct_count(items.begin(), num_items);
+    error_sum += abs(true_cardinality - estimated_cardinality) / true_cardinality;
+    estimator.clear();
+  }
+
+  return error_sum / num_samples;
+}
+
 /**
  * @brief A benchmark evaluating `cuco::distinct_count_estimator` end-to-end performance
  */
@@ -59,36 +84,31 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list<Esti
   auto const num_items      = state.get_int64("NumInputs");
   auto const sketch_size_kb = state.get_int64("SketchSizeKB");
 
-  thrust::device_vector<T> items(num_items);
-
-  key_generator gen;
-  gen.generate(dist_from_state<Dist>(state), items.begin(), items.end());
-
   state.add_element_count(num_items);
   state.add_global_memory_reads<T>(num_items, "InputSize");
 
-  Estimator estimator{cuco::sketch_size_kb(sketch_size_kb)};
-  estimator.add(items.begin(), items.end());
+  auto const err = relative_error<Estimator, Dist>(state);
+  auto& summ     = state.add_summary("MeanRelativeError");
+  summ.set_string("hint", "MRelErr");
+  summ.set_string("short_name", "MeanRelativeError");
+  summ.set_string("description", "Mean relatve approximation error.");
+  summ.set_float64("value", err);
 
-  double estimated_cardinality  = estimator.estimate();
-  double const true_cardinality = exact_distinct_count(items.begin(), num_items);
-  auto const relative_error     = abs(true_cardinality - estimated_cardinality) / true_cardinality;
+  thrust::device_vector<T> items(num_items);
 
-  auto& summ = state.add_summary("RelativeError");
-  summ.set_string("hint", "RelErr");
-  summ.set_string("short_name", "RelativeError");
-  summ.set_string("description", "Relatve approximation error.");
-  summ.set_float64("value", relative_error);
+  key_generator gen;
+  gen.generate(dist_from_state<Dist>(state), items.begin(), items.end());
 
-  estimator.clear();
+  Estimator estimator{cuco::sketch_size_kb(sketch_size_kb)};
+  std::size_t estimated_cardinality = 0;
   state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
              [&](nvbench::launch& launch, auto& timer) {
-               estimator.clear_async({launch.get_stream()});
-
                timer.start();
                estimator.add_async(items.begin(), items.end(), {launch.get_stream()});
                estimated_cardinality = estimator.estimate({launch.get_stream()});
                timer.stop();
+
+               estimator.clear_async({launch.get_stream()});
              });
 }
 
@@ -113,11 +133,11 @@ void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list<Esti
 
   Estimator estimator{cuco::sketch_size_kb(sketch_size_kb)};
   state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
-    estimator.clear_async({launch.get_stream()});
-
     timer.start();
     estimator.add_async(items.begin(), items.end(), {launch.get_stream()});
     timer.stop();
+
+    estimator.clear_async({launch.get_stream()});
   });
 }
 

From e5d51125c398ada35bce3c64d05f1fdc28de2648 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 20 Mar 2024 15:01:11 +0000
Subject: [PATCH 42/78] Benchmark gmem fallback kernel

---
 benchmarks/distinct_count_estimator_bench.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu
index a75411002..c5ae8a6b3 100644
--- a/benchmarks/distinct_count_estimator_bench.cu
+++ b/benchmarks/distinct_count_estimator_bench.cu
@@ -158,5 +158,5 @@ NVBENCH_BENCH_TYPES(distinct_count_estimator_add,
   .set_name("distinct_count_estimator::add_async")
   .set_type_axes_names({"Estimator", "Distribution"})
   .add_int64_power_of_two_axis("NumInputs", {28, 29, 30})
-  .add_int64_axis("SketchSizeKB", {8, 16, 32})
+  .add_int64_axis("SketchSizeKB", {8, 16, 32, 256})  // 256KB uses gmem fallback kernel
   .set_max_noise(defaults::MAX_NOISE);
\ No newline at end of file

From 99c0dee0347a8c71cf16206fbcdc8626e0141a2f Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 20 Mar 2024 15:04:58 +0000
Subject: [PATCH 43/78] Rename max_sketch_size_kb -> sketch_size_kb

---
 .../distinct_count_estimator.inl                 |  8 ++++----
 .../distinct_count_estimator_ref.inl             |  4 ++--
 include/cuco/detail/hyperloglog/hyperloglog.cuh  | 16 ++++++++--------
 .../cuco/detail/hyperloglog/hyperloglog_ref.cuh  |  6 +++---
 include/cuco/distinct_count_estimator.cuh        | 14 +++++++-------
 include/cuco/distinct_count_estimator_ref.cuh    |  4 ++--
 6 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
index be3fcfb9e..21128de9e 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
@@ -18,11 +18,11 @@ namespace cuco {
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 constexpr distinct_count_estimator<T, Scope, Hash, Allocator>::distinct_count_estimator(
-  cuco::sketch_size_kb max_sketch_size_kb,
+  cuco::sketch_size_kb sketch_size_kb,
   Hash const& hash,
   Allocator const& alloc,
   cuco::cuda_stream_ref stream)
-  : impl_{std::make_unique<impl_type>(max_sketch_size_kb, hash, alloc, stream)}
+  : impl_{std::make_unique<impl_type>(sketch_size_kb, hash, alloc, stream)}
 {
 }
 
@@ -125,9 +125,9 @@ constexpr size_t distinct_count_estimator<T, Scope, Hash, Allocator>::sketch_byt
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 constexpr size_t distinct_count_estimator<T, Scope, Hash, Allocator>::sketch_bytes(
-  cuco::sketch_size_kb max_sketch_size_kb) noexcept
+  cuco::sketch_size_kb sketch_size_kb) noexcept
 {
-  return impl_type::sketch_bytes(max_sketch_size_kb);
+  return impl_type::sketch_bytes(sketch_size_kb);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
index 90f609b5d..53beb5016 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
@@ -130,9 +130,9 @@ distinct_count_estimator_ref<T, Scope, Hash>::sketch_bytes() const noexcept
 template <class T, cuda::thread_scope Scope, class Hash>
 __host__ __device__ constexpr std::size_t
 distinct_count_estimator_ref<T, Scope, Hash>::sketch_bytes(
-  cuco::sketch_size_kb max_sketch_size_kb) noexcept
+  cuco::sketch_size_kb sketch_size_kb) noexcept
 {
-  return impl_type::sketch_bytes(max_sketch_size_kb);
+  return impl_type::sketch_bytes(sketch_size_kb);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index b8e5c5db2..5c0d07833 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -59,19 +59,19 @@ class hyperloglog {
    *
    * @note This function synchronizes the given stream.
    *
-   * @param max_sketch_size_kb Maximum sketch size in KB
+   * @param sketch_size_kb Maximum sketch size in KB
    * @param hash The hash function used to hash items
    * @param alloc Allocator used for allocating device storage
    * @param stream CUDA stream used to initialize the object
    */
-  constexpr hyperloglog(cuco::sketch_size_kb max_sketch_size_kb,
+  constexpr hyperloglog(cuco::sketch_size_kb sketch_size_kb,
                         Hash const& hash,
                         Allocator const& alloc,
                         cuco::cuda_stream_ref stream)
     : allocator_{alloc},
-      deleter_{sketch_bytes(max_sketch_size_kb), this->allocator_},
-      sketch_{this->allocator_.allocate(sketch_bytes(max_sketch_size_kb)), this->deleter_},
-      ref_{cuda::std::span{this->sketch_.get(), sketch_bytes(max_sketch_size_kb)}, hash}
+      deleter_{sketch_bytes(sketch_size_kb), this->allocator_},
+      sketch_{this->allocator_.allocate(sketch_bytes(sketch_size_kb)), this->deleter_},
+      ref_{cuda::std::span{this->sketch_.get(), sketch_bytes(sketch_size_kb)}, hash}
   {
     this->ref_.clear_async(stream);
   }
@@ -257,14 +257,14 @@ class hyperloglog {
   /**
    * @brief Gets the number of bytes required for the sketch storage.
    *
-   * @param max_sketch_size_kb Upper bound sketch size in KB
+   * @param sketch_size_kb Upper bound sketch size in KB
    *
    * @return The number of bytes required for the sketch
    */
   [[nodiscard]] static constexpr std::size_t sketch_bytes(
-    cuco::sketch_size_kb max_sketch_size_kb) noexcept
+    cuco::sketch_size_kb sketch_size_kb) noexcept
   {
-    return ref_type<>::sketch_bytes(max_sketch_size_kb);
+    return ref_type<>::sketch_bytes(sketch_size_kb);
   }
 
   /**
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index 75b95bf4a..06adaf78b 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -433,14 +433,14 @@ class hyperloglog_ref {
   /**
    * @brief Gets the number of bytes required for the sketch storage.
    *
-   * @param max_sketch_size_kb Upper bound sketch size in KB
+   * @param sketch_size_kb Upper bound sketch size in KB
    *
    * @return The number of bytes required for the sketch
    */
   [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes(
-    cuco::sketch_size_kb max_sketch_size_kb) noexcept
+    cuco::sketch_size_kb sketch_size_kb) noexcept
   {
-    return cuda::std::bit_floor(static_cast<std::size_t>(max_sketch_size_kb * 1024));
+    return cuda::std::bit_floor(static_cast<std::size_t>(sketch_size_kb * 1024));
   }
 
   /**
diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh
index 9860d7246..863a23a83 100644
--- a/include/cuco/distinct_count_estimator.cuh
+++ b/include/cuco/distinct_count_estimator.cuh
@@ -62,15 +62,15 @@ class distinct_count_estimator {
    *
    * @note This function synchronizes the given stream.
    *
-   * @param max_sketch_size_kb Maximum sketch size in KB
+   * @param sketch_size_kb Maximum sketch size in KB
    * @param hash The hash function used to hash items
    * @param alloc Allocator used for allocating device storage
    * @param stream CUDA stream used to initialize the object
    */
-  constexpr distinct_count_estimator(cuco::sketch_size_kb max_sketch_size_kb = 32_KB,
-                                     Hash const& hash                        = {},
-                                     Allocator const& alloc                  = {},
-                                     cuco::cuda_stream_ref stream            = {});
+  constexpr distinct_count_estimator(cuco::sketch_size_kb sketch_size_kb = 32_KB,
+                                     Hash const& hash                    = {},
+                                     Allocator const& alloc              = {},
+                                     cuco::cuda_stream_ref stream        = {});
 
   ~distinct_count_estimator() = default;
 
@@ -230,12 +230,12 @@ class distinct_count_estimator {
   /**
    * @brief Gets the number of bytes required for the sketch storage.
    *
-   * @param max_sketch_size_kb Upper bound sketch size in KB
+   * @param sketch_size_kb Upper bound sketch size in KB
    *
    * @return The number of bytes required for the sketch
    */
   [[nodiscard]] static constexpr std::size_t sketch_bytes(
-    cuco::sketch_size_kb max_sketch_size_kb) noexcept;
+    cuco::sketch_size_kb sketch_size_kb) noexcept;
 
   /**
    * @brief Gets the alignment required for the sketch storage.
diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh
index feb2ac6d4..25f1834e3 100644
--- a/include/cuco/distinct_count_estimator_ref.cuh
+++ b/include/cuco/distinct_count_estimator_ref.cuh
@@ -211,12 +211,12 @@ class distinct_count_estimator_ref {
   /**
    * @brief Gets the number of bytes required for the sketch storage.
    *
-   * @param max_sketch_size_kb Upper bound sketch size in KB
+   * @param sketch_size_kb Upper bound sketch size in KB
    *
    * @return The number of bytes required for the sketch
    */
   [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes(
-    cuco::sketch_size_kb max_sketch_size_kb) noexcept;
+    cuco::sketch_size_kb sketch_size_kb) noexcept;
 
   /**
    * @brief Gets the alignment required for the sketch storage.

From aeaecf405c1bc475b13639b72968c586bddcbde5 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 20 Mar 2024 15:32:49 +0000
Subject: [PATCH 44/78] Improve error handling and docs

---
 .../distinct_count_estimator.inl              |  4 +-
 .../distinct_count_estimator_ref.inl          |  5 +--
 .../cuco/detail/hyperloglog/hyperloglog.cuh   | 15 ++++++-
 .../detail/hyperloglog/hyperloglog_ref.cuh    | 39 ++++++++++++++++---
 include/cuco/distinct_count_estimator.cuh     | 15 ++++++-
 include/cuco/distinct_count_estimator_ref.cuh | 14 +++++--
 6 files changed, 74 insertions(+), 18 deletions(-)

diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
index 21128de9e..9ea4816d5 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
@@ -60,7 +60,7 @@ template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <cuda::thread_scope OtherScope, class OtherAllocator>
 void distinct_count_estimator<T, Scope, Hash, Allocator>::merge_async(
   distinct_count_estimator<T, OtherScope, Hash, OtherAllocator> const& other,
-  cuco::cuda_stream_ref stream) noexcept
+  cuco::cuda_stream_ref stream)
 {
   this->impl_->merge_async(other, stream);
 }
@@ -77,7 +77,7 @@ void distinct_count_estimator<T, Scope, Hash, Allocator>::merge(
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <cuda::thread_scope OtherScope>
 void distinct_count_estimator<T, Scope, Hash, Allocator>::merge_async(
-  ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream) noexcept
+  ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream)
 {
   this->impl_->merge_async(other, stream);
 }
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
index 53beb5016..3be39ac44 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
@@ -71,7 +71,7 @@ __host__ void distinct_count_estimator_ref<T, Scope, Hash>::add(InputIt first,
 template <class T, cuda::thread_scope Scope, class Hash>
 template <class CG, cuda::thread_scope OtherScope>
 __device__ void distinct_count_estimator_ref<T, Scope, Hash>::merge(
-  CG const& group, distinct_count_estimator_ref<T, OtherScope, Hash> const& other) noexcept
+  CG const& group, distinct_count_estimator_ref<T, OtherScope, Hash> const& other)
 {
   this->impl_.merge(group, other.impl_);
 }
@@ -79,8 +79,7 @@ __device__ void distinct_count_estimator_ref<T, Scope, Hash>::merge(
 template <class T, cuda::thread_scope Scope, class Hash>
 template <cuda::thread_scope OtherScope>
 __host__ void distinct_count_estimator_ref<T, Scope, Hash>::merge_async(
-  distinct_count_estimator_ref<T, OtherScope, Hash> const& other,
-  cuco::cuda_stream_ref stream) noexcept
+  distinct_count_estimator_ref<T, OtherScope, Hash> const& other, cuco::cuda_stream_ref stream)
 {
   this->impl_.merge_async(other, stream);
 }
diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index 5c0d07833..23079e4db 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -59,6 +59,9 @@ class hyperloglog {
    *
    * @note This function synchronizes the given stream.
    *
+   * @throw If sketch size < 0.0625KB or 64B
+   * @throw If sketch storage has insufficient alignment
+   *
    * @param sketch_size_kb Maximum sketch size in KB
    * @param hash The hash function used to hash items
    * @param alloc Allocator used for allocating device storage
@@ -146,6 +149,8 @@ class hyperloglog {
   /**
    * @brief Asynchronously merges the result of `other` estimator into `*this` estimator.
    *
+   * @throw If this->sketch_bytes() != other.sketch_bytes()
+   *
    * @tparam OtherScope Thread scope of `other` estimator
    * @tparam OtherAllocator Allocator type of `other` estimator
    *
@@ -154,7 +159,7 @@ class hyperloglog {
    */
   template <cuda::thread_scope OtherScope, class OtherAllocator>
   void merge_async(hyperloglog<T, OtherScope, Hash, OtherAllocator> const& other,
-                   cuco::cuda_stream_ref stream) noexcept
+                   cuco::cuda_stream_ref stream)
   {
     this->ref_.merge_async(other.ref(), stream);
   }
@@ -165,6 +170,8 @@ class hyperloglog {
    * @note This function synchronizes the given stream. For asynchronous execution use
    * `merge_async`.
    *
+   * @throw If this->sketch_bytes() != other.sketch_bytes()
+   *
    * @tparam OtherScope Thread scope of `other` estimator
    * @tparam OtherAllocator Allocator type of `other` estimator
    *
@@ -181,13 +188,15 @@ class hyperloglog {
   /**
    * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator.
    *
+   * @throw If this->sketch_bytes() != other.sketch_bytes()
+   *
    * @tparam OtherScope Thread scope of `other` estimator
    *
    * @param other Other estimator reference to be merged into `*this`
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  void merge_async(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream) noexcept
+  void merge_async(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream)
   {
     this->ref_.merge_async(other, stream);
   }
@@ -198,6 +207,8 @@ class hyperloglog {
    * @note This function synchronizes the given stream. For asynchronous execution use
    * `merge_async`.
    *
+   * @throw If this->sketch_bytes() != other.sketch_bytes()
+   *
    * @tparam OtherScope Thread scope of `other` estimator
    *
    * @param other Other estimator reference to be merged into `*this`
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index 06adaf78b..87d49d245 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -70,6 +70,9 @@ class hyperloglog_ref {
   /**
    * @brief Constructs a non-owning `hyperloglog_ref` object.
    *
+   * @throw If sketch size < 0.0625KB or 64B
+   * @throw If sketch storage has insufficient alignment
+   *
    * @param sketch_span Reference to sketch storage
    * @param hash The hash function used to hash items
    */
@@ -83,7 +86,24 @@ class hyperloglog_ref {
       sketch_{reinterpret_cast<register_type*>(sketch_span.data()),
               this->sketch_bytes() / sizeof(register_type)}
   {
-    // TODO check size and alignment
+    auto const alignment =
+      1ull << cuda::std::countr_zero(reinterpret_cast<cuda::std::uintptr_t>(sketch_span.data()));
+
+    if (alignment < sketch_alignment()) {
+#ifdef __CUDA_ARCH__
+      __trap();
+#else
+      CUCO_FAIL("Insufficient sketch alignment", std::runtime_error);
+#endif
+    }
+
+    if (this->precision_ < 4) {
+#ifdef __CUDA_ARCH__
+      __trap();
+#else
+      CUCO_FAIL("Minimum required sketch size is 0.0625KB or 64B", std::runtime_error);
+#endif
+    }
   }
 
   /**
@@ -252,6 +272,8 @@ class hyperloglog_ref {
   /**
    * @brief Merges the result of `other` estimator reference into `*this` estimator reference.
    *
+   * @throw If this->sketch_bytes() != other.sketch_bytes()
+   *
    * @tparam CG CUDA Cooperative Group type
    * @tparam OtherScope Thread scope of `other` estimator
    *
@@ -259,11 +281,9 @@ class hyperloglog_ref {
    * @param other Other estimator reference to be merged into `*this`
    */
   template <class CG, cuda::thread_scope OtherScope>
-  __device__ void merge(CG const& group, hyperloglog_ref<T, OtherScope, Hash> const& other) noexcept
+  __device__ void merge(CG const& group, hyperloglog_ref<T, OtherScope, Hash> const& other)
   {
-    if (other.precision_ != this->precision_) {
-      __trap();  // TODO check if this hurts performance
-    }
+    if (other.precision_ != this->precision_) { __trap(); }
 
     for (int i = group.thread_rank(); i < this->sketch_.size(); i += group.size()) {
       this->update_max(i, other.sketch_[i]);
@@ -274,6 +294,8 @@ class hyperloglog_ref {
    * @brief Asynchronously merges the result of `other` estimator reference into `*this`
    * estimator.
    *
+   * @throw If this->sketch_bytes() != other.sketch_bytes()
+   *
    * @tparam OtherScope Thread scope of `other` estimator
    *
    * @param other Other estimator reference to be merged into `*this`
@@ -281,8 +303,11 @@ class hyperloglog_ref {
    */
   template <cuda::thread_scope OtherScope>
   __host__ void merge_async(hyperloglog_ref<T, OtherScope, Hash> const& other,
-                            cuco::cuda_stream_ref stream) noexcept
+                            cuco::cuda_stream_ref stream)
   {
+    CUCO_EXPECTS(other.precision_ == this->precision_,
+                 "Cannot merge estimators with different sketch sizes",
+                 std::runtime_error);
     auto constexpr block_size = 1024;
     cuco::hyperloglog_ns::detail::merge<<<1, block_size, 0, stream>>>(other, *this);
   }
@@ -293,6 +318,8 @@ class hyperloglog_ref {
    * @note This function synchronizes the given stream. For asynchronous execution use
    * `merge_async`.
    *
+   * @throw If this->sketch_bytes() != other.sketch_bytes()
+   *
    * @tparam OtherScope Thread scope of `other` estimator
    *
    * @param other Other estimator reference to be merged into `*this`
diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh
index 863a23a83..3cd22b469 100644
--- a/include/cuco/distinct_count_estimator.cuh
+++ b/include/cuco/distinct_count_estimator.cuh
@@ -62,6 +62,9 @@ class distinct_count_estimator {
    *
    * @note This function synchronizes the given stream.
    *
+   * @throw If sketch size < 0.0625KB or 64B
+   * @throw If sketch storage has insufficient alignment
+   *
    * @param sketch_size_kb Maximum sketch size in KB
    * @param hash The hash function used to hash items
    * @param alloc Allocator used for allocating device storage
@@ -137,6 +140,8 @@ class distinct_count_estimator {
   /**
    * @brief Asynchronously merges the result of `other` estimator into `*this` estimator.
    *
+   * @throw If this->sketch_bytes() != other.sketch_bytes()
+   *
    * @tparam OtherScope Thread scope of `other` estimator
    * @tparam OtherAllocator Allocator type of `other` estimator
    *
@@ -145,7 +150,7 @@ class distinct_count_estimator {
    */
   template <cuda::thread_scope OtherScope, class OtherAllocator>
   void merge_async(distinct_count_estimator<T, OtherScope, Hash, OtherAllocator> const& other,
-                   cuco::cuda_stream_ref stream = {}) noexcept;
+                   cuco::cuda_stream_ref stream = {});
 
   /**
    * @brief Merges the result of `other` estimator into `*this` estimator.
@@ -153,6 +158,8 @@ class distinct_count_estimator {
    * @note This function synchronizes the given stream. For asynchronous execution use
    * `merge_async`.
    *
+   * @throw If this->sketch_bytes() != other.sketch_bytes()
+   *
    * @tparam OtherScope Thread scope of `other` estimator
    * @tparam OtherAllocator Allocator type of `other` estimator
    *
@@ -166,13 +173,15 @@ class distinct_count_estimator {
   /**
    * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator.
    *
+   * @throw If this->sketch_bytes() != other.sketch_bytes()
+   *
    * @tparam OtherScope Thread scope of `other` estimator
    *
    * @param other Other estimator reference to be merged into `*this`
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  void merge_async(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream = {}) noexcept;
+  void merge_async(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream = {});
 
   /**
    * @brief Merges the result of `other` estimator reference into `*this` estimator.
@@ -180,6 +189,8 @@ class distinct_count_estimator {
    * @note This function synchronizes the given stream. For asynchronous execution use
    * `merge_async`.
    *
+   * @throw If this->sketch_bytes() != other.sketch_bytes()
+   *
    * @tparam OtherScope Thread scope of `other` estimator
    *
    * @param other Other estimator reference to be merged into `*this`
diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh
index 25f1834e3..a0806cccf 100644
--- a/include/cuco/distinct_count_estimator_ref.cuh
+++ b/include/cuco/distinct_count_estimator_ref.cuh
@@ -51,10 +51,12 @@ class distinct_count_estimator_ref {
   using with_scope = distinct_count_estimator_ref<T, NewScope, Hash>;  ///< Ref type with different
                                                                        ///< thread scope
 
-  // TODO let storage_type be inferred?
   /**
    * @brief Constructs a non-owning `distinct_count_estimator_ref` object.
    *
+   * @throw If sketch size < 0.0625KB or 64B
+   * @throw If sketch storage has insufficient alignment
+   *
    * @param sketch_span Reference to sketch storage
    * @param hash The hash function used to hash items
    */
@@ -129,6 +131,8 @@ class distinct_count_estimator_ref {
   /**
    * @brief Merges the result of `other` estimator reference into `*this` estimator reference.
    *
+   * @throw If this->sketch_bytes() != other.sketch_bytes()
+   *
    * @tparam CG CUDA Cooperative Group type
    * @tparam OtherScope Thread scope of `other` estimator
    *
@@ -137,11 +141,13 @@ class distinct_count_estimator_ref {
    */
   template <class CG, cuda::thread_scope OtherScope>
   __device__ void merge(CG const& group,
-                        distinct_count_estimator_ref<T, OtherScope, Hash> const& other) noexcept;
+                        distinct_count_estimator_ref<T, OtherScope, Hash> const& other);
 
   /**
    * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator.
    *
+   * @throw If this->sketch_bytes() != other.sketch_bytes()
+   *
    * @tparam OtherScope Thread scope of `other` estimator
    *
    * @param other Other estimator reference to be merged into `*this`
@@ -149,11 +155,13 @@ class distinct_count_estimator_ref {
    */
   template <cuda::thread_scope OtherScope>
   __host__ void merge_async(distinct_count_estimator_ref<T, OtherScope, Hash> const& other,
-                            cuco::cuda_stream_ref stream = {}) noexcept;
+                            cuco::cuda_stream_ref stream = {});
 
   /**
    * @brief Merges the result of `other` estimator reference into `*this` estimator.
    *
+   * @throw If this->sketch_bytes() != other.sketch_bytes()
+   *
    * @note This function synchronizes the given stream. For asynchronous execution use
    * `merge_async`.
    *

From 55fa31205be9d968c86b514c101492cc65b5b724 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 20 Mar 2024 17:52:50 +0000
Subject: [PATCH 45/78] Cleanup finalizer

---
 include/cuco/detail/hyperloglog/finalizer.cuh | 30 ++++++++-----------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/finalizer.cuh b/include/cuco/detail/hyperloglog/finalizer.cuh
index f40a0e751..845453b0d 100644
--- a/include/cuco/detail/hyperloglog/finalizer.cuh
+++ b/include/cuco/detail/hyperloglog/finalizer.cuh
@@ -29,6 +29,7 @@ namespace cuco::hyperloglog_ns::detail {
  *
  * @note Variable names correspond to the definitions given in the HLL++ paper:
  * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf
+ * @note Precision must be >= 4.
  *
  */
 class finalizer {
@@ -39,13 +40,10 @@ class finalizer {
   /**
    * @brief Contructs an HLL finalizer object.
    *
-   * @throws Iff precision vale is not supported
-   *
    * @param precision HLL precision parameter
    */
   __host__ __device__ constexpr finalizer(int precision) : precision_{precision}, m_{1 << precision}
   {
-    // TODO check if precision >= 4
   }
 
   /**
@@ -58,7 +56,7 @@ class finalizer {
    */
   __host__ __device__ constexpr std::size_t operator()(double z, int v) const noexcept
   {
-    auto e = alpha_mm() / z;
+    double e = this->alpha_mm() / z;
 
     if (v > 0) {
       // Use linear counting for small cardinality estimates.
@@ -67,11 +65,11 @@ class finalizer {
       if (e <= 2.5 * this->m_) { return cuda::std::round(h); }
 
       if (this->precision_ < 19) {
-        e = (h <= threshold(this->precision_)) ? h : bias_corrected_estimate(e);
+        e = (h <= threshold(this->precision_)) ? h : this->bias_corrected_estimate(e);
       }
     } else {
       // HLL++ is defined only when p < 19, otherwise we need to fallback to HLL.
-      if (this->precision_ < 19) { e = bias_corrected_estimate(e); }
+      if (this->precision_ < 19) { e = this->bias_corrected_estimate(e); }
     }
 
     return cuda::std::round(e);
@@ -80,32 +78,30 @@ class finalizer {
  private:
   __host__ __device__ constexpr double alpha_mm() const noexcept
   {
-    if (this->m_ == 16) {
-      return 0.673 * this->m_ * this->m_;
-    } else if (this->m_ == 32) {
-      return 0.697 * this->m_ * this->m_;
-    } else if (this->m_ == 64) {
-      return 0.709 * this->m_ * this->m_;
-    } else {
-      return (0.7213 / (1.0 + 1.079 / this->m_)) * this->m_ * this->m_;
+    double const m2 = this->m_ * this->m_;
+    switch (this->m_) {
+      case 16: return 0.673 * m2;
+      case 32: return 0.697 * m2;
+      case 64: return 0.709 * m2;
+      default: return (0.7213 / (1.0 + 1.079 / this->m_)) * m2;
     }
   }
 
   __host__ __device__ constexpr double bias_corrected_estimate(double e) const noexcept
   {
-    return (e < 5.0 * this->m_) ? e - bias(e) : e;
+    return (e < 5.0 * this->m_) ? e - this->bias(e) : e;
   }
 
   __host__ __device__ constexpr double bias(double e) const noexcept
   {
-    auto const anchor_index = interpolation_anchor_index(e);
+    auto const anchor_index = this->interpolation_anchor_index(e);
     int const n             = raw_estimate_data_size(this->precision_);
 
     auto low  = cuda::std::max(anchor_index - k + 1, 0);
     auto high = cuda::std::min(low + k, n);
     // Keep moving bounds as long as the (exclusive) high bound is closer to the estimate than
     // the lower (inclusive) bound.
-    while (high < n and distance(e, high) < distance(e, low)) {
+    while (high < n and this->distance(e, high) < this->distance(e, low)) {
       low += 1;
       high += 1;
     }

From 2229c6844ec6a02097d2a84ca0444ae52f16af3a Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 20 Mar 2024 17:53:17 +0000
Subject: [PATCH 46/78] Use double reduction

---
 include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index 87d49d245..f601d0b3b 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -54,7 +54,7 @@ template <class T, cuda::thread_scope Scope, class Hash>
 class hyperloglog_ref {
   using register_type = int;  ///< Register array storage
   // We use `int` here since this is the smallest type that supports native `atomicMax` on GPUs
-  using fp_type = float;  ///< Floating point type used for reduction
+  using fp_type = double;  ///< Floating point type used for reduction
   using hash_value_type =
     decltype(cuda::std::declval<Hash>()(cuda::std::declval<T>()));  ///< Hash value type
  public:
@@ -374,6 +374,7 @@ class hyperloglog_ref {
     auto const warp_zeroes =
       cooperative_groups::reduce(warp, thread_zeroes, cooperative_groups::plus<int>());
     // TODO warp sync needed?
+    // TODO use invoke_one
     if (warp.thread_rank() == 0) {
       block_sum.fetch_add(warp_sum, cuda::std::memory_order_relaxed);
       block_zeroes.fetch_add(warp_zeroes, cuda::std::memory_order_relaxed);

From 156a843f12fe57215c336d0ebd8dee449b886663 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 20 Mar 2024 17:56:46 +0000
Subject: [PATCH 47/78] Use .estimate() in device ref example

---
 .../device_ref_example.cu                     | 45 +++++++++++++++++--
 1 file changed, 41 insertions(+), 4 deletions(-)

diff --git a/examples/distinct_count_estimator/device_ref_example.cu b/examples/distinct_count_estimator/device_ref_example.cu
index c8716e421..933726641 100644
--- a/examples/distinct_count_estimator/device_ref_example.cu
+++ b/examples/distinct_count_estimator/device_ref_example.cu
@@ -69,10 +69,10 @@ __global__ void piggyback_kernel(RefType ref, InputIt first, std::size_t n)
   block.sync();
 
   // We can also compute the local estimate on the device
-  auto const local_estimate = local_ref.estimate(block);
+  // auto const local_estimate = local_ref.estimate(block);
   if (block.thread_rank() == 0) {
     // The local estimate should approximately be `num_items`/`gridDim.x`
-    printf("Estimate for block %d = %llu\n", blockIdx.x, local_estimate);
+    // printf("Estimate for block %d = %llu\n", blockIdx.x, local_estimate);
   }
 
   // In the end, we merge the shared memory estimator into the global estimator which gives us the
@@ -80,10 +80,40 @@ __global__ void piggyback_kernel(RefType ref, InputIt first, std::size_t n)
   ref.merge(block, local_ref);
 }
 
+template <typename Ref, typename InputIt, typename OutputIt>
+__global__ void device_estimate_kernel(cuco::sketch_size_kb sketch_size_kb,
+                                       InputIt in,
+                                       size_t n,
+                                       OutputIt out)
+{
+  extern __shared__ std::byte local_sketch[];
+
+  auto const block = cooperative_groups::this_thread_block();
+
+  // only a single block computes the estimate
+  if (block.group_index().x == 0) {
+    Ref estimator(cuda::std::span(local_sketch, Ref::sketch_bytes(sketch_size_kb)));
+
+    estimator.clear(block);
+    block.sync();
+
+    for (int i = block.thread_rank(); i < n; i += block.num_threads()) {
+      estimator.add(*(in + i));
+    }
+    block.sync();
+    // we can compute the final estimate on the device and return the result to the host
+    auto const estimate = estimator.estimate(block);
+
+    if (block.thread_rank() == 0) { *out = estimate; }
+  }
+}
+
 int main(void)
 {
   using T                         = int;
+  using estimator_type            = cuco::distinct_count_estimator<T>;
   constexpr std::size_t num_items = 1ull << 28;  // 1GB
+  auto const sketch_size_kb       = 32_KB;
 
   thrust::device_vector<T> items(num_items);
 
@@ -91,7 +121,7 @@ int main(void)
   thrust::sequence(items.begin(), items.end(), 0);
 
   // Initialize the estimator
-  cuco::distinct_count_estimator<T> estimator;
+  estimator_type estimator(sketch_size_kb);
 
   // Add all items to the estimator
   estimator.add(items.begin(), items.end());
@@ -111,7 +141,14 @@ int main(void)
   // Calculate the cardinality estimate from the custom kernel
   std::size_t const estimated_cardinality_custom = estimator.estimate();
 
-  if (estimated_cardinality_bulk == estimated_cardinality_custom) {
+  thrust::device_vector<std::size_t> device_estimate(1);
+  device_estimate_kernel<typename estimator_type::ref_type<cuda::thread_scope_block>>
+    <<<1, 512, sketch_bytes>>>(sketch_size_kb, items.begin(), num_items, device_estimate.begin());
+
+  std::size_t const estimated_cardinality_device = device_estimate[0];
+
+  if (estimated_cardinality_custom == estimated_cardinality_bulk and
+      estimated_cardinality_device == estimated_cardinality_bulk) {
     std::cout << "Success! Cardinality estimates are identical" << std::endl;
   }
 

From 80dde95525a6a5fcfcd6348d262182762e532e25 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 20 Mar 2024 18:00:56 +0000
Subject: [PATCH 48/78] Add device ref test

---
 tests/CMakeLists.txt                          |  3 +-
 .../device_ref_test.cu                        | 94 +++++++++++++++++++
 2 files changed, 96 insertions(+), 1 deletion(-)
 create mode 100644 tests/distinct_count_estimator/device_ref_test.cu

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 496a014f3..5110c2cbd 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -118,4 +118,5 @@ ConfigureTest(DYNAMIC_BITSET_TEST
 ###################################################################################################
 # - distinct_count_estimator ----------------------------------------------------------------------
 ConfigureTest(DISTINCT_COUNT_ESTIMATOR_TEST
-    distinct_count_estimator/unique_sequence_test.cu)
+    distinct_count_estimator/unique_sequence_test.cu
+    distinct_count_estimator/device_ref_test.cu)
diff --git a/tests/distinct_count_estimator/device_ref_test.cu b/tests/distinct_count_estimator/device_ref_test.cu
new file mode 100644
index 000000000..33fe8993f
--- /dev/null
+++ b/tests/distinct_count_estimator/device_ref_test.cu
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <utils.hpp>
+
+#include <cuco/distinct_count_estimator.cuh>
+#include <cuco/hash_functions.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/sequence.h>
+
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/generators/catch_generators.hpp>
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+
+template <typename Ref, typename InputIt, typename OutputIt>
+__global__ void estimate_kernel(cuco::sketch_size_kb sketch_size_kb,
+                                InputIt in,
+                                size_t n,
+                                OutputIt out)
+{
+  extern __shared__ std::byte local_sketch[];
+
+  auto const block = cooperative_groups::this_thread_block();
+
+  // only a single block computes the estimate
+  if (block.group_index().x == 0) {
+    Ref estimator(cuda::std::span(local_sketch, Ref::sketch_bytes(sketch_size_kb)));
+
+    estimator.clear(block);
+    block.sync();
+
+    for (int i = block.thread_rank(); i < n; i += block.num_threads()) {
+      estimator.add(*(in + i));
+    }
+    block.sync();
+    auto const estimate = estimator.estimate(block);
+    if (block.thread_rank() == 0) { *out = estimate; }
+  }
+}
+
+TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: device ref",
+                       "",
+                       ((typename T, typename Hash), T, Hash),
+                       (int32_t, cuco::xxhash_64<int32_t>),
+                       (int64_t, cuco::xxhash_64<int64_t>),
+                       (__int128_t, cuco::xxhash_64<__int128_t>))
+{
+  using estimator_type = cuco::distinct_count_estimator<T, cuda::thread_scope_device, Hash>;
+
+  auto num_items_pow2 = GENERATE(25, 26, 28);
+  auto hll_precision  = GENERATE(8, 10, 12, 13);
+  auto sketch_size_kb = 4 * (1ull << hll_precision) / 1024;
+  INFO("hll_precision=" << hll_precision);
+  INFO("sketch_size_kb=" << sketch_size_kb);
+  INFO("num_items=2^" << num_items_pow2);
+  auto num_items = 1ull << num_items_pow2;
+
+  thrust::device_vector<T> items(num_items);
+
+  // Generate `num_items` distinct items
+  thrust::sequence(items.begin(), items.end(), 0);
+
+  // Initialize the estimator
+  estimator_type estimator{cuco::sketch_size_kb(sketch_size_kb)};
+
+  // Add all items to the estimator
+  estimator.add(items.begin(), items.end());
+
+  auto const host_estimate = estimator.estimate();
+
+  thrust::device_vector<std::size_t> device_estimate(1);
+  estimate_kernel<typename estimator_type::ref_type<cuda::thread_scope_block>>
+    <<<1, 512, estimator.sketch_bytes()>>>(
+      cuco::sketch_size_kb(sketch_size_kb), items.begin(), num_items, device_estimate.begin());
+
+  REQUIRE(device_estimate[0] == host_estimate);
+}

From 730bf73c1fbb2d3dc265d5d1f7698d12ae839c05 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 20 Mar 2024 22:09:24 +0000
Subject: [PATCH 49/78] Restructure to reduce fp error

---
 include/cuco/detail/hyperloglog/finalizer.cuh | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/finalizer.cuh b/include/cuco/detail/hyperloglog/finalizer.cuh
index 845453b0d..705464ac0 100644
--- a/include/cuco/detail/hyperloglog/finalizer.cuh
+++ b/include/cuco/detail/hyperloglog/finalizer.cuh
@@ -78,12 +78,11 @@ class finalizer {
  private:
   __host__ __device__ constexpr double alpha_mm() const noexcept
   {
-    double const m2 = this->m_ * this->m_;
     switch (this->m_) {
-      case 16: return 0.673 * m2;
-      case 32: return 0.697 * m2;
-      case 64: return 0.709 * m2;
-      default: return (0.7213 / (1.0 + 1.079 / this->m_)) * m2;
+      case 16: return 0.673 * this->m_ * this->m_;
+      case 32: return 0.697 * this->m_ * this->m_;
+      case 64: return 0.709 * this->m_ * this->m_;
+      default: return (0.7213 / (1.0 + 1.079 / this->m_)) * this->m_ * this->m_;
     }
   }
 

From c50e7954662b118b57f8a8df2d18272b1549ec57 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 20 Mar 2024 22:16:44 +0000
Subject: [PATCH 50/78] Rename parameter for other estimator ref

---
 .../distinct_count_estimator.inl                     | 10 +++++-----
 include/cuco/detail/hyperloglog/hyperloglog.cuh      | 12 ++++++------
 include/cuco/distinct_count_estimator.cuh            |  8 ++++----
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
index 9ea4816d5..5b105af5c 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
@@ -77,17 +77,17 @@ void distinct_count_estimator<T, Scope, Hash, Allocator>::merge(
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <cuda::thread_scope OtherScope>
 void distinct_count_estimator<T, Scope, Hash, Allocator>::merge_async(
-  ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream)
+  ref_type<OtherScope> const& other_ref, cuco::cuda_stream_ref stream)
 {
-  this->impl_->merge_async(other, stream);
+  this->impl_->merge_async(other_ref, stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <cuda::thread_scope OtherScope>
-void distinct_count_estimator<T, Scope, Hash, Allocator>::merge(ref_type<OtherScope> const& other,
-                                                                cuco::cuda_stream_ref stream)
+void distinct_count_estimator<T, Scope, Hash, Allocator>::merge(
+  ref_type<OtherScope> const& other_ref, cuco::cuda_stream_ref stream)
 {
-  this->impl_->merge(other, stream);
+  this->impl_->merge(other_ref, stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index 23079e4db..b07b5e83b 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -192,13 +192,13 @@ class hyperloglog {
    *
    * @tparam OtherScope Thread scope of `other` estimator
    *
-   * @param other Other estimator reference to be merged into `*this`
+   * @param other_ref Other estimator reference to be merged into `*this`
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  void merge_async(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream)
+  void merge_async(ref_type<OtherScope> const& other_ref, cuco::cuda_stream_ref stream)
   {
-    this->ref_.merge_async(other, stream);
+    this->ref_.merge_async(other_ref, stream);
   }
 
   /**
@@ -211,13 +211,13 @@ class hyperloglog {
    *
    * @tparam OtherScope Thread scope of `other` estimator
    *
-   * @param other Other estimator reference to be merged into `*this`
+   * @param other_ref Other estimator reference to be merged into `*this`
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  void merge(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream)
+  void merge(ref_type<OtherScope> const& other_ref, cuco::cuda_stream_ref stream)
   {
-    this->ref_.merge(other, stream);
+    this->ref_.merge(other_ref, stream);
   }
 
   /**
diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh
index 3cd22b469..8e4cc097a 100644
--- a/include/cuco/distinct_count_estimator.cuh
+++ b/include/cuco/distinct_count_estimator.cuh
@@ -177,11 +177,11 @@ class distinct_count_estimator {
    *
    * @tparam OtherScope Thread scope of `other` estimator
    *
-   * @param other Other estimator reference to be merged into `*this`
+   * @param other_ref Other estimator reference to be merged into `*this`
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  void merge_async(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream = {});
+  void merge_async(ref_type<OtherScope> const& other_ref, cuco::cuda_stream_ref stream = {});
 
   /**
    * @brief Merges the result of `other` estimator reference into `*this` estimator.
@@ -193,11 +193,11 @@ class distinct_count_estimator {
    *
    * @tparam OtherScope Thread scope of `other` estimator
    *
-   * @param other Other estimator reference to be merged into `*this`
+   * @param other_ref Other estimator reference to be merged into `*this`
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  void merge(ref_type<OtherScope> const& other, cuco::cuda_stream_ref stream = {});
+  void merge(ref_type<OtherScope> const& other_ref, cuco::cuda_stream_ref stream = {});
 
   /**
    * @brief Compute the estimated distinct items count.

From d5595dae09bda28954dd3c0d3bf7eb124cf3f052 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 20 Mar 2024 22:28:40 +0000
Subject: [PATCH 51/78] Update benchmark

---
 benchmarks/distinct_count_estimator_bench.cu | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu
index c5ae8a6b3..f071fbf6b 100644
--- a/benchmarks/distinct_count_estimator_bench.cu
+++ b/benchmarks/distinct_count_estimator_bench.cu
@@ -49,7 +49,7 @@ template <typename InputIt>
 }
 
 template <class Estimator, class Dist>
-[[nodiscard]] double relative_error(nvbench::state& state, std::size_t num_samples = 5)
+[[nodiscard]] double relative_error(nvbench::state& state, std::size_t num_samples)
 {
   using T = typename Estimator::value_type;
 
@@ -87,8 +87,9 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list<Esti
   state.add_element_count(num_items);
   state.add_global_memory_reads<T>(num_items, "InputSize");
 
-  auto const err = relative_error<Estimator, Dist>(state);
-  auto& summ     = state.add_summary("MeanRelativeError");
+  auto const err_samples = (cuda::std::is_same_v<Dist, distribution::unique>) ? 1 : 5;
+  auto const err         = relative_error<Estimator, Dist>(state, err_samples);
+  auto& summ             = state.add_summary("MeanRelativeError");
   summ.set_string("hint", "MRelErr");
   summ.set_string("short_name", "MeanRelativeError");
   summ.set_string("description", "Mean relatve approximation error.");
@@ -146,17 +147,19 @@ using ESTIMATOR_RANGE = nvbench::type_list<cuco::distinct_count_estimator<nvbenc
                                            cuco::distinct_count_estimator<__int128_t>>;
 
 NVBENCH_BENCH_TYPES(distinct_count_estimator_e2e,
-                    NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list<distribution::unique>))
+                    NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list<distribution::uniform>))
   .set_name("distinct_count_estimator_e2e")
   .set_type_axes_names({"Estimator", "Distribution"})
   .add_int64_power_of_two_axis("NumInputs", {28, 29, 30})
-  .add_int64_axis("SketchSizeKB", {8, 16, 32})
+  .add_int64_axis("SketchSizeKB", {8, 16, 32, 64, 128, 256})  // 256KB uses gmem fallback kernel
+  .add_int64_axis("Multiplicity", {1})
   .set_max_noise(defaults::MAX_NOISE);
 
 NVBENCH_BENCH_TYPES(distinct_count_estimator_add,
-                    NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list<distribution::unique>))
+                    NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list<distribution::uniform>))
   .set_name("distinct_count_estimator::add_async")
   .set_type_axes_names({"Estimator", "Distribution"})
   .add_int64_power_of_two_axis("NumInputs", {28, 29, 30})
-  .add_int64_axis("SketchSizeKB", {8, 16, 32, 256})  // 256KB uses gmem fallback kernel
+  .add_int64_axis("SketchSizeKB", {8, 16, 32, 64, 128, 256})
+  .add_int64_axis("Multiplicity", {1})
   .set_max_noise(defaults::MAX_NOISE);
\ No newline at end of file

From 16ad77a5efbb66bfa69d9d37dc6011a196436d6e Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 20 Mar 2024 22:43:39 +0000
Subject: [PATCH 52/78] Rebind allocator to register_type to ensure proper
 alignment

---
 .../cuco/detail/hyperloglog/hyperloglog.cuh   | 20 +++++++++++--------
 .../detail/hyperloglog/hyperloglog_ref.cuh    |  6 +++---
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index b07b5e83b..3c962536b 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -48,11 +48,12 @@ class hyperloglog {
   using ref_type = hyperloglog_ref<T, NewScope, Hash>;  ///< Non-owning reference
                                                         ///< type
 
-  using value_type = typename ref_type<>::value_type;  ///< Type of items to count
-  using hash_type  = typename ref_type<>::hash_type;   ///< Hash function type
+  using value_type    = typename ref_type<>::value_type;     ///< Type of items to count
+  using hash_type     = typename ref_type<>::hash_type;      ///< Hash function type
+  using register_type = typename ref_type<>::register_type;  ///< HLL register type
   using allocator_type =
-    typename std::allocator_traits<Allocator>::template rebind_alloc<std::byte>;  ///< Allocator
-                                                                                  ///< type
+    typename std::allocator_traits<Allocator>::template rebind_alloc<register_type>;  ///< Allocator
+                                                                                      ///< type
 
   /**
    * @brief Constructs a `hyperloglog` host object.
@@ -72,9 +73,12 @@ class hyperloglog {
                         Allocator const& alloc,
                         cuco::cuda_stream_ref stream)
     : allocator_{alloc},
-      deleter_{sketch_bytes(sketch_size_kb), this->allocator_},
-      sketch_{this->allocator_.allocate(sketch_bytes(sketch_size_kb)), this->deleter_},
-      ref_{cuda::std::span{this->sketch_.get(), sketch_bytes(sketch_size_kb)}, hash}
+      deleter_{sketch_bytes(sketch_size_kb) / sizeof(register_type), this->allocator_},
+      sketch_{this->allocator_.allocate(sketch_bytes(sketch_size_kb) / sizeof(register_type)),
+              this->deleter_},
+      ref_{cuda::std::span{reinterpret_cast<std::byte*>(this->sketch_.get()),
+                           sketch_bytes(sketch_size_kb)},
+           hash}
   {
     this->ref_.clear_async(stream);
   }
@@ -291,7 +295,7 @@ class hyperloglog {
  private:
   allocator_type allocator_;                             ///< Storage allocator
   custom_deleter<std::size_t, allocator_type> deleter_;  ///< Storage deleter
-  std::unique_ptr<std::byte, custom_deleter<std::size_t, allocator_type>>
+  std::unique_ptr<register_type, custom_deleter<std::size_t, allocator_type>>
     sketch_;        ///< Sketch storage
   ref_type<> ref_;  //< Ref type
 
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index f601d0b3b..b85b3b5eb 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -52,7 +52,6 @@ namespace cuco::detail {
  */
 template <class T, cuda::thread_scope Scope, class Hash>
 class hyperloglog_ref {
-  using register_type = int;  ///< Register array storage
   // We use `int` here since this is the smallest type that supports native `atomicMax` on GPUs
   using fp_type = double;  ///< Floating point type used for reduction
   using hash_value_type =
@@ -60,8 +59,9 @@ class hyperloglog_ref {
  public:
   static constexpr auto thread_scope = Scope;  ///< CUDA thread scope
 
-  using value_type = T;     ///< Type of items to count
-  using hash_type  = Hash;  ///< Hash function type
+  using value_type    = T;     ///< Type of items to count
+  using hash_type     = Hash;  ///< Hash function type
+  using register_type = int;   ///< HLL register type
 
   template <cuda::thread_scope NewScope>
   using with_scope = hyperloglog_ref<T, NewScope, Hash>;  ///< Ref type with different

From b501a32b95488c291c5562dbdb23a50026cd5d02 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 20 Mar 2024 22:45:27 +0000
Subject: [PATCH 53/78] Use cudaMemcpyDefault

---
 include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index b85b3b5eb..c84f0c76d 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -411,7 +411,7 @@ class hyperloglog_ref {
     CUCO_CUDA_TRY(cudaMemcpyAsync(host_sketch.data(),
                                   this->sketch_.data(),
                                   sizeof(register_type) * num_regs,
-                                  cudaMemcpyDeviceToHost,
+                                  cudaMemcpyDefault,
                                   stream));
     stream.synchronize();
 

From 0bf0a88104061c1c68a84abbe6bcc4cdcce6ab47 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 20 Mar 2024 22:53:49 +0000
Subject: [PATCH 54/78] Mention alignment requirements in device_ref_example

---
 examples/distinct_count_estimator/device_ref_example.cu | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/examples/distinct_count_estimator/device_ref_example.cu b/examples/distinct_count_estimator/device_ref_example.cu
index 933726641..92c5169d9 100644
--- a/examples/distinct_count_estimator/device_ref_example.cu
+++ b/examples/distinct_count_estimator/device_ref_example.cu
@@ -39,6 +39,14 @@ __global__ void piggyback_kernel(RefType ref, InputIt first, std::size_t n)
   // Shared memory storage for the block-local estimator
   extern __shared__ std::byte local_sketch[];
 
+  // The following check is optional since the base address of dynamic shared memory is guaranteed
+  // to meet the alignment requirements
+  /*
+  auto const alignment =
+    1ull << cuda::std::countr_zero(reinterpret_cast<std::uintptr_t>(local_sketch));
+  assert(alignment >= local_ref_type::sketch_alignment());
+  */
+
   auto const loop_stride = gridDim.x * blockDim.x;
   auto idx               = blockDim.x * blockIdx.x + threadIdx.x;
   auto const block       = cooperative_groups::this_thread_block();

From a36136054b9771b39ff3cffdc015c3490aade6e1 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 20 Mar 2024 23:47:18 +0000
Subject: [PATCH 55/78] Pass T instead of Estimator to benchmark

---
 benchmarks/distinct_count_estimator_bench.cu | 26 +++++++++-----------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu
index f071fbf6b..272cfea88 100644
--- a/benchmarks/distinct_count_estimator_bench.cu
+++ b/benchmarks/distinct_count_estimator_bench.cu
@@ -76,10 +76,10 @@ template <class Estimator, class Dist>
 /**
  * @brief A benchmark evaluating `cuco::distinct_count_estimator` end-to-end performance
  */
-template <typename Estimator, typename Dist>
-void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list<Estimator, Dist>)
+template <typename T, typename Dist>
+void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list<T, Dist>)
 {
-  using T = typename Estimator::value_type;
+  using estimator_type = cuco::distinct_count_estimator<T>;
 
   auto const num_items      = state.get_int64("NumInputs");
   auto const sketch_size_kb = state.get_int64("SketchSizeKB");
@@ -88,7 +88,7 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list<Esti
   state.add_global_memory_reads<T>(num_items, "InputSize");
 
   auto const err_samples = (cuda::std::is_same_v<Dist, distribution::unique>) ? 1 : 5;
-  auto const err         = relative_error<Estimator, Dist>(state, err_samples);
+  auto const err         = relative_error<estimator_type, Dist>(state, err_samples);
   auto& summ             = state.add_summary("MeanRelativeError");
   summ.set_string("hint", "MRelErr");
   summ.set_string("short_name", "MeanRelativeError");
@@ -100,7 +100,7 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list<Esti
   key_generator gen;
   gen.generate(dist_from_state<Dist>(state), items.begin(), items.end());
 
-  Estimator estimator{cuco::sketch_size_kb(sketch_size_kb)};
+  estimator_type estimator{cuco::sketch_size_kb(sketch_size_kb)};
   std::size_t estimated_cardinality = 0;
   state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
              [&](nvbench::launch& launch, auto& timer) {
@@ -116,10 +116,10 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list<Esti
 /**
  * @brief A benchmark evaluating `cuco::distinct_count_estimator::add` performance
  */
-template <typename Estimator, typename Dist>
-void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list<Estimator, Dist>)
+template <typename T, typename Dist>
+void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list<T, Dist>)
 {
-  using T = typename Estimator::value_type;
+  using estimator_type = cuco::distinct_count_estimator<T>;
 
   auto const num_items      = state.get_int64("NumInputs");
   auto const sketch_size_kb = state.get_int64("SketchSizeKB");
@@ -132,7 +132,7 @@ void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list<Esti
   state.add_element_count(num_items);
   state.add_global_memory_reads<T>(num_items, "InputSize");
 
-  Estimator estimator{cuco::sketch_size_kb(sketch_size_kb)};
+  estimator_type estimator{cuco::sketch_size_kb(sketch_size_kb)};
   state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
     timer.start();
     estimator.add_async(items.begin(), items.end(), {launch.get_stream()});
@@ -142,12 +142,10 @@ void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list<Esti
   });
 }
 
-using ESTIMATOR_RANGE = nvbench::type_list<cuco::distinct_count_estimator<nvbench::int32_t>,
-                                           cuco::distinct_count_estimator<nvbench::int64_t>,
-                                           cuco::distinct_count_estimator<__int128_t>>;
+using TYPE_RANGE = nvbench::type_list<nvbench::int32_t, nvbench::int64_t, __int128_t>;
 
 NVBENCH_BENCH_TYPES(distinct_count_estimator_e2e,
-                    NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list<distribution::uniform>))
+                    NVBENCH_TYPE_AXES(TYPE_RANGE, nvbench::type_list<distribution::uniform>))
   .set_name("distinct_count_estimator_e2e")
   .set_type_axes_names({"Estimator", "Distribution"})
   .add_int64_power_of_two_axis("NumInputs", {28, 29, 30})
@@ -156,7 +154,7 @@ NVBENCH_BENCH_TYPES(distinct_count_estimator_e2e,
   .set_max_noise(defaults::MAX_NOISE);
 
 NVBENCH_BENCH_TYPES(distinct_count_estimator_add,
-                    NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list<distribution::uniform>))
+                    NVBENCH_TYPE_AXES(TYPE_RANGE, nvbench::type_list<distribution::uniform>))
   .set_name("distinct_count_estimator::add_async")
   .set_type_axes_names({"Estimator", "Distribution"})
   .add_int64_power_of_two_axis("NumInputs", {28, 29, 30})

From 66870a7fa6bb97799807c2aa3567a85eb7152ddc Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 20 Mar 2024 23:51:51 +0000
Subject: [PATCH 56/78] Fix typo in benchmark script

---
 benchmarks/distinct_count_estimator_bench.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu
index 272cfea88..76a664eaa 100644
--- a/benchmarks/distinct_count_estimator_bench.cu
+++ b/benchmarks/distinct_count_estimator_bench.cu
@@ -147,7 +147,7 @@ using TYPE_RANGE = nvbench::type_list<nvbench::int32_t, nvbench::int64_t, __int1
 NVBENCH_BENCH_TYPES(distinct_count_estimator_e2e,
                     NVBENCH_TYPE_AXES(TYPE_RANGE, nvbench::type_list<distribution::uniform>))
   .set_name("distinct_count_estimator_e2e")
-  .set_type_axes_names({"Estimator", "Distribution"})
+  .set_type_axes_names({"T", "Distribution"})
   .add_int64_power_of_two_axis("NumInputs", {28, 29, 30})
   .add_int64_axis("SketchSizeKB", {8, 16, 32, 64, 128, 256})  // 256KB uses gmem fallback kernel
   .add_int64_axis("Multiplicity", {1})
@@ -156,7 +156,7 @@ NVBENCH_BENCH_TYPES(distinct_count_estimator_e2e,
 NVBENCH_BENCH_TYPES(distinct_count_estimator_add,
                     NVBENCH_TYPE_AXES(TYPE_RANGE, nvbench::type_list<distribution::uniform>))
   .set_name("distinct_count_estimator::add_async")
-  .set_type_axes_names({"Estimator", "Distribution"})
+  .set_type_axes_names({"T", "Distribution"})
   .add_int64_power_of_two_axis("NumInputs", {28, 29, 30})
   .add_int64_axis("SketchSizeKB", {8, 16, 32, 64, 128, 256})
   .add_int64_axis("Multiplicity", {1})

From b990dcae5b55caebe95d76509f18f9fb6a507fc3 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 21 Mar 2024 14:10:45 +0000
Subject: [PATCH 57/78] Rename hash function

---
 .../distinct_count_estimator/distinct_count_estimator.inl   | 6 +++---
 .../distinct_count_estimator_ref.inl                        | 5 +++--
 include/cuco/detail/hyperloglog/hyperloglog.cuh             | 4 ++--
 include/cuco/detail/hyperloglog/hyperloglog_ref.cuh         | 6 +++---
 include/cuco/distinct_count_estimator.cuh                   | 3 ++-
 include/cuco/distinct_count_estimator_ref.cuh               | 3 ++-
 6 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
index 5b105af5c..6538e1588 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
@@ -101,13 +101,13 @@ template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 typename distinct_count_estimator<T, Scope, Hash, Allocator>::ref_type<>
 distinct_count_estimator<T, Scope, Hash, Allocator>::ref() const noexcept
 {
-  return {this->sketch(), this->hash()};
+  return {this->sketch(), this->hash_function()};
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-auto distinct_count_estimator<T, Scope, Hash, Allocator>::hash() const noexcept
+auto distinct_count_estimator<T, Scope, Hash, Allocator>::hash_function() const noexcept
 {
-  return this->impl_->hash();
+  return this->impl_->hash_function();
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
index 3be39ac44..535e40b32 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
@@ -107,9 +107,10 @@ __host__ std::size_t distinct_count_estimator_ref<T, Scope, Hash>::estimate(
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
-__host__ __device__ auto distinct_count_estimator_ref<T, Scope, Hash>::hash() const noexcept
+__host__ __device__ auto distinct_count_estimator_ref<T, Scope, Hash>::hash_function()
+  const noexcept
 {
-  return this->impl_.hash();
+  return this->impl_.hash_function();
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index 3c962536b..2b6ca738b 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -49,7 +49,7 @@ class hyperloglog {
                                                         ///< type
 
   using value_type    = typename ref_type<>::value_type;     ///< Type of items to count
-  using hash_type     = typename ref_type<>::hash_type;      ///< Hash function type
+  using hasher        = typename ref_type<>::hasher;         ///< Hash function type
   using register_type = typename ref_type<>::register_type;  ///< HLL register type
   using allocator_type =
     typename std::allocator_traits<Allocator>::template rebind_alloc<register_type>;  ///< Allocator
@@ -250,7 +250,7 @@ class hyperloglog {
    *
    * @return The hash function
    */
-  [[nodiscard]] auto hash() const noexcept { return this->ref_.hash(); }
+  [[nodiscard]] auto hash_function() const noexcept { return this->ref_.hash_function(); }
 
   /**
    * @brief Gets the span of the sketch.
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index c84f0c76d..c8ace0f23 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -60,7 +60,7 @@ class hyperloglog_ref {
   static constexpr auto thread_scope = Scope;  ///< CUDA thread scope
 
   using value_type    = T;     ///< Type of items to count
-  using hash_type     = Hash;  ///< Hash function type
+  using hasher        = Hash;  ///< Hash function type
   using register_type = int;   ///< HLL register type
 
   template <cuda::thread_scope NewScope>
@@ -435,7 +435,7 @@ class hyperloglog_ref {
    *
    * @return The hash function
    */
-  [[nodiscard]] __host__ __device__ auto hash() const noexcept { return this->hash_; }
+  [[nodiscard]] __host__ __device__ auto hash_function() const noexcept { return this->hash_; }
 
   /**
    * @brief Gets the span of the sketch.
@@ -524,7 +524,7 @@ class hyperloglog_ref {
                                                shmem_bytes);
   }
 
-  hash_type hash_;                         ///< Hash function used to hash items
+  hasher hash_;                            ///< Hash function used to hash items
   int32_t precision_;                      ///< HLL precision parameter
   hash_value_type register_mask_;          ///< Mask used to separate register index from count
   cuda::std::span<register_type> sketch_;  ///< HLL sketch storage
diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh
index 8e4cc097a..64ed5cc56 100644
--- a/include/cuco/distinct_count_estimator.cuh
+++ b/include/cuco/distinct_count_estimator.cuh
@@ -54,6 +54,7 @@ class distinct_count_estimator {
                                                                            ///< type
 
   using value_type     = typename impl_type::value_type;      ///< Type of items to count
+  using hasher         = typename impl_type::hasher;          ///< Type of hash function
   using allocator_type = typename impl_type::allocator_type;  ///< Allocator type
 
   // TODO enable CTAD
@@ -222,7 +223,7 @@ class distinct_count_estimator {
    *
    * @return The hash function
    */
-  [[nodiscard]] auto hash() const noexcept;
+  [[nodiscard]] auto hash_function() const noexcept;
 
   /**
    * @brief Gets the span of the sketch.
diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh
index a0806cccf..f6ebfe94b 100644
--- a/include/cuco/distinct_count_estimator_ref.cuh
+++ b/include/cuco/distinct_count_estimator_ref.cuh
@@ -46,6 +46,7 @@ class distinct_count_estimator_ref {
   static constexpr auto thread_scope = impl_type::thread_scope;  ///< CUDA thread scope
 
   using value_type = typename impl_type::value_type;  ///< Type of items to count
+  using hasher     = typename impl_type::hasher;      ///< Type of hash function
 
   template <cuda::thread_scope NewScope>
   using with_scope = distinct_count_estimator_ref<T, NewScope, Hash>;  ///< Ref type with different
@@ -200,7 +201,7 @@ class distinct_count_estimator_ref {
    *
    * @return The hash function
    */
-  [[nodiscard]] __host__ __device__ auto hash() const noexcept;
+  [[nodiscard]] __host__ __device__ auto hash_function() const noexcept;
 
   /**
    * @brief Gets the span of the sketch.

From c87309e9895ccc82e25b390787a81fed46c5ede0 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 21 Mar 2024 14:15:35 +0000
Subject: [PATCH 58/78] Use placement new to initialize sketch

---
 include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index c8ace0f23..ade4a8166 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -117,7 +117,7 @@ class hyperloglog_ref {
   __device__ void clear(CG const& group) noexcept
   {
     for (int i = group.thread_rank(); i < this->sketch_.size(); i += group.size()) {
-      this->sketch_[i] = 0;
+      new (&(this->sketch_[i])) register_type{};
     }
   }
 

From 03d4b41187cadfdf4d02a04ef3f0e9aced08df01 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 21 Mar 2024 14:19:21 +0000
Subject: [PATCH 59/78] Remove custom_deleter member

---
 include/cuco/detail/hyperloglog/hyperloglog.cuh | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index 2b6ca738b..38dff73f2 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -73,9 +73,9 @@ class hyperloglog {
                         Allocator const& alloc,
                         cuco::cuda_stream_ref stream)
     : allocator_{alloc},
-      deleter_{sketch_bytes(sketch_size_kb) / sizeof(register_type), this->allocator_},
-      sketch_{this->allocator_.allocate(sketch_bytes(sketch_size_kb) / sizeof(register_type)),
-              this->deleter_},
+      sketch_{
+        this->allocator_.allocate(sketch_bytes(sketch_size_kb) / sizeof(register_type)),
+        custom_deleter{sketch_bytes(sketch_size_kb) / sizeof(register_type), this->allocator_}},
       ref_{cuda::std::span{reinterpret_cast<std::byte*>(this->sketch_.get()),
                            sketch_bytes(sketch_size_kb)},
            hash}
@@ -293,8 +293,7 @@ class hyperloglog {
   }
 
  private:
-  allocator_type allocator_;                             ///< Storage allocator
-  custom_deleter<std::size_t, allocator_type> deleter_;  ///< Storage deleter
+  allocator_type allocator_;  ///< Storage allocator
   std::unique_ptr<register_type, custom_deleter<std::size_t, allocator_type>>
     sketch_;        ///< Sketch storage
   ref_type<> ref_;  //< Ref type

From 7de06fb05c8da41af8a2b88a6b3ab1b2db3b46d6 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 21 Mar 2024 14:21:34 +0000
Subject: [PATCH 60/78] Rename sketch_size.hpp -> sketch_size.cuh

---
 include/cuco/detail/hyperloglog/hyperloglog.cuh     | 2 +-
 include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 2 +-
 include/cuco/distinct_count_estimator.cuh           | 2 +-
 include/cuco/distinct_count_estimator_ref.cuh       | 2 +-
 include/cuco/{sketch_size.hpp => sketch_size.cuh}   | 0
 5 files changed, 4 insertions(+), 4 deletions(-)
 rename include/cuco/{sketch_size.hpp => sketch_size.cuh} (100%)

diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index 38dff73f2..a5c0a0e4a 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -20,7 +20,7 @@
 #include <cuco/detail/hyperloglog/hyperloglog_ref.cuh>
 #include <cuco/detail/storage/storage_base.cuh>
 #include <cuco/hash_functions.cuh>
-#include <cuco/sketch_size.hpp>
+#include <cuco/sketch_size.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
 #include <cstddef>
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index ade4a8166..67c38f0b8 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -22,7 +22,7 @@
 #include <cuco/detail/hyperloglog/kernels.cuh>
 #include <cuco/detail/utils.hpp>
 #include <cuco/hash_functions.cuh>
-#include <cuco/sketch_size.hpp>
+#include <cuco/sketch_size.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
 #include <cuco/utility/traits.hpp>
 
diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh
index 64ed5cc56..51fb82080 100644
--- a/include/cuco/distinct_count_estimator.cuh
+++ b/include/cuco/distinct_count_estimator.cuh
@@ -19,7 +19,7 @@
 #include <cuco/detail/hyperloglog/hyperloglog.cuh>
 #include <cuco/distinct_count_estimator_ref.cuh>
 #include <cuco/hash_functions.cuh>
-#include <cuco/sketch_size.hpp>
+#include <cuco/sketch_size.cuh>
 #include <cuco/utility/allocator.hpp>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh
index f6ebfe94b..d639310aa 100644
--- a/include/cuco/distinct_count_estimator_ref.cuh
+++ b/include/cuco/distinct_count_estimator_ref.cuh
@@ -18,7 +18,7 @@
 #include <cuco/cuda_stream_ref.hpp>
 #include <cuco/detail/hyperloglog/hyperloglog_ref.cuh>
 #include <cuco/hash_functions.cuh>
-#include <cuco/sketch_size.hpp>
+#include <cuco/sketch_size.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
 #include <cooperative_groups.h>
diff --git a/include/cuco/sketch_size.hpp b/include/cuco/sketch_size.cuh
similarity index 100%
rename from include/cuco/sketch_size.hpp
rename to include/cuco/sketch_size.cuh

From 185d3c4c213edde41da76401b6da48ea6c91923b Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 21 Mar 2024 14:24:19 +0000
Subject: [PATCH 61/78] Use std::abs

---
 benchmarks/distinct_count_estimator_bench.cu           | 3 ++-
 examples/distinct_count_estimator/host_bulk_example.cu | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu
index 76a664eaa..37c5f4f00 100644
--- a/benchmarks/distinct_count_estimator_bench.cu
+++ b/benchmarks/distinct_count_estimator_bench.cu
@@ -28,6 +28,7 @@
 
 #include <cuda/functional>
 
+#include <cmath>
 #include <cstddef>
 
 using namespace cuco::benchmark;
@@ -66,7 +67,7 @@ template <class Estimator, class Dist>
     estimator.add(items.begin(), items.end());
     double estimated_cardinality = estimator.estimate();
     double true_cardinality      = exact_distinct_count(items.begin(), num_items);
-    error_sum += abs(true_cardinality - estimated_cardinality) / true_cardinality;
+    error_sum += std::abs(true_cardinality - estimated_cardinality) / true_cardinality;
     estimator.clear();
   }
 
diff --git a/examples/distinct_count_estimator/host_bulk_example.cu b/examples/distinct_count_estimator/host_bulk_example.cu
index add3cb626..96a46a8df 100644
--- a/examples/distinct_count_estimator/host_bulk_example.cu
+++ b/examples/distinct_count_estimator/host_bulk_example.cu
@@ -18,6 +18,7 @@
 #include <thrust/device_vector.h>
 #include <thrust/sequence.h>
 
+#include <cmath>
 #include <cstddef>
 #include <iostream>
 
@@ -49,7 +50,8 @@ int main(void)
 
   std::cout << "True cardinality: " << num_items
             << "\nEstimated cardinality: " << estimated_cardinality << "\nRelative error: "
-            << abs(static_cast<double>(num_items) - static_cast<double>(estimated_cardinality)) /
+            << std::abs(static_cast<double>(num_items) -
+                        static_cast<double>(estimated_cardinality)) /
                  num_items
             << std::endl;
 

From 023d0809a683cf484a70950484f78480ff0a9c16 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 21 Mar 2024 14:28:09 +0000
Subject: [PATCH 62/78] Use std::vector instead of thrust::host_vector

---
 include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index 67c38f0b8..ade97f7cb 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -26,7 +26,6 @@
 #include <cuco/utility/cuda_thread_scope.cuh>
 #include <cuco/utility/traits.hpp>
 
-#include <thrust/host_vector.h>
 #include <thrust/type_traits/is_contiguous_iterator.h>
 
 #include <cuda/std/bit>
@@ -37,6 +36,7 @@
 #include <cooperative_groups/reduce.h>
 
 #include <cstddef>
+#include <vector>
 
 namespace cuco::detail {
 
@@ -405,7 +405,7 @@ class hyperloglog_ref {
   [[nodiscard]] __host__ std::size_t estimate(cuco::cuda_stream_ref stream) const
   {
     auto const num_regs = 1ull << this->precision_;
-    thrust::host_vector<register_type> host_sketch(num_regs);
+    std::vector<register_type> host_sketch(num_regs);
 
     // TODO check if storage is host accessible
     CUCO_CUDA_TRY(cudaMemcpyAsync(host_sketch.data(),

From 53cdf376f161717734ffb34283057e9ba540e3c4 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 21 Mar 2024 14:30:57 +0000
Subject: [PATCH 63/78] Add note about shmem alignment

---
 include/cuco/detail/hyperloglog/kernels.cuh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh
index ba4ceb506..c04ad4617 100644
--- a/include/cuco/detail/hyperloglog/kernels.cuh
+++ b/include/cuco/detail/hyperloglog/kernels.cuh
@@ -44,7 +44,8 @@ CUCO_KERNEL void add_shmem_vectorized(typename RefType::value_type const* first,
   using vector_type    = cuda::std::array<value_type, VectorSize>;
   using local_ref_type = typename RefType::with_scope<cuda::thread_scope_block>;
 
-  // TODO assert alignment
+  // Base address of dynamic shared memory is guaranteed to be aligned to at least 16 bytes which is
+  // sufficient for this purpose
   extern __shared__ std::byte local_sketch[];
 
   auto const loop_stride = cuco::detail::grid_stride();

From 2a81714e7d7e1bfa521eeeaa0045804791eece14 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 21 Mar 2024 14:32:05 +0000
Subject: [PATCH 64/78] Remove comment

---
 include/cuco/distinct_count_estimator.cuh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh
index 51fb82080..d7318d26a 100644
--- a/include/cuco/distinct_count_estimator.cuh
+++ b/include/cuco/distinct_count_estimator.cuh
@@ -82,7 +82,6 @@ class distinct_count_estimator {
   distinct_count_estimator& operator=(distinct_count_estimator const&) = delete;
   distinct_count_estimator(distinct_count_estimator&&) = default;  ///< Move constructor
 
-  // TODO this is somehow required to pass the Doxygen check.
   /**
    * @brief Copy-assignment operator.
    *

From d859b39e7aa302de7e2207f273c0990e27b17a88 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 21 Mar 2024 15:19:03 +0000
Subject: [PATCH 65/78] Remove device-side error handling since it hurts
 performance

---
 .../detail/hyperloglog/hyperloglog_ref.cuh    | 34 ++++++++-----------
 include/cuco/distinct_count_estimator_ref.cuh |  7 ++--
 2 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index ade97f7cb..7186d785b 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -35,6 +35,7 @@
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
 
+#include <algorithm>  // there is no <cuda/std/algorithm>
 #include <cstddef>
 #include <vector>
 
@@ -70,8 +71,9 @@ class hyperloglog_ref {
   /**
    * @brief Constructs a non-owning `hyperloglog_ref` object.
    *
-   * @throw If sketch size < 0.0625KB or 64B
-   * @throw If sketch storage has insufficient alignment
+   * @throw If sketch size < 0.0625KB or 64B. Throws if called from host; UB if called from device.
+   * @throw If sketch storage has insufficient alignment. Throws if called from host; UB if called
+   * from device.
    *
    * @param sketch_span Reference to sketch storage
    * @param hash The hash function used to hash items
@@ -86,24 +88,15 @@ class hyperloglog_ref {
       sketch_{reinterpret_cast<register_type*>(sketch_span.data()),
               this->sketch_bytes() / sizeof(register_type)}
   {
+#ifndef __CUDA_ARCH__
     auto const alignment =
       1ull << cuda::std::countr_zero(reinterpret_cast<cuda::std::uintptr_t>(sketch_span.data()));
+    CUCO_EXPECTS(
+      alignment >= sketch_alignment(), "Insufficient sketch alignment", std::runtime_error);
 
-    if (alignment < sketch_alignment()) {
-#ifdef __CUDA_ARCH__
-      __trap();
-#else
-      CUCO_FAIL("Insufficient sketch alignment", std::runtime_error);
+    CUCO_EXPECTS(
+      this->precision_ >= 4, "Minimum required sketch size is 0.0625KB or 64B", std::runtime_error);
 #endif
-    }
-
-    if (this->precision_ < 4) {
-#ifdef __CUDA_ARCH__
-      __trap();
-#else
-      CUCO_FAIL("Minimum required sketch size is 0.0625KB or 64B", std::runtime_error);
-#endif
-    }
   }
 
   /**
@@ -272,7 +265,7 @@ class hyperloglog_ref {
   /**
    * @brief Merges the result of `other` estimator reference into `*this` estimator reference.
    *
-   * @throw If this->sketch_bytes() != other.sketch_bytes()
+   * @throw If this->sketch_bytes() != other.sketch_bytes() then behavior is undefined
    *
    * @tparam CG CUDA Cooperative Group type
    * @tparam OtherScope Thread scope of `other` estimator
@@ -283,7 +276,8 @@ class hyperloglog_ref {
   template <class CG, cuda::thread_scope OtherScope>
   __device__ void merge(CG const& group, hyperloglog_ref<T, OtherScope, Hash> const& other)
   {
-    if (other.precision_ != this->precision_) { __trap(); }
+    // TODO find a better way to do error handling in device code
+    // if (other.precision_ != this->precision_) { __trap(); }
 
     for (int i = group.thread_rank(); i < this->sketch_.size(); i += group.size()) {
       this->update_max(i, other.sketch_[i]);
@@ -468,7 +462,9 @@ class hyperloglog_ref {
   [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes(
     cuco::sketch_size_kb sketch_size_kb) noexcept
   {
-    return cuda::std::bit_floor(static_cast<std::size_t>(sketch_size_kb * 1024));
+    // minimum precision is 4 or 64 bytes
+    return std::max(static_cast<std::size_t>(sizeof(register_type) * 1ull << 4),
+                    cuda::std::bit_floor(static_cast<std::size_t>(sketch_size_kb * 1024)));
   }
 
   /**
diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh
index d639310aa..bc0b9da61 100644
--- a/include/cuco/distinct_count_estimator_ref.cuh
+++ b/include/cuco/distinct_count_estimator_ref.cuh
@@ -55,8 +55,9 @@ class distinct_count_estimator_ref {
   /**
    * @brief Constructs a non-owning `distinct_count_estimator_ref` object.
    *
-   * @throw If sketch size < 0.0625KB or 64B
-   * @throw If sketch storage has insufficient alignment
+   * @throw If sketch size < 0.0625KB or 64B. Throws if called from host; UB if called from device.
+   * @throw If sketch storage has insufficient alignment. Throws if called from host; UB if called
+   * from device.
    *
    * @param sketch_span Reference to sketch storage
    * @param hash The hash function used to hash items
@@ -132,7 +133,7 @@ class distinct_count_estimator_ref {
   /**
    * @brief Merges the result of `other` estimator reference into `*this` estimator reference.
    *
-   * @throw If this->sketch_bytes() != other.sketch_bytes()
+   * @throw If this->sketch_bytes() != other.sketch_bytes() then behavior is undefined
    *
    * @tparam CG CUDA Cooperative Group type
    * @tparam OtherScope Thread scope of `other` estimator

From 43be0f000828ee17602e9202a30f5294d794a4b1 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 21 Mar 2024 23:46:19 +0000
Subject: [PATCH 66/78] Constexpr all the things!

---
 .../distinct_count_estimator.inl              | 28 +++++++--------
 .../distinct_count_estimator_ref.inl          | 32 ++++++++---------
 .../cuco/detail/hyperloglog/hyperloglog.cuh   | 34 +++++++++++--------
 .../detail/hyperloglog/hyperloglog_ref.cuh    | 34 +++++++++++--------
 include/cuco/distinct_count_estimator.cuh     | 30 ++++++++--------
 include/cuco/distinct_count_estimator_ref.cuh | 31 +++++++++--------
 6 files changed, 101 insertions(+), 88 deletions(-)

diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
index 6538e1588..5454165a6 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
@@ -27,21 +27,22 @@ constexpr distinct_count_estimator<T, Scope, Hash, Allocator>::distinct_count_es
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-void distinct_count_estimator<T, Scope, Hash, Allocator>::clear_async(
+constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::clear_async(
   cuco::cuda_stream_ref stream) noexcept
 {
   this->impl_->clear_async(stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-void distinct_count_estimator<T, Scope, Hash, Allocator>::clear(cuco::cuda_stream_ref stream)
+constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::clear(
+  cuco::cuda_stream_ref stream)
 {
   this->impl_->clear(stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <class InputIt>
-void distinct_count_estimator<T, Scope, Hash, Allocator>::add_async(
+constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::add_async(
   InputIt first, InputIt last, cuco::cuda_stream_ref stream) noexcept
 {
   this->impl_->add_async(first, last, stream);
@@ -49,16 +50,15 @@ void distinct_count_estimator<T, Scope, Hash, Allocator>::add_async(
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <class InputIt>
-void distinct_count_estimator<T, Scope, Hash, Allocator>::add(InputIt first,
-                                                              InputIt last,
-                                                              cuco::cuda_stream_ref stream)
+constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::add(
+  InputIt first, InputIt last, cuco::cuda_stream_ref stream)
 {
   this->impl_->add(first, last, stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <cuda::thread_scope OtherScope, class OtherAllocator>
-void distinct_count_estimator<T, Scope, Hash, Allocator>::merge_async(
+constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::merge_async(
   distinct_count_estimator<T, OtherScope, Hash, OtherAllocator> const& other,
   cuco::cuda_stream_ref stream)
 {
@@ -67,7 +67,7 @@ void distinct_count_estimator<T, Scope, Hash, Allocator>::merge_async(
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <cuda::thread_scope OtherScope, class OtherAllocator>
-void distinct_count_estimator<T, Scope, Hash, Allocator>::merge(
+constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::merge(
   distinct_count_estimator<T, OtherScope, Hash, OtherAllocator> const& other,
   cuco::cuda_stream_ref stream)
 {
@@ -76,7 +76,7 @@ void distinct_count_estimator<T, Scope, Hash, Allocator>::merge(
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <cuda::thread_scope OtherScope>
-void distinct_count_estimator<T, Scope, Hash, Allocator>::merge_async(
+constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::merge_async(
   ref_type<OtherScope> const& other_ref, cuco::cuda_stream_ref stream)
 {
   this->impl_->merge_async(other_ref, stream);
@@ -84,34 +84,34 @@ void distinct_count_estimator<T, Scope, Hash, Allocator>::merge_async(
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <cuda::thread_scope OtherScope>
-void distinct_count_estimator<T, Scope, Hash, Allocator>::merge(
+constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::merge(
   ref_type<OtherScope> const& other_ref, cuco::cuda_stream_ref stream)
 {
   this->impl_->merge(other_ref, stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-std::size_t distinct_count_estimator<T, Scope, Hash, Allocator>::estimate(
+constexpr std::size_t distinct_count_estimator<T, Scope, Hash, Allocator>::estimate(
   cuco::cuda_stream_ref stream) const
 {
   return this->impl_->estimate(stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-typename distinct_count_estimator<T, Scope, Hash, Allocator>::ref_type<>
+constexpr typename distinct_count_estimator<T, Scope, Hash, Allocator>::ref_type<>
 distinct_count_estimator<T, Scope, Hash, Allocator>::ref() const noexcept
 {
   return {this->sketch(), this->hash_function()};
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-auto distinct_count_estimator<T, Scope, Hash, Allocator>::hash_function() const noexcept
+constexpr auto distinct_count_estimator<T, Scope, Hash, Allocator>::hash_function() const noexcept
 {
   return this->impl_->hash_function();
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-cuda::std::span<std::byte> distinct_count_estimator<T, Scope, Hash, Allocator>::sketch()
+constexpr cuda::std::span<std::byte> distinct_count_estimator<T, Scope, Hash, Allocator>::sketch()
   const noexcept
 {
   return this->impl_->sketch();
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
index 535e40b32..a607c3ce9 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
@@ -26,51 +26,51 @@ __host__
 
 template <class T, cuda::thread_scope Scope, class Hash>
 template <class CG>
-__device__ void distinct_count_estimator_ref<T, Scope, Hash>::clear(CG const& group) noexcept
+__device__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::clear(
+  CG const& group) noexcept
 {
   this->impl_.clear(group);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
-__host__ void distinct_count_estimator_ref<T, Scope, Hash>::clear_async(
+__host__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::clear_async(
   cuco::cuda_stream_ref stream) noexcept
 {
   this->impl_.clear_async(stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
-__host__ void distinct_count_estimator_ref<T, Scope, Hash>::clear(cuco::cuda_stream_ref stream)
+__host__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::clear(
+  cuco::cuda_stream_ref stream)
 {
   this->impl_.clear(stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
-__device__ void distinct_count_estimator_ref<T, Scope, Hash>::add(T const& item) noexcept
+__device__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::add(T const& item) noexcept
 {
   this->impl_.add(item);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
 template <class InputIt>
-__host__ void distinct_count_estimator_ref<T, Scope, Hash>::add_async(InputIt first,
-                                                                      InputIt last,
-                                                                      cuco::cuda_stream_ref stream)
+__host__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::add_async(
+  InputIt first, InputIt last, cuco::cuda_stream_ref stream)
 {
   this->impl_.add_async(first, last, stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
 template <class InputIt>
-__host__ void distinct_count_estimator_ref<T, Scope, Hash>::add(InputIt first,
-                                                                InputIt last,
-                                                                cuco::cuda_stream_ref stream)
+__host__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::add(
+  InputIt first, InputIt last, cuco::cuda_stream_ref stream)
 {
   this->impl_.add(first, last, stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
 template <class CG, cuda::thread_scope OtherScope>
-__device__ void distinct_count_estimator_ref<T, Scope, Hash>::merge(
+__device__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::merge(
   CG const& group, distinct_count_estimator_ref<T, OtherScope, Hash> const& other)
 {
   this->impl_.merge(group, other.impl_);
@@ -78,7 +78,7 @@ __device__ void distinct_count_estimator_ref<T, Scope, Hash>::merge(
 
 template <class T, cuda::thread_scope Scope, class Hash>
 template <cuda::thread_scope OtherScope>
-__host__ void distinct_count_estimator_ref<T, Scope, Hash>::merge_async(
+__host__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::merge_async(
   distinct_count_estimator_ref<T, OtherScope, Hash> const& other, cuco::cuda_stream_ref stream)
 {
   this->impl_.merge_async(other, stream);
@@ -86,7 +86,7 @@ __host__ void distinct_count_estimator_ref<T, Scope, Hash>::merge_async(
 
 template <class T, cuda::thread_scope Scope, class Hash>
 template <cuda::thread_scope OtherScope>
-__host__ void distinct_count_estimator_ref<T, Scope, Hash>::merge(
+__host__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::merge(
   distinct_count_estimator_ref<T, OtherScope, Hash> const& other, cuco::cuda_stream_ref stream)
 {
   this->impl_.merge(other, stream);
@@ -100,21 +100,21 @@ __device__ std::size_t distinct_count_estimator_ref<T, Scope, Hash>::estimate(
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
-__host__ std::size_t distinct_count_estimator_ref<T, Scope, Hash>::estimate(
+__host__ constexpr std::size_t distinct_count_estimator_ref<T, Scope, Hash>::estimate(
   cuco::cuda_stream_ref stream) const
 {
   return this->impl_.estimate(stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
-__host__ __device__ auto distinct_count_estimator_ref<T, Scope, Hash>::hash_function()
+__host__ __device__ constexpr auto distinct_count_estimator_ref<T, Scope, Hash>::hash_function()
   const noexcept
 {
   return this->impl_.hash_function();
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
-__host__ __device__ cuda::std::span<std::byte>
+__host__ __device__ constexpr cuda::std::span<std::byte>
 distinct_count_estimator_ref<T, Scope, Hash>::sketch() const noexcept
 {
   return this->impl_.sketch();
diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index a5c0a0e4a..1d24eb3fc 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -101,7 +101,10 @@ class hyperloglog {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  void clear_async(cuco::cuda_stream_ref stream) noexcept { this->ref_.clear_async(stream); }
+  constexpr void clear_async(cuco::cuda_stream_ref stream) noexcept
+  {
+    this->ref_.clear_async(stream);
+  }
 
   /**
    * @brief Resets the estimator, i.e., clears the current count estimate.
@@ -111,7 +114,7 @@ class hyperloglog {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  void clear(cuco::cuda_stream_ref stream) { this->ref_.clear(stream); }
+  constexpr void clear(cuco::cuda_stream_ref stream) { this->ref_.clear(stream); }
 
   /**
    * @brief Asynchronously adds to be counted items to the estimator.
@@ -125,7 +128,7 @@ class hyperloglog {
    * @param stream CUDA stream this operation is executed in
    */
   template <class InputIt>
-  void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+  constexpr void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream)
   {
     this->ref_.add_async(first, last, stream);
   }
@@ -145,7 +148,7 @@ class hyperloglog {
    * @param stream CUDA stream this operation is executed in
    */
   template <class InputIt>
-  void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+  constexpr void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream)
   {
     this->ref_.add(first, last, stream);
   }
@@ -162,8 +165,8 @@ class hyperloglog {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope, class OtherAllocator>
-  void merge_async(hyperloglog<T, OtherScope, Hash, OtherAllocator> const& other,
-                   cuco::cuda_stream_ref stream)
+  constexpr void merge_async(hyperloglog<T, OtherScope, Hash, OtherAllocator> const& other,
+                             cuco::cuda_stream_ref stream)
   {
     this->ref_.merge_async(other.ref(), stream);
   }
@@ -183,8 +186,8 @@ class hyperloglog {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope, class OtherAllocator>
-  void merge(hyperloglog<T, OtherScope, Hash, OtherAllocator> const& other,
-             cuco::cuda_stream_ref stream)
+  constexpr void merge(hyperloglog<T, OtherScope, Hash, OtherAllocator> const& other,
+                       cuco::cuda_stream_ref stream)
   {
     this->ref_.merge(other.ref(), stream);
   }
@@ -200,7 +203,7 @@ class hyperloglog {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  void merge_async(ref_type<OtherScope> const& other_ref, cuco::cuda_stream_ref stream)
+  constexpr void merge_async(ref_type<OtherScope> const& other_ref, cuco::cuda_stream_ref stream)
   {
     this->ref_.merge_async(other_ref, stream);
   }
@@ -219,7 +222,7 @@ class hyperloglog {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  void merge(ref_type<OtherScope> const& other_ref, cuco::cuda_stream_ref stream)
+  constexpr void merge(ref_type<OtherScope> const& other_ref, cuco::cuda_stream_ref stream)
   {
     this->ref_.merge(other_ref, stream);
   }
@@ -233,7 +236,7 @@ class hyperloglog {
    *
    * @return Approximate distinct items count
    */
-  [[nodiscard]] std::size_t estimate(cuco::cuda_stream_ref stream) const
+  [[nodiscard]] constexpr std::size_t estimate(cuco::cuda_stream_ref stream) const
   {
     return this->ref_.estimate(stream);
   }
@@ -243,21 +246,24 @@ class hyperloglog {
    *
    * @return Device ref object of the current `distinct_count_estimator` host object
    */
-  [[nodiscard]] ref_type<> ref() const noexcept { return this->ref_; }
+  [[nodiscard]] constexpr ref_type<> ref() const noexcept { return this->ref_; }
 
   /**
    * @brief Get hash function.
    *
    * @return The hash function
    */
-  [[nodiscard]] auto hash_function() const noexcept { return this->ref_.hash_function(); }
+  [[nodiscard]] constexpr auto hash_function() const noexcept { return this->ref_.hash_function(); }
 
   /**
    * @brief Gets the span of the sketch.
    *
    * @return The cuda::std::span of the sketch
    */
-  [[nodiscard]] cuda::std::span<std::byte> sketch() const noexcept { return this->ref_.sketch(); }
+  [[nodiscard]] constexpr cuda::std::span<std::byte> sketch() const noexcept
+  {
+    return this->ref_.sketch();
+  }
 
   /**
    * @brief Gets the number of bytes required for the sketch storage.
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index 7186d785b..10dc14273 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -107,7 +107,7 @@ class hyperloglog_ref {
    * @param group CUDA Cooperative group this operation is executed in
    */
   template <class CG>
-  __device__ void clear(CG const& group) noexcept
+  __device__ constexpr void clear(CG const& group) noexcept
   {
     for (int i = group.thread_rank(); i < this->sketch_.size(); i += group.size()) {
       new (&(this->sketch_[i])) register_type{};
@@ -122,7 +122,7 @@ class hyperloglog_ref {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  __host__ void clear(cuco::cuda_stream_ref stream)
+  __host__ constexpr void clear(cuco::cuda_stream_ref stream)
   {
     this->clear_async(stream);
     stream.synchronize();
@@ -133,7 +133,7 @@ class hyperloglog_ref {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  __host__ void clear_async(cuco::cuda_stream_ref stream) noexcept
+  __host__ constexpr void clear_async(cuco::cuda_stream_ref stream) noexcept
   {
     auto constexpr block_size = 1024;
     cuco::hyperloglog_ns::detail::clear<<<1, block_size, 0, stream>>>(*this);
@@ -144,7 +144,7 @@ class hyperloglog_ref {
    *
    * @param item The item to be counted
    */
-  __device__ void add(T const& item) noexcept
+  __device__ constexpr void add(T const& item) noexcept
   {
     auto const h      = this->hash_(item);
     auto const reg    = h & this->register_mask_;
@@ -165,7 +165,7 @@ class hyperloglog_ref {
    * @param stream CUDA stream this operation is executed in
    */
   template <class InputIt>
-  __host__ void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+  __host__ constexpr void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream)
   {
     auto const num_items = cuco::detail::distance(first, last);
     if (num_items == 0) { return; }
@@ -256,7 +256,7 @@ class hyperloglog_ref {
    * @param stream CUDA stream this operation is executed in
    */
   template <class InputIt>
-  __host__ void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+  __host__ constexpr void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream)
   {
     this->add_async(first, last, stream);
     stream.synchronize();
@@ -274,7 +274,8 @@ class hyperloglog_ref {
    * @param other Other estimator reference to be merged into `*this`
    */
   template <class CG, cuda::thread_scope OtherScope>
-  __device__ void merge(CG const& group, hyperloglog_ref<T, OtherScope, Hash> const& other)
+  __device__ constexpr void merge(CG const& group,
+                                  hyperloglog_ref<T, OtherScope, Hash> const& other)
   {
     // TODO find a better way to do error handling in device code
     // if (other.precision_ != this->precision_) { __trap(); }
@@ -296,8 +297,8 @@ class hyperloglog_ref {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  __host__ void merge_async(hyperloglog_ref<T, OtherScope, Hash> const& other,
-                            cuco::cuda_stream_ref stream)
+  __host__ constexpr void merge_async(hyperloglog_ref<T, OtherScope, Hash> const& other,
+                                      cuco::cuda_stream_ref stream)
   {
     CUCO_EXPECTS(other.precision == this->precision_,
                  "Cannot merge estimators with different sketch sizes",
@@ -320,8 +321,8 @@ class hyperloglog_ref {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  __host__ void merge(hyperloglog_ref<T, OtherScope, Hash> const& other,
-                      cuco::cuda_stream_ref stream)
+  __host__ constexpr void merge(hyperloglog_ref<T, OtherScope, Hash> const& other,
+                                cuco::cuda_stream_ref stream)
   {
     this->merge_async(other, stream);
     stream.synchronize();
@@ -396,7 +397,7 @@ class hyperloglog_ref {
    *
    * @return Approximate distinct items count
    */
-  [[nodiscard]] __host__ std::size_t estimate(cuco::cuda_stream_ref stream) const
+  [[nodiscard]] __host__ constexpr std::size_t estimate(cuco::cuda_stream_ref stream) const
   {
     auto const num_regs = 1ull << this->precision_;
     std::vector<register_type> host_sketch(num_regs);
@@ -429,14 +430,17 @@ class hyperloglog_ref {
    *
    * @return The hash function
    */
-  [[nodiscard]] __host__ __device__ auto hash_function() const noexcept { return this->hash_; }
+  [[nodiscard]] __host__ __device__ constexpr auto hash_function() const noexcept
+  {
+    return this->hash_;
+  }
 
   /**
    * @brief Gets the span of the sketch.
    *
    * @return The cuda::std::span of the sketch
    */
-  [[nodiscard]] __host__ __device__ cuda::std::span<std::byte> sketch() const noexcept
+  [[nodiscard]] __host__ __device__ constexpr cuda::std::span<std::byte> sketch() const noexcept
   {
     return cuda::std::span<std::byte>(reinterpret_cast<std::byte*>(this->sketch_.data()),
                                       this->sketch_bytes());
@@ -447,7 +451,7 @@ class hyperloglog_ref {
    *
    * @return The number of bytes required for the sketch
    */
-  [[nodiscard]] __host__ __device__ std::size_t sketch_bytes() const noexcept
+  [[nodiscard]] __host__ __device__ constexpr std::size_t sketch_bytes() const noexcept
   {
     return (1ull << this->precision_) * sizeof(register_type);
   }
diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh
index d7318d26a..011194ad5 100644
--- a/include/cuco/distinct_count_estimator.cuh
+++ b/include/cuco/distinct_count_estimator.cuh
@@ -94,7 +94,7 @@ class distinct_count_estimator {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  void clear_async(cuco::cuda_stream_ref stream = {}) noexcept;
+  constexpr void clear_async(cuco::cuda_stream_ref stream = {}) noexcept;
 
   /**
    * @brief Resets the estimator, i.e., clears the current count estimate.
@@ -104,7 +104,7 @@ class distinct_count_estimator {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  void clear(cuco::cuda_stream_ref stream = {});
+  constexpr void clear(cuco::cuda_stream_ref stream = {});
 
   /**
    * @brief Asynchronously adds to be counted items to the estimator.
@@ -118,7 +118,7 @@ class distinct_count_estimator {
    * @param stream CUDA stream this operation is executed in
    */
   template <class InputIt>
-  void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {}) noexcept;
+  constexpr void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {}) noexcept;
 
   /**
    * @brief Adds to be counted items to the estimator.
@@ -135,7 +135,7 @@ class distinct_count_estimator {
    * @param stream CUDA stream this operation is executed in
    */
   template <class InputIt>
-  void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {});
+  constexpr void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {});
 
   /**
    * @brief Asynchronously merges the result of `other` estimator into `*this` estimator.
@@ -149,8 +149,9 @@ class distinct_count_estimator {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope, class OtherAllocator>
-  void merge_async(distinct_count_estimator<T, OtherScope, Hash, OtherAllocator> const& other,
-                   cuco::cuda_stream_ref stream = {});
+  constexpr void merge_async(
+    distinct_count_estimator<T, OtherScope, Hash, OtherAllocator> const& other,
+    cuco::cuda_stream_ref stream = {});
 
   /**
    * @brief Merges the result of `other` estimator into `*this` estimator.
@@ -167,8 +168,8 @@ class distinct_count_estimator {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope, class OtherAllocator>
-  void merge(distinct_count_estimator<T, OtherScope, Hash, OtherAllocator> const& other,
-             cuco::cuda_stream_ref stream = {});
+  constexpr void merge(distinct_count_estimator<T, OtherScope, Hash, OtherAllocator> const& other,
+                       cuco::cuda_stream_ref stream = {});
 
   /**
    * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator.
@@ -181,7 +182,8 @@ class distinct_count_estimator {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  void merge_async(ref_type<OtherScope> const& other_ref, cuco::cuda_stream_ref stream = {});
+  constexpr void merge_async(ref_type<OtherScope> const& other_ref,
+                             cuco::cuda_stream_ref stream = {});
 
   /**
    * @brief Merges the result of `other` estimator reference into `*this` estimator.
@@ -197,7 +199,7 @@ class distinct_count_estimator {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  void merge(ref_type<OtherScope> const& other_ref, cuco::cuda_stream_ref stream = {});
+  constexpr void merge(ref_type<OtherScope> const& other_ref, cuco::cuda_stream_ref stream = {});
 
   /**
    * @brief Compute the estimated distinct items count.
@@ -208,28 +210,28 @@ class distinct_count_estimator {
    *
    * @return Approximate distinct items count
    */
-  [[nodiscard]] std::size_t estimate(cuco::cuda_stream_ref stream = {}) const;
+  [[nodiscard]] constexpr std::size_t estimate(cuco::cuda_stream_ref stream = {}) const;
 
   /**
    * @brief Get device ref.
    *
    * @return Device ref object of the current `distinct_count_estimator` host object
    */
-  [[nodiscard]] ref_type<> ref() const noexcept;
+  [[nodiscard]] constexpr ref_type<> ref() const noexcept;
 
   /**
    * @brief Get hash function.
    *
    * @return The hash function
    */
-  [[nodiscard]] auto hash_function() const noexcept;
+  [[nodiscard]] constexpr auto hash_function() const noexcept;
 
   /**
    * @brief Gets the span of the sketch.
    *
    * @return The cuda::std::span of the sketch
    */
-  [[nodiscard]] cuda::std::span<std::byte> sketch() const noexcept;
+  [[nodiscard]] constexpr cuda::std::span<std::byte> sketch() const noexcept;
 
   /**
    * @brief Gets the number of bytes required for the sketch storage.
diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh
index bc0b9da61..74b60dbb3 100644
--- a/include/cuco/distinct_count_estimator_ref.cuh
+++ b/include/cuco/distinct_count_estimator_ref.cuh
@@ -73,14 +73,14 @@ class distinct_count_estimator_ref {
    * @param group CUDA Cooperative group this operation is executed in
    */
   template <class CG>
-  __device__ void clear(CG const& group) noexcept;
+  __device__ constexpr void clear(CG const& group) noexcept;
 
   /**
    * @brief Asynchronously resets the estimator, i.e., clears the current count estimate.
    *
    * @param stream CUDA stream this operation is executed in
    */
-  __host__ void clear_async(cuco::cuda_stream_ref stream = {}) noexcept;
+  __host__ constexpr void clear_async(cuco::cuda_stream_ref stream = {}) noexcept;
 
   /**
    * @brief Resets the estimator, i.e., clears the current count estimate.
@@ -90,14 +90,14 @@ class distinct_count_estimator_ref {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  __host__ void clear(cuco::cuda_stream_ref stream = {});
+  __host__ constexpr void clear(cuco::cuda_stream_ref stream = {});
 
   /**
    * @brief Adds an item to the estimator.
    *
    * @param item The item to be counted
    */
-  __device__ void add(T const& item) noexcept;
+  __device__ constexpr void add(T const& item) noexcept;
 
   /**
    * @brief Asynchronously adds to be counted items to the estimator.
@@ -111,7 +111,7 @@ class distinct_count_estimator_ref {
    * @param stream CUDA stream this operation is executed in
    */
   template <class InputIt>
-  __host__ void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {});
+  __host__ constexpr void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {});
 
   /**
    * @brief Adds to be counted items to the estimator.
@@ -128,7 +128,7 @@ class distinct_count_estimator_ref {
    * @param stream CUDA stream this operation is executed in
    */
   template <class InputIt>
-  __host__ void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {});
+  __host__ constexpr void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {});
 
   /**
    * @brief Merges the result of `other` estimator reference into `*this` estimator reference.
@@ -142,8 +142,8 @@ class distinct_count_estimator_ref {
    * @param other Other estimator reference to be merged into `*this`
    */
   template <class CG, cuda::thread_scope OtherScope>
-  __device__ void merge(CG const& group,
-                        distinct_count_estimator_ref<T, OtherScope, Hash> const& other);
+  __device__ constexpr void merge(CG const& group,
+                                  distinct_count_estimator_ref<T, OtherScope, Hash> const& other);
 
   /**
    * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator.
@@ -156,8 +156,9 @@ class distinct_count_estimator_ref {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  __host__ void merge_async(distinct_count_estimator_ref<T, OtherScope, Hash> const& other,
-                            cuco::cuda_stream_ref stream = {});
+  __host__ constexpr void merge_async(
+    distinct_count_estimator_ref<T, OtherScope, Hash> const& other,
+    cuco::cuda_stream_ref stream = {});
 
   /**
    * @brief Merges the result of `other` estimator reference into `*this` estimator.
@@ -173,8 +174,8 @@ class distinct_count_estimator_ref {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  __host__ void merge(distinct_count_estimator_ref<T, OtherScope, Hash> const& other,
-                      cuco::cuda_stream_ref stream = {});
+  __host__ constexpr void merge(distinct_count_estimator_ref<T, OtherScope, Hash> const& other,
+                                cuco::cuda_stream_ref stream = {});
 
   /**
    * @brief Compute the estimated distinct items count.
@@ -195,21 +196,21 @@ class distinct_count_estimator_ref {
    *
    * @return Approximate distinct items count
    */
-  [[nodiscard]] __host__ std::size_t estimate(cuco::cuda_stream_ref stream = {}) const;
+  [[nodiscard]] __host__ constexpr std::size_t estimate(cuco::cuda_stream_ref stream = {}) const;
 
   /**
    * @brief Gets the hash function.
    *
    * @return The hash function
    */
-  [[nodiscard]] __host__ __device__ auto hash_function() const noexcept;
+  [[nodiscard]] __host__ __device__ constexpr auto hash_function() const noexcept;
 
   /**
    * @brief Gets the span of the sketch.
    *
    * @return The cuda::std::span of the sketch
    */
-  [[nodiscard]] __host__ __device__ cuda::std::span<std::byte> sketch() const noexcept;
+  [[nodiscard]] __host__ __device__ constexpr cuda::std::span<std::byte> sketch() const noexcept;
 
   /**
    * @brief Gets the number of bytes required for the sketch storage.

From fbd6dab0066d535659d95cd6ae82058b557e0517 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 22 Mar 2024 00:47:26 +0000
Subject: [PATCH 67/78] Add constructor overload which takes the desired
 standard deviation

---
 .../host_bulk_example.cu                      |  6 +-
 .../distinct_count_estimator.inl              | 17 +++++
 .../distinct_count_estimator_ref.inl          |  8 +++
 .../cuco/detail/hyperloglog/hyperloglog.cuh   | 66 ++++++++++++++++---
 .../detail/hyperloglog/hyperloglog_ref.cuh    | 29 +++++++-
 include/cuco/distinct_count_estimator.cuh     | 29 +++++++-
 include/cuco/distinct_count_estimator_ref.cuh | 13 +++-
 include/cuco/standard_deviation.cuh           | 45 +++++++++++++
 8 files changed, 197 insertions(+), 16 deletions(-)
 create mode 100644 include/cuco/standard_deviation.cuh

diff --git a/examples/distinct_count_estimator/host_bulk_example.cu b/examples/distinct_count_estimator/host_bulk_example.cu
index 96a46a8df..56ee90a42 100644
--- a/examples/distinct_count_estimator/host_bulk_example.cu
+++ b/examples/distinct_count_estimator/host_bulk_example.cu
@@ -36,8 +36,12 @@ int main(void)
   // Generate `num_items` distinct items
   thrust::sequence(items.begin(), items.end(), 0);
 
+  // We define the desired standard deviation of the approximation error
+  // 0.0122197 is the default value and corresponds to a 32KB sketch size
+  auto const sd = cuco::standard_deviation{0.0122197};
+
   // Initialize the estimator
-  cuco::distinct_count_estimator<T> estimator;
+  cuco::distinct_count_estimator<T> estimator{sd};
 
   // Add all items to the estimator
   estimator.add(items.begin(), items.end());
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
index 5454165a6..ed5d9792f 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
@@ -26,6 +26,16 @@ constexpr distinct_count_estimator<T, Scope, Hash, Allocator>::distinct_count_es
 {
 }
 
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+constexpr distinct_count_estimator<T, Scope, Hash, Allocator>::distinct_count_estimator(
+  cuco::standard_deviation standard_deviation,
+  Hash const& hash,
+  Allocator const& alloc,
+  cuco::cuda_stream_ref stream)
+  : impl_{std::make_unique<impl_type>(standard_deviation, hash, alloc, stream)}
+{
+}
+
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::clear_async(
   cuco::cuda_stream_ref stream) noexcept
@@ -130,6 +140,13 @@ constexpr size_t distinct_count_estimator<T, Scope, Hash, Allocator>::sketch_byt
   return impl_type::sketch_bytes(sketch_size_kb);
 }
 
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+constexpr size_t distinct_count_estimator<T, Scope, Hash, Allocator>::sketch_bytes(
+  cuco::standard_deviation standard_deviation) noexcept
+{
+  return impl_type::sketch_bytes(standard_deviation);
+}
+
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 constexpr size_t distinct_count_estimator<T, Scope, Hash, Allocator>::sketch_alignment() noexcept
 {
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
index a607c3ce9..97649d5bc 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
@@ -135,6 +135,14 @@ distinct_count_estimator_ref<T, Scope, Hash>::sketch_bytes(
   return impl_type::sketch_bytes(sketch_size_kb);
 }
 
+template <class T, cuda::thread_scope Scope, class Hash>
+__host__ __device__ constexpr std::size_t
+distinct_count_estimator_ref<T, Scope, Hash>::sketch_bytes(
+  cuco::standard_deviation standard_deviation) noexcept
+{
+  return impl_type::sketch_bytes(standard_deviation);
+}
+
 template <class T, cuda::thread_scope Scope, class Hash>
 __host__ __device__ constexpr std::size_t
 distinct_count_estimator_ref<T, Scope, Hash>::sketch_alignment() noexcept
diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index 1d24eb3fc..13106cc08 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -21,6 +21,7 @@
 #include <cuco/detail/storage/storage_base.cuh>
 #include <cuco/hash_functions.cuh>
 #include <cuco/sketch_size.cuh>
+#include <cuco/standard_deviation.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
 #include <cstddef>
@@ -55,13 +56,34 @@ class hyperloglog {
     typename std::allocator_traits<Allocator>::template rebind_alloc<register_type>;  ///< Allocator
                                                                                       ///< type
 
+ private:
   /**
    * @brief Constructs a `hyperloglog` host object.
    *
    * @note This function synchronizes the given stream.
    *
-   * @throw If sketch size < 0.0625KB or 64B
-   * @throw If sketch storage has insufficient alignment
+   * @param sketch_size_b Sketch size in bytes
+   * @param hash The hash function used to hash items
+   * @param alloc Allocator used for allocating device storage
+   * @param stream CUDA stream used to initialize the object
+   */
+  constexpr hyperloglog(std::size_t sketch_size_b,
+                        Hash const& hash,
+                        Allocator const& alloc,
+                        cuco::cuda_stream_ref stream)
+    : allocator_{alloc},
+      sketch_{this->allocator_.allocate(sketch_size_b / sizeof(register_type)),
+              custom_deleter{sketch_size_b / sizeof(register_type), this->allocator_}},
+      ref_{cuda::std::span{reinterpret_cast<std::byte*>(this->sketch_.get()), sketch_size_b}, hash}
+  {
+    this->ref_.clear_async(stream);
+  }
+
+ public:
+  /**
+   * @brief Constructs a `hyperloglog` host object.
+   *
+   * @note This function synchronizes the given stream.
    *
    * @param sketch_size_kb Maximum sketch size in KB
    * @param hash The hash function used to hash items
@@ -72,15 +94,26 @@ class hyperloglog {
                         Hash const& hash,
                         Allocator const& alloc,
                         cuco::cuda_stream_ref stream)
-    : allocator_{alloc},
-      sketch_{
-        this->allocator_.allocate(sketch_bytes(sketch_size_kb) / sizeof(register_type)),
-        custom_deleter{sketch_bytes(sketch_size_kb) / sizeof(register_type), this->allocator_}},
-      ref_{cuda::std::span{reinterpret_cast<std::byte*>(this->sketch_.get()),
-                           sketch_bytes(sketch_size_kb)},
-           hash}
+    : hyperloglog{sketch_bytes(sketch_size_kb), hash, alloc, stream}
+  {
+  }
+
+  /**
+   * @brief Constructs a `hyperloglog` host object.
+   *
+   * @note This function synchronizes the given stream.
+   *
+   * @param standard_deviation Desired standard deviation for the approximation error
+   * @param hash The hash function used to hash items
+   * @param alloc Allocator used for allocating device storage
+   * @param stream CUDA stream used to initialize the object
+   */
+  constexpr hyperloglog(cuco::standard_deviation standard_deviation,
+                        Hash const& hash,
+                        Allocator const& alloc,
+                        cuco::cuda_stream_ref stream)
+    : hyperloglog{sketch_bytes(standard_deviation), hash, alloc, stream}
   {
-    this->ref_.clear_async(stream);
   }
 
   ~hyperloglog() = default;
@@ -288,6 +321,19 @@ class hyperloglog {
     return ref_type<>::sketch_bytes(sketch_size_kb);
   }
 
+  /**
+   * @brief Gets the number of bytes required for the sketch storage.
+   *
+   * @param standard_deviation Upper bound standard deviation for approximation error
+   *
+   * @return The number of bytes required for the sketch
+   */
+  [[nodiscard]] static constexpr std::size_t sketch_bytes(
+    cuco::standard_deviation standard_deviation) noexcept
+  {
+    return ref_type<>::sketch_bytes(standard_deviation);
+  }
+
   /**
    * @brief Gets the alignment required for the sketch storage.
    *
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index 10dc14273..99af829d2 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -23,6 +23,7 @@
 #include <cuco/detail/utils.hpp>
 #include <cuco/hash_functions.cuh>
 #include <cuco/sketch_size.cuh>
+#include <cuco/standard_deviation.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
 #include <cuco/utility/traits.hpp>
 
@@ -71,7 +72,8 @@ class hyperloglog_ref {
   /**
    * @brief Constructs a non-owning `hyperloglog_ref` object.
    *
-   * @throw If sketch size < 0.0625KB or 64B. Throws if called from host; UB if called from device.
+   * @throw If sketch size < 0.0625KB or 64B or standard deviation > 0.2765. Throws if called from
+   * host; UB if called from device.
    * @throw If sketch storage has insufficient alignment. Throws if called from host; UB if called.
    * from device.
    *
@@ -471,6 +473,31 @@ class hyperloglog_ref {
                     cuda::std::bit_floor(static_cast<std::size_t>(sketch_size_kb * 1024)));
   }
 
+  /**
+   * @brief Gets the number of bytes required for the sketch storage.
+   *
+   * @param standard_deviation Upper bound standard deviation for approximation error
+   *
+   * @return The number of bytes required for the sketch
+   */
+  [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes(
+    cuco::standard_deviation standard_deviation) noexcept
+  {
+    // implementation taken from
+    // https://github.com/apache/spark/blob/6a27789ad7d59cd133653a49be0bb49729542abe/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/HyperLogLogPlusPlusHelper.scala#L43
+
+    //  minimum precision is 4 or 64 bytes
+    auto const precision = std::max(
+      static_cast<int32_t>(4),
+      static_cast<int32_t>(
+        cuda::std::ceil(2.0 * cuda::std::log(1.106 / standard_deviation) / cuda::std::log(2.0))));
+
+    // inverse of this function (omitting the minimum precision constraint) is
+    // standard_deviation = 1.106 / exp((precision * log(2.0)) / 2.0)
+
+    return sizeof(register_type) * (1ull << precision);
+  }
+
   /**
    * @brief Gets the alignment required for the sketch storage.
    *
diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh
index 011194ad5..a43590a43 100644
--- a/include/cuco/distinct_count_estimator.cuh
+++ b/include/cuco/distinct_count_estimator.cuh
@@ -20,6 +20,7 @@
 #include <cuco/distinct_count_estimator_ref.cuh>
 #include <cuco/hash_functions.cuh>
 #include <cuco/sketch_size.cuh>
+#include <cuco/standard_deviation.cuh>
 #include <cuco/utility/allocator.hpp>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
@@ -63,9 +64,6 @@ class distinct_count_estimator {
    *
    * @note This function synchronizes the given stream.
    *
-   * @throw If sketch size < 0.0625KB or 64B
-   * @throw If sketch storage has insufficient alignment
-   *
    * @param sketch_size_kb Maximum sketch size in KB
    * @param hash The hash function used to hash items
    * @param alloc Allocator used for allocating device storage
@@ -76,6 +74,21 @@ class distinct_count_estimator {
                                      Allocator const& alloc              = {},
                                      cuco::cuda_stream_ref stream        = {});
 
+  /**
+   * @brief Constructs a `distinct_count_estimator` host object.
+   *
+   * @note This function synchronizes the given stream.
+   *
+   * @param standard_deviation Desired standard deviation for the approximation error
+   * @param hash The hash function used to hash items
+   * @param alloc Allocator used for allocating device storage
+   * @param stream CUDA stream used to initialize the object
+   */
+  constexpr distinct_count_estimator(cuco::standard_deviation standard_deviation,
+                                     Hash const& hash             = {},
+                                     Allocator const& alloc       = {},
+                                     cuco::cuda_stream_ref stream = {});
+
   ~distinct_count_estimator() = default;
 
   distinct_count_estimator(distinct_count_estimator const&)            = delete;
@@ -250,6 +263,16 @@ class distinct_count_estimator {
   [[nodiscard]] static constexpr std::size_t sketch_bytes(
     cuco::sketch_size_kb sketch_size_kb) noexcept;
 
+  /**
+   * @brief Gets the number of bytes required for the sketch storage.
+   *
+   * @param standard_deviation Upper bound standard deviation for approximation error
+   *
+   * @return The number of bytes required for the sketch
+   */
+  [[nodiscard]] static constexpr std::size_t sketch_bytes(
+    cuco::standard_deviation standard_deviation) noexcept;
+
   /**
    * @brief Gets the alignment required for the sketch storage.
    *
diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh
index 74b60dbb3..d2eb6ab58 100644
--- a/include/cuco/distinct_count_estimator_ref.cuh
+++ b/include/cuco/distinct_count_estimator_ref.cuh
@@ -55,7 +55,8 @@ class distinct_count_estimator_ref {
   /**
    * @brief Constructs a non-owning `distinct_count_estimator_ref` object.
    *
-   * @throw If sketch size < 0.0625KB or 64B. Throws if called from host; UB if called from device.
+   * @throw If sketch size < 0.0625KB or 64B or standard deviation > 0.2765. Throws if called from
+   * host; UB if called from device.
    * @throw If sketch storage has insufficient alignment. Throws if called from host; UB if called
    * from device.
    *
@@ -229,6 +230,16 @@ class distinct_count_estimator_ref {
   [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes(
     cuco::sketch_size_kb sketch_size_kb) noexcept;
 
+  /**
+   * @brief Gets the number of bytes required for the sketch storage.
+   *
+   * @param standard_deviation Upper bound standard deviation for approximation error
+   *
+   * @return The number of bytes required for the sketch
+   */
+  [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes(
+    cuco::standard_deviation standard_deviation) noexcept;
+
   /**
    * @brief Gets the alignment required for the sketch storage.
    *
diff --git a/include/cuco/standard_deviation.cuh b/include/cuco/standard_deviation.cuh
new file mode 100644
index 000000000..9486784b9
--- /dev/null
+++ b/include/cuco/standard_deviation.cuh
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+namespace cuco {
+
+/**
+ * @brief Strong type for specifying the desired standard deviation of
+ * cuco::distinct_count_estimator(_ref).
+ */
+class standard_deviation {
+ public:
+  /**
+   * @brief Constructs a standard_deviation object.
+   *
+   * @param value The desired standard deviation
+   */
+  __host__ __device__ explicit constexpr standard_deviation(double value) noexcept : value_{value}
+  {
+  }
+
+  /**
+   * @brief Conversion to value type.
+   *
+   * @return Standard deviation
+   */
+  __host__ __device__ constexpr operator double() const noexcept { return this->value_; }
+
+ private:
+  double value_;  ///< Desired standard deviation
+};
+}  // namespace cuco
\ No newline at end of file

From dfe1a070cdc643dcfd03a446f272c0619bfdbda5 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 22 Mar 2024 00:49:17 +0000
Subject: [PATCH 68/78] Remove stray include

---
 benchmarks/utils.hpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/benchmarks/utils.hpp b/benchmarks/utils.hpp
index 97ca4988f..392cafe06 100644
--- a/benchmarks/utils.hpp
+++ b/benchmarks/utils.hpp
@@ -21,8 +21,6 @@
 
 #include <nvbench/nvbench.cuh>
 
-#include <cuda/std/atomic>  // thread_scope
-
 namespace cuco::benchmark {
 
 template <typename Dist>

From 2629adc95346b45d62c7b192c0c24f3b5c0a319c Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Sat, 23 Mar 2024 02:04:43 +0000
Subject: [PATCH 69/78] Bugfixes

---
 .../detail/hyperloglog/hyperloglog_ref.cuh    | 34 +++++++++++--------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index 99af829d2..1abb6d38e 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -84,7 +84,7 @@ class hyperloglog_ref {
                                                 Hash const& hash)
     : hash_{hash},
       precision_{cuda::std::countr_zero(
-        sketch_bytes(cuco::sketch_size_kb(static_cast<double>(sketch_span.size() / 1024))) /
+        sketch_bytes(cuco::sketch_size_kb(static_cast<double>(sketch_span.size() / 1024.0))) /
         sizeof(register_type))},
       register_mask_{(1ull << this->precision_) - 1},
       sketch_{reinterpret_cast<register_type*>(sketch_span.data()),
@@ -152,6 +152,10 @@ class hyperloglog_ref {
     auto const reg    = h & this->register_mask_;
     auto const zeroes = cuda::std::countl_zero(h | this->register_mask_) + 1;  // __clz
 
+    // reversed order (same one as Spark uses)
+    // auto const reg    = h >> ((sizeof(hash_value_type) * 8) - this->precision_);
+    // auto const zeroes = cuda::std::countl_zero(h << this->precision_) + 1;
+
     this->update_max(reg, zeroes);
   }
 
@@ -207,19 +211,21 @@ class hyperloglog_ref {
     }
 
     if (kernel != nullptr and this->try_reserve_shmem(kernel, shmem_bytes)) {
-      // We make use of the occupancy calculator to get the minimum number of blocks which still
-      // saturates the GPU. This reduces the shmem initialization overhead and atomic contention on
-      // the final register array during the merge phase.
-      CUCO_CUDA_TRY(
-        cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, shmem_bytes));
-
-      auto const ptr      = thrust::raw_pointer_cast(&first[0]);
-      void* kernel_args[] = {
-        (void*)(&ptr),  // TODO can't use reinterpret_cast since it can't cast away const
-        (void*)(&num_items),
-        reinterpret_cast<void*>(this)};
-      CUCO_CUDA_TRY(
-        cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream));
+      if constexpr (thrust::is_contiguous_iterator_v<InputIt>) {
+        // We make use of the occupancy calculator to get the minimum number of blocks which still
+        // saturates the GPU. This reduces the shmem initialization overhead and atomic contention
+        // on the final register array during the merge phase.
+        CUCO_CUDA_TRY(
+          cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, shmem_bytes));
+
+        auto const ptr      = thrust::raw_pointer_cast(&first[0]);
+        void* kernel_args[] = {
+          (void*)(&ptr),  // TODO can't use reinterpret_cast since it can't cast away const
+          (void*)(&num_items),
+          reinterpret_cast<void*>(this)};
+        CUCO_CUDA_TRY(
+          cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream));
+      }
     } else {
       kernel = reinterpret_cast<void const*>(
         cuco::hyperloglog_ns::detail::add_shmem<InputIt, hyperloglog_ref>);

From 1ad97e27fc26d2e898d6d092957e3b1ffd922ec0 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 27 Mar 2024 15:19:29 +0000
Subject: [PATCH 70/78] Fix merge

---
 .../distinct_count_estimator/distinct_count_estimator.inl | 8 ++++----
 include/cuco/detail/hyperloglog/hyperloglog_ref.cuh       | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
index ed5d9792f..2d5ad3a47 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
@@ -72,7 +72,7 @@ constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::merge_async(
   distinct_count_estimator<T, OtherScope, Hash, OtherAllocator> const& other,
   cuco::cuda_stream_ref stream)
 {
-  this->impl_->merge_async(other, stream);
+  this->impl_->merge_async(*(other.impl_), stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
@@ -81,7 +81,7 @@ constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::merge(
   distinct_count_estimator<T, OtherScope, Hash, OtherAllocator> const& other,
   cuco::cuda_stream_ref stream)
 {
-  this->impl_->merge(other, stream);
+  this->impl_->merge(*(other.impl_), stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
@@ -89,7 +89,7 @@ template <cuda::thread_scope OtherScope>
 constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::merge_async(
   ref_type<OtherScope> const& other_ref, cuco::cuda_stream_ref stream)
 {
-  this->impl_->merge_async(other_ref, stream);
+  this->impl_->merge_async(other_ref.impl_, stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
@@ -97,7 +97,7 @@ template <cuda::thread_scope OtherScope>
 constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::merge(
   ref_type<OtherScope> const& other_ref, cuco::cuda_stream_ref stream)
 {
-  this->impl_->merge(other_ref, stream);
+  this->impl_->merge(other_ref.impl_, stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index 1abb6d38e..dfd2df499 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -308,7 +308,7 @@ class hyperloglog_ref {
   __host__ constexpr void merge_async(hyperloglog_ref<T, OtherScope, Hash> const& other,
                                       cuco::cuda_stream_ref stream)
   {
-    CUCO_EXPECTS(other.precision == this->precision_,
+    CUCO_EXPECTS(other.precision_ == this->precision_,
                  "Cannot merge estimators with different sketch sizes",
                  std::runtime_error);
     auto constexpr block_size = 1024;

From 3b0da20b25e4e95ba9e308b8145fd975fae189f7 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 27 Mar 2024 15:27:54 +0000
Subject: [PATCH 71/78] Add Spark parity tests

---
 tests/CMakeLists.txt                          |   1 +
 .../spark_parity_test.cu                      | 187 ++++++++++++++++++
 2 files changed, 188 insertions(+)
 create mode 100644 tests/distinct_count_estimator/spark_parity_test.cu

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 0acaae3c4..a37f2d4e2 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -120,4 +120,5 @@ ConfigureTest(DYNAMIC_BITSET_TEST
 # - distinct_count_estimator ----------------------------------------------------------------------
 ConfigureTest(DISTINCT_COUNT_ESTIMATOR_TEST
     distinct_count_estimator/unique_sequence_test.cu
+    distinct_count_estimator/spark_parity_test.cu
     distinct_count_estimator/device_ref_test.cu)
diff --git a/tests/distinct_count_estimator/spark_parity_test.cu b/tests/distinct_count_estimator/spark_parity_test.cu
new file mode 100644
index 000000000..4aaaa8a66
--- /dev/null
+++ b/tests/distinct_count_estimator/spark_parity_test.cu
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <utils.hpp>
+
+#include <cuco/distinct_count_estimator.cuh>
+#include <cuco/hash_functions.cuh>
+
+#include <cuda/functional>
+#include <thrust/device_vector.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/reverse_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/generators/catch_generators.hpp>
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>  // std::memcpy
+
+/**
+ * @file spark_parity_test.cu
+ * @brief Unit test to ensure parity with Spark's HLL implementation
+ *
+ * The following unit tests mimic Spark's unit tests which can be found here:
+ * https://github.com/apache/spark/blob/d10dbaa31a44878df5c7e144f111e18261346531/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/HyperLogLogPlusPlusSuite.scala
+ *
+ */
+
+// TODO implement this test once add_if is available
+// TEST_CASE("distinct_count_estimator: Spark parity: add nulls", "")
+
+TEST_CASE("distinct_count_estimator: Spark parity: deterministic cardinality estimation", "")
+{
+  using T = int;
+  using estimator_type =
+    cuco::distinct_count_estimator<T, cuda::thread_scope_device, cuco::xxhash_64<T>>;
+
+  constexpr size_t repeats = 10;
+  // This factor determines the error threshold for passing the test
+  constexpr double tolerance_factor = 3.0;
+  auto num_items          = GENERATE(100, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000);
+  auto standard_deviation = GENERATE(0.1, 0.05, 0.025, 0.01, 0.001);
+
+  auto expected_hll_precision = std::max(
+    static_cast<int32_t>(4),
+    static_cast<int32_t>(std::ceil(2.0 * std::log(1.106 / standard_deviation) / std::log(2.0))));
+  auto expected_sketch_bytes = 4 * (1ull << expected_hll_precision);
+
+  INFO("num_items=" << num_items);
+  INFO("standard_deviation=" << standard_deviation);
+  INFO("expected_hll_precision=" << expected_hll_precision);
+  INFO("expected_sketch_bytes=" << expected_sketch_bytes);
+
+  auto sd = cuco::standard_deviation(standard_deviation);
+  auto sb = cuco::sketch_size_kb(expected_sketch_bytes / 1024.0);
+
+  // Validate sketch size calculation
+  REQUIRE(estimator_type::sketch_bytes(sd) >= 64);
+  REQUIRE(estimator_type::sketch_bytes(sd) == expected_sketch_bytes);
+  REQUIRE(estimator_type::sketch_bytes(sd) == estimator_type::sketch_bytes(sb));
+
+  auto items_begin =
+    thrust::make_transform_iterator(thrust::make_counting_iterator<size_t>(0),
+                                    cuda::proclaim_return_type<T>([repeats] __device__(auto i) {
+                                      return static_cast<T>(i / repeats);
+                                    }));
+
+  estimator_type estimator{sd};
+
+  REQUIRE(estimator.estimate() == 0);
+
+  // Add all items to the estimator
+  estimator.add(items_begin, items_begin + num_items);
+
+  auto const estimate = estimator.estimate();
+
+  double const relative_error =
+    std::abs((static_cast<double>(estimate) / static_cast<double>(num_items / repeats)) - 1.0);
+  // RSD for a given precision is given by the following formula
+  double const expected_standard_deviation =
+    1.04 / std::sqrt(static_cast<double>(1ull << expected_hll_precision));
+
+  // Check if the error is acceptable
+  REQUIRE(relative_error < expected_standard_deviation * tolerance_factor);
+}
+
+// the following test is omitted since we refrain from doing randomized unit tests in cuco
+// TEST_CASE("distinct_count_estimator: Spark parity: random cardinality estimation", "")
+
+TEST_CASE("distinct_count_estimator: Spark parity: merging HLL instances", "")
+{
+  using T = int;
+  using estimator_type =
+    cuco::distinct_count_estimator<T, cuda::thread_scope_device, cuco::xxhash_64<T>>;
+
+  auto num_items          = 1000000;
+  auto standard_deviation = cuco::standard_deviation(0.05);
+
+  auto items_begin = thrust::make_counting_iterator<T>(0);
+
+  // count lower half of input, i.e., [0, num_items / 2)
+  estimator_type lower{standard_deviation};
+  lower.add(items_begin, items_begin + num_items / 2);
+
+  // count upper half of input, i.e., [num_items / 2, num_items)
+  estimator_type upper{standard_deviation};
+  upper.add(items_begin + num_items / 2, items_begin + num_items);
+
+  // merge upper into lower so lower has seen the entire input [0, num_items)
+  lower.merge(upper);
+
+  auto reversed_items_begin = thrust::make_transform_iterator(
+    items_begin, cuda::proclaim_return_type<T>([num_items] __device__(auto i) {
+      return static_cast<T>(num_items - 1 - i);
+    }));
+
+  // count the same input [0, num_items) but in reversed order: num_items - 1, ..., 1, 0
+  estimator_type entire{standard_deviation};
+  entire.add(reversed_items_begin, reversed_items_begin + num_items);
+
+  auto const entire_sketch = entire.sketch();
+  auto const lower_sketch  = lower.sketch();
+
+  // both estimators have seen the identical item set, so the sketches must be bitwise identical
+  REQUIRE(cuco::test::equal(entire_sketch.data(),
+                            entire_sketch.data() + entire_sketch.size(),
+                            lower_sketch.data(),
+                            thrust::equal_to{}));
+}
+
+/*
+The following unit tests fail since xxhash_64 does not deduplicate different bit patterns for NaN
+values and +-0.0. They are thus counted as distinct items.
+
+TEST_CASE("distinct_count_estimator: Spark parity: add 0.0 and -0.0", "")
+{
+  using T = double;
+  using estimator_type =
+    cuco::distinct_count_estimator<T, cuda::thread_scope_device, cuco::xxhash_64<T>>;
+
+  auto standard_deviation = cuco::standard_deviation(0.05);
+
+  auto items = thrust::device_vector<T>({0.0, -0.0});
+
+  estimator_type estimator{standard_deviation};
+  estimator.add(items.begin(), items.end());
+
+  REQUIRE(estimator.estimate() == 1);
+}
+
+TEST_CASE("distinct_count_estimator: Spark parity: add NaN", "")
+{
+  using T = double;
+  using estimator_type =
+    cuco::distinct_count_estimator<T, cuda::thread_scope_device, cuco::xxhash_64<T>>;
+
+  auto standard_deviation = cuco::standard_deviation(0.05);
+
+  // Define the special bit pattern for the NaN.
+  uint64_t nan_bits = 0x7ff1234512345678ULL;
+  double special_nan;
+  std::memcpy(&special_nan, &nan_bits, sizeof(special_nan));
+
+  auto items = thrust::device_vector<T>({0.0, special_nan});
+
+  estimator_type estimator{standard_deviation};
+  estimator.add(items.begin(), items.end());
+
+  REQUIRE(estimator.estimate() == 1);
+}
+*/

From f80509fa998af9f4ac3958e2461f0d46daf50aae Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 27 Mar 2024 15:31:54 +0000
Subject: [PATCH 72/78] Fix error calculation

---
 benchmarks/distinct_count_estimator_bench.cu           | 2 +-
 examples/distinct_count_estimator/host_bulk_example.cu | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu
index 37c5f4f00..eca28e4ed 100644
--- a/benchmarks/distinct_count_estimator_bench.cu
+++ b/benchmarks/distinct_count_estimator_bench.cu
@@ -67,7 +67,7 @@ template <class Estimator, class Dist>
     estimator.add(items.begin(), items.end());
     double estimated_cardinality = estimator.estimate();
     double true_cardinality      = exact_distinct_count(items.begin(), num_items);
-    error_sum += std::abs(true_cardinality - estimated_cardinality) / true_cardinality;
+    error_sum += std::abs(estimated_cardinality / true_cardinality - 1.0);
     estimator.clear();
   }
 
diff --git a/examples/distinct_count_estimator/host_bulk_example.cu b/examples/distinct_count_estimator/host_bulk_example.cu
index 56ee90a42..0cd535e8b 100644
--- a/examples/distinct_count_estimator/host_bulk_example.cu
+++ b/examples/distinct_count_estimator/host_bulk_example.cu
@@ -53,10 +53,9 @@ int main(void)
   std::size_t const estimated_cardinality = estimator.estimate();
 
   std::cout << "True cardinality: " << num_items
-            << "\nEstimated cardinality: " << estimated_cardinality << "\nRelative error: "
-            << std::abs(static_cast<double>(num_items) -
-                        static_cast<double>(estimated_cardinality)) /
-                 num_items
+            << "\nEstimated cardinality: " << estimated_cardinality << "\nError: "
+            << std::abs(
+                 static_cast<double>(estimated_cardinality) / static_cast<double>(num_items) - 1.0)
             << std::endl;
 
   return 0;

From b61a2db6967b586630131f603438cd7192d2197f Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 3 Apr 2024 00:25:16 +0000
Subject: [PATCH 73/78] Move include/cuco/sentinel.cuh ->
 include/cuco/types.cuh

---
 include/cuco/dynamic_map.cuh             |  2 +-
 include/cuco/static_map.cuh              |  2 +-
 include/cuco/static_map_ref.cuh          |  2 +-
 include/cuco/static_multimap.cuh         |  2 +-
 include/cuco/static_multiset.cuh         |  2 +-
 include/cuco/static_multiset_ref.cuh     |  2 +-
 include/cuco/static_set.cuh              |  2 +-
 include/cuco/static_set_ref.cuh          |  2 +-
 include/cuco/{sentinel.cuh => types.cuh} | 10 ++++++++++
 9 files changed, 18 insertions(+), 8 deletions(-)
 rename include/cuco/{sentinel.cuh => types.cuh} (70%)

diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index bf3c7c8a2..aedb81f1a 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -18,8 +18,8 @@
 
 #include <cuco/detail/dynamic_map_kernels.cuh>
 #include <cuco/hash_functions.cuh>
-#include <cuco/sentinel.cuh>
 #include <cuco/static_map.cuh>
+#include <cuco/types.cuh>
 
 #include <thrust/device_vector.h>
 #include <thrust/functional.h>
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
index 02434eb25..324c56ced 100644
--- a/include/cuco/static_map.cuh
+++ b/include/cuco/static_map.cuh
@@ -22,8 +22,8 @@
 #include <cuco/detail/static_map_kernels.cuh>
 #include <cuco/hash_functions.cuh>
 #include <cuco/pair.cuh>
-#include <cuco/sentinel.cuh>
 #include <cuco/static_map_ref.cuh>
+#include <cuco/types.cuh>
 #include <cuco/utility/allocator.hpp>
 #include <cuco/utility/cuda_thread_scope.cuh>
 #include <cuco/utility/traits.hpp>
diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh
index 953507a6a..80c60711d 100644
--- a/include/cuco/static_map_ref.cuh
+++ b/include/cuco/static_map_ref.cuh
@@ -20,8 +20,8 @@
 #include <cuco/hash_functions.cuh>
 #include <cuco/operator.hpp>
 #include <cuco/probing_scheme.cuh>
-#include <cuco/sentinel.cuh>
 #include <cuco/storage.cuh>
+#include <cuco/types.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
 #include <cuda/std/atomic>
diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh
index 1d6e67df0..abdb747d0 100644
--- a/include/cuco/static_multimap.cuh
+++ b/include/cuco/static_multimap.cuh
@@ -20,7 +20,7 @@
 #include <cuco/detail/prime.hpp>
 #include <cuco/hash_functions.cuh>
 #include <cuco/probe_sequences.cuh>
-#include <cuco/sentinel.cuh>
+#include <cuco/types.cuh>
 #include <cuco/utility/allocator.hpp>
 #include <cuco/utility/traits.hpp>
 
diff --git a/include/cuco/static_multiset.cuh b/include/cuco/static_multiset.cuh
index b4a684bcc..5d16857cf 100644
--- a/include/cuco/static_multiset.cuh
+++ b/include/cuco/static_multiset.cuh
@@ -21,9 +21,9 @@
 #include <cuco/extent.cuh>
 #include <cuco/hash_functions.cuh>
 #include <cuco/probing_scheme.cuh>
-#include <cuco/sentinel.cuh>
 #include <cuco/static_multiset_ref.cuh>
 #include <cuco/storage.cuh>
+#include <cuco/types.cuh>
 #include <cuco/utility/allocator.hpp>
 #include <cuco/utility/cuda_thread_scope.cuh>
 #include <cuco/utility/traits.hpp>
diff --git a/include/cuco/static_multiset_ref.cuh b/include/cuco/static_multiset_ref.cuh
index 975ca915b..a8b5bff62 100644
--- a/include/cuco/static_multiset_ref.cuh
+++ b/include/cuco/static_multiset_ref.cuh
@@ -20,8 +20,8 @@
 #include <cuco/hash_functions.cuh>
 #include <cuco/operator.hpp>
 #include <cuco/probing_scheme.cuh>
-#include <cuco/sentinel.cuh>
 #include <cuco/storage.cuh>
+#include <cuco/types.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
 #include <cuda/std/atomic>
diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh
index 7c819668e..2aaee75d6 100644
--- a/include/cuco/static_set.cuh
+++ b/include/cuco/static_set.cuh
@@ -21,9 +21,9 @@
 #include <cuco/extent.cuh>
 #include <cuco/hash_functions.cuh>
 #include <cuco/probing_scheme.cuh>
-#include <cuco/sentinel.cuh>
 #include <cuco/static_set_ref.cuh>
 #include <cuco/storage.cuh>
+#include <cuco/types.cuh>
 #include <cuco/utility/allocator.hpp>
 #include <cuco/utility/cuda_thread_scope.cuh>
 #include <cuco/utility/traits.hpp>
diff --git a/include/cuco/static_set_ref.cuh b/include/cuco/static_set_ref.cuh
index f2f661190..004b6b92f 100644
--- a/include/cuco/static_set_ref.cuh
+++ b/include/cuco/static_set_ref.cuh
@@ -20,8 +20,8 @@
 #include <cuco/hash_functions.cuh>
 #include <cuco/operator.hpp>
 #include <cuco/probing_scheme.cuh>
-#include <cuco/sentinel.cuh>
 #include <cuco/storage.cuh>
+#include <cuco/types.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
 #include <cuda/std/atomic>
diff --git a/include/cuco/sentinel.cuh b/include/cuco/types.cuh
similarity index 70%
rename from include/cuco/sentinel.cuh
rename to include/cuco/types.cuh
index b71d3e6c5..f0c579f85 100644
--- a/include/cuco/sentinel.cuh
+++ b/include/cuco/types.cuh
@@ -18,6 +18,16 @@
 
 #include <cuco/detail/utility/strong_type.cuh>
 
+/**
+ * @brief Defines various strong type wrappers used across this library.
+ *
+ * @note Each strong type inherits from `cuco::detail::strong_type<T>`. `CUCO_DEFINE_STRONG_TYPE`
+ * and `CUCO_DEFINE_TEMPLATE_STRONG_TYPE` are convenience macros used to define a named type in a
+ * single line, e.g., `CUCO_DEFINE_STRONG_TYPE(foo, double)` defines `struct foo : public
+ * cuco::detail::strong_type<double> {...};`, where `cuco::foo{42.0}` is implicitly convertible to
+ * `double{42.0}`.
+ */
+
 namespace cuco {
 /**
  * @brief A strong type wrapper `cuco::empty_key<Key>` used to denote the empty key sentinel.

From 9b0ee68e6eabc60015d3f7746e109b80989e7a50 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 3 Apr 2024 00:34:16 +0000
Subject: [PATCH 74/78] Move HLL-related strong types to types.cuh

---
 .../cuco/detail/hyperloglog/hyperloglog.cuh   |  3 +-
 .../detail/hyperloglog/hyperloglog_ref.cuh    |  3 +-
 include/cuco/distinct_count_estimator.cuh     |  3 +-
 include/cuco/distinct_count_estimator_ref.cuh |  2 +-
 include/cuco/sketch_size.cuh                  | 55 -------------------
 include/cuco/standard_deviation.cuh           | 45 ---------------
 include/cuco/types.cuh                        | 26 +++++++++
 7 files changed, 30 insertions(+), 107 deletions(-)
 delete mode 100644 include/cuco/sketch_size.cuh
 delete mode 100644 include/cuco/standard_deviation.cuh

diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index 13106cc08..011d2bee7 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -20,8 +20,7 @@
 #include <cuco/detail/hyperloglog/hyperloglog_ref.cuh>
 #include <cuco/detail/storage/storage_base.cuh>
 #include <cuco/hash_functions.cuh>
-#include <cuco/sketch_size.cuh>
-#include <cuco/standard_deviation.cuh>
+#include <cuco/types.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
 #include <cstddef>
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index dfd2df499..fc10e32b7 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -22,8 +22,7 @@
 #include <cuco/detail/hyperloglog/kernels.cuh>
 #include <cuco/detail/utils.hpp>
 #include <cuco/hash_functions.cuh>
-#include <cuco/sketch_size.cuh>
-#include <cuco/standard_deviation.cuh>
+#include <cuco/types.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
 #include <cuco/utility/traits.hpp>
 
diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh
index a43590a43..0eecca954 100644
--- a/include/cuco/distinct_count_estimator.cuh
+++ b/include/cuco/distinct_count_estimator.cuh
@@ -19,8 +19,7 @@
 #include <cuco/detail/hyperloglog/hyperloglog.cuh>
 #include <cuco/distinct_count_estimator_ref.cuh>
 #include <cuco/hash_functions.cuh>
-#include <cuco/sketch_size.cuh>
-#include <cuco/standard_deviation.cuh>
+#include <cuco/types.cuh>
 #include <cuco/utility/allocator.hpp>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh
index d2eb6ab58..44374c6b5 100644
--- a/include/cuco/distinct_count_estimator_ref.cuh
+++ b/include/cuco/distinct_count_estimator_ref.cuh
@@ -18,7 +18,7 @@
 #include <cuco/cuda_stream_ref.hpp>
 #include <cuco/detail/hyperloglog/hyperloglog_ref.cuh>
 #include <cuco/hash_functions.cuh>
-#include <cuco/sketch_size.cuh>
+#include <cuco/types.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
 #include <cooperative_groups.h>
diff --git a/include/cuco/sketch_size.cuh b/include/cuco/sketch_size.cuh
deleted file mode 100644
index f9dce1aed..000000000
--- a/include/cuco/sketch_size.cuh
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-namespace cuco {
-
-/**
- * @brief Strng type for specifying the sketch size of cuco::distinct_count_estimator(_ref) in KB.
- *
- * Values can also be given as literals, e.g., 64.3_KB
- */
-class sketch_size_kb {
- public:
-  /**
-   * @brief Constructs a sketch_size_kb object.
-   *
-   * @param value The size of a sketch given in KB
-   */
-  __host__ __device__ explicit constexpr sketch_size_kb(double value) noexcept : value_{value} {}
-
-  /**
-   * @brief Conversion to value type.
-   *
-   * @return Sketch size in KB
-   */
-  __host__ __device__ constexpr operator double() const noexcept { return this->value_; }
-
- private:
-  double value_;  ///< Sketch size in KB
-};
-}  // namespace cuco
-
-// User-defined literal operators for sketch_size_KB
-__host__ __device__ constexpr cuco::sketch_size_kb operator""_KB(long double value)
-{
-  return cuco::sketch_size_kb{static_cast<double>(value)};
-}
-
-__host__ __device__ constexpr cuco::sketch_size_kb operator""_KB(unsigned long long int value)
-{
-  return cuco::sketch_size_kb{static_cast<double>(value)};
-}
\ No newline at end of file
diff --git a/include/cuco/standard_deviation.cuh b/include/cuco/standard_deviation.cuh
deleted file mode 100644
index 9486784b9..000000000
--- a/include/cuco/standard_deviation.cuh
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-namespace cuco {
-
-/**
- * @brief Strong type for specifying the desired standard deviation of
- * cuco::distinct_count_estimator(_ref).
- */
-class standard_deviation {
- public:
-  /**
-   * @brief Constructs a standard_deviation object.
-   *
-   * @param value The desired standard deviation
-   */
-  __host__ __device__ explicit constexpr standard_deviation(double value) noexcept : value_{value}
-  {
-  }
-
-  /**
-   * @brief Conversion to value type.
-   *
-   * @return Standard deviation
-   */
-  __host__ __device__ constexpr operator double() const noexcept { return this->value_; }
-
- private:
-  double value_;  ///< Sketch size in KB
-};
-}  // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/types.cuh b/include/cuco/types.cuh
index f0c579f85..eddc289df 100644
--- a/include/cuco/types.cuh
+++ b/include/cuco/types.cuh
@@ -43,4 +43,30 @@ CUCO_DEFINE_TEMPLATE_STRONG_TYPE(empty_value);
  * @brief A strong type wrapper `cuco::erased_key<Key>` used to denote the erased key sentinel.
  */
 CUCO_DEFINE_TEMPLATE_STRONG_TYPE(erased_key);
+
+/**
+ * @brief A strong type wrapper `cuco::sketch_size_kb` for specifying the upper-bound sketch size of
+ * `cuco::distinct_count_estimator(_ref)` in KB.
+ *
+ * @note Values can also be specified as literals, e.g., 64.3_KB.
+ */
+CUCO_DEFINE_STRONG_TYPE(sketch_size_kb, double);
+
+/**
+ * @brief A strong type wrapper `cuco::standard_deviation` for specifying the desired standard
+ * deviation for the cardinality estimate of `cuco::distinct_count_estimator(_ref)`.
+ */
+CUCO_DEFINE_STRONG_TYPE(standard_deviation, double);
+
 }  // namespace cuco
+
+// User-defined literal operators for `cuco::sketch_size_kb`
+__host__ __device__ constexpr cuco::sketch_size_kb operator""_KB(long double value)
+{
+  return cuco::sketch_size_kb{static_cast<double>(value)};
+}
+
+__host__ __device__ constexpr cuco::sketch_size_kb operator""_KB(unsigned long long int value)
+{
+  return cuco::sketch_size_kb{static_cast<double>(value)};
+}

From 75cd96789023c82d64d1c5b1bd9b02c18b735de8 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 3 Apr 2024 00:59:03 +0000
Subject: [PATCH 75/78] Apparently Doxygen has become even pickier...

---
 include/cuco/operator.hpp                  | 12 ++++++------
 include/cuco/utility/allocator.hpp         | 19 +++++++++++++++++++
 include/cuco/utility/cuda_thread_scope.cuh | 12 ++++++++----
 include/cuco/utility/traits.hpp            |  9 ++++++---
 4 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp
index 8199e23c6..418148eb5 100644
--- a/include/cuco/operator.hpp
+++ b/include/cuco/operator.hpp
@@ -24,37 +24,37 @@ inline namespace op {
  * @brief `insert` operator tag
  */
 struct insert_tag {
-} inline constexpr insert;
+} inline constexpr insert;  ///< `cuco::insert` operator
 
 /**
  * @brief `insert_and_find` operator tag
  */
 struct insert_and_find_tag {
-} inline constexpr insert_and_find;
+} inline constexpr insert_and_find;  ///< `cuco::insert_and_find` operator
 
 /**
  * @brief `insert_or_assign` operator tag
  */
 struct insert_or_assign_tag {
-} inline constexpr insert_or_assign;
+} inline constexpr insert_or_assign;  ///< `cuco::insert_or_assign` operator
 
 /**
  * @brief `erase` operator tag
  */
 struct erase_tag {
-} inline constexpr erase;
+} inline constexpr erase;  ///< `cuco::erase` operator
 
 /**
  * @brief `contains` operator tag
  */
 struct contains_tag {
-} inline constexpr contains;
+} inline constexpr contains;  ///< `cuco::contains` operator
 
 /**
  * @brief `find` operator tag
  */
 struct find_tag {
-} inline constexpr find;
+} inline constexpr find;  ///< `cuco::find` operator
 
 }  // namespace op
 }  // namespace cuco
diff --git a/include/cuco/utility/allocator.hpp b/include/cuco/utility/allocator.hpp
index 583571620..060ae0dd8 100644
--- a/include/cuco/utility/allocator.hpp
+++ b/include/cuco/utility/allocator.hpp
@@ -60,12 +60,31 @@ class cuda_allocator {
   void deallocate(value_type* p, std::size_t) { CUCO_CUDA_TRY(cudaFree(p)); }
 };
 
+/**
+ * @brief Equality comparison operator.
+ *
+ * @tparam T Value type of LHS object
+ * @tparam U Value type of RHS object
+ *
+ * @return `true` iff given arguments are equal
+ */
 template <typename T, typename U>
 bool operator==(cuda_allocator<T> const&, cuda_allocator<U> const&) noexcept
 {
   return true;
 }
 
+/**
+ * @brief Inequality comparison operator.
+ *
+ * @tparam T Value type of LHS object
+ * @tparam U Value type of RHS object
+ *
+ * @param lhs Left-hand side object to compare
+ * @param rhs Right-hand side object to compare
+ *
+ * @return `true` iff given arguments are not equal
+ */
 template <typename T, typename U>
 bool operator!=(cuda_allocator<T> const& lhs, cuda_allocator<U> const& rhs) noexcept
 {
diff --git a/include/cuco/utility/cuda_thread_scope.cuh b/include/cuco/utility/cuda_thread_scope.cuh
index 4e2242487..906605a14 100644
--- a/include/cuco/utility/cuda_thread_scope.cuh
+++ b/include/cuco/utility/cuda_thread_scope.cuh
@@ -36,9 +36,13 @@ struct cuda_thread_scope {
 };
 
 // alias definitions
-inline constexpr auto thread_scope_system = cuda_thread_scope<cuda::thread_scope_system>{};
-inline constexpr auto thread_scope_device = cuda_thread_scope<cuda::thread_scope_device>{};
-inline constexpr auto thread_scope_block  = cuda_thread_scope<cuda::thread_scope_block>{};
-inline constexpr auto thread_scope_thread = cuda_thread_scope<cuda::thread_scope_thread>{};
+inline constexpr auto thread_scope_system =
+  cuda_thread_scope<cuda::thread_scope_system>{};  ///< `cuco::thread_scope_system`
+inline constexpr auto thread_scope_device =
+  cuda_thread_scope<cuda::thread_scope_device>{};  ///< `cuco::thread_scope_device`
+inline constexpr auto thread_scope_block =
+  cuda_thread_scope<cuda::thread_scope_block>{};  ///< `cuco::thread_scope_block`
+inline constexpr auto thread_scope_thread =
+  cuda_thread_scope<cuda::thread_scope_thread>{};  ///< `cuco::thread_scope_thread`
 
 }  // namespace cuco
diff --git a/include/cuco/utility/traits.hpp b/include/cuco/utility/traits.hpp
index dcbfe432a..1325b3a52 100644
--- a/include/cuco/utility/traits.hpp
+++ b/include/cuco/utility/traits.hpp
@@ -46,7 +46,8 @@ struct is_bitwise_comparable<T, std::enable_if_t<std::has_unique_object_represen
   : std::true_type {};
 
 template <typename T>
-inline constexpr bool is_bitwise_comparable_v = is_bitwise_comparable<T>::value;
+inline constexpr bool is_bitwise_comparable_v =
+  is_bitwise_comparable<T>::value;  ///< Shortcut definition
 
 /**
  * @brief Declares that a type `Type` is bitwise comparable.
@@ -59,9 +60,11 @@ inline constexpr bool is_bitwise_comparable_v = is_bitwise_comparable<T>::value;
   }
 
 template <bool value, typename... Args>
-inline constexpr bool dependent_bool_value = value;
+inline constexpr bool dependent_bool_value = value;  ///< Unpacked dependent bool value
 
 template <typename... Args>
-inline constexpr bool dependent_false = dependent_bool_value<false, Args...>;
+inline constexpr bool dependent_false =
+  dependent_bool_value<false, Args...>;  ///< Emits a `false` value which is dependent on the given
+                                         ///< argument types
 
 }  // namespace cuco

From 6929f65ced3b51475e37213ad9dcfb51cd53ce59 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 3 Apr 2024 16:30:28 +0000
Subject: [PATCH 76/78] Clean up device ref example

---
 examples/distinct_count_estimator/device_ref_example.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/distinct_count_estimator/device_ref_example.cu b/examples/distinct_count_estimator/device_ref_example.cu
index 92c5169d9..ab4d1929f 100644
--- a/examples/distinct_count_estimator/device_ref_example.cu
+++ b/examples/distinct_count_estimator/device_ref_example.cu
@@ -31,7 +31,7 @@
  */
 
 template <class RefType, class InputIt>
-__global__ void piggyback_kernel(RefType ref, InputIt first, std::size_t n)
+__global__ void fused_kernel(RefType ref, InputIt first, std::size_t n)
 {
   // Transform the reference type (with device scope) to a reference type with block scope
   using local_ref_type = typename RefType::with_scope<cuda::thread_scope_block>;
@@ -67,7 +67,7 @@ __global__ void piggyback_kernel(RefType ref, InputIt first, std::size_t n)
     /*
     Here we can add some custom workload that takes the input `item`.
 
-    The idea is that cardinality estimation can be fused/piggy-backed with any other workload that
+    The idea is that cardinality estimation can be fused with any other workload that
     traverses the data. Since `local_ref.add` can run close to the SOL of the DRAM bandwidth, we get
     the estimate "for free" while performing other computations over the data.
     */
@@ -144,7 +144,7 @@ int main(void)
   auto const sketch_bytes = estimator.sketch_bytes();
 
   // Call the custom kernel and pass a non-owning reference to the estimator to the GPU
-  piggyback_kernel<<<10, 512, sketch_bytes>>>(estimator.ref(), items.begin(), num_items);
+  fused_kernel<<<10, 512, sketch_bytes>>>(estimator.ref(), items.begin(), num_items);
 
   // Calculate the cardinality estimate from the custom kernel
   std::size_t const estimated_cardinality_custom = estimator.estimate();

From a496f9ee781d5521f915800d4626db55f534d2b1 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 3 Apr 2024 16:33:16 +0000
Subject: [PATCH 77/78] Update godbolt links

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index e15558bc2..b31df5bac 100644
--- a/README.md
+++ b/README.md
@@ -239,7 +239,7 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection
 `cuco::distinct_count_estimator` implements the well-established [HyperLogLog++ algorithm](https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf) for approximating the count of distinct items in a multiset/stream.
 
 #### Examples:
-- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/EG7cMssxo))
-- [Device-ref APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/va8eE9dqb))
+- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/ahjEoWM1E))
+- [Device-ref APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/qebYY8Goj))
 
 

From 7fecd7b71a7f547561c85a10ba039734aa09381d Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 3 Apr 2024 16:36:02 +0000
Subject: [PATCH 78/78] Clean up unique sequence unit test

---
 tests/distinct_count_estimator/unique_sequence_test.cu | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/distinct_count_estimator/unique_sequence_test.cu b/tests/distinct_count_estimator/unique_sequence_test.cu
index 7d6321de6..3c1558b30 100644
--- a/tests/distinct_count_estimator/unique_sequence_test.cu
+++ b/tests/distinct_count_estimator/unique_sequence_test.cu
@@ -45,7 +45,6 @@ TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence",
   auto num_items = 1ull << num_items_pow2;
 
   // This factor determines the error threshold for passing the test
-  // TODO might be too high
   double constexpr tolerance_factor = 2.5;
   // RSD for a given precision is given by the following formula
   double const relative_standard_deviation =
@@ -76,7 +75,7 @@ TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence",
   REQUIRE(estimator.estimate() == 0);
 
   double const relative_error =
-    std::abs(static_cast<double>(num_items) - static_cast<double>(estimate)) / num_items;
+    std::abs((static_cast<double>(estimate) / static_cast<double>(num_items)) - 1.0);
 
   // Check if the error is acceptable
   REQUIRE(relative_error < tolerance_factor * relative_standard_deviation);