From 9dc43a4d52a6835cbed9e706c14e00aedcfcc3a6 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Mon, 6 Oct 2025 17:26:33 +0200 Subject: [PATCH 1/5] Try safe-guarding offsets_builder.set with mutex --- include/builder/build_sparse_index.hpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/builder/build_sparse_index.hpp b/include/builder/build_sparse_index.hpp index fe9ccdb..17beff9 100644 --- a/include/builder/build_sparse_index.hpp +++ b/include/builder/build_sparse_index.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include "include/buckets_statistics.hpp" namespace sshash { @@ -86,6 +87,11 @@ buckets_statistics build_sparse_index(parse_data& data, buckets& std::vector threads_buckets_stats; threads_buckets_stats.reserve(num_threads); + // Mutex to protect concurrent writes to offsets_builder + // Multiple threads can write to the same bucket positions, and compact_vector + // packs values into 64-bit words, so nearby writes can race on the same word + std::mutex offsets_mutex; + auto exe = [&](const uint64_t thread_id) { assert(thread_id + 1 < offsets.size()); const uint64_t offset_begin = offsets[thread_id]; @@ -106,6 +112,9 @@ buckets_statistics build_sparse_index(parse_data& data, buckets& uint64_t prev_pos_in_seq = constants::invalid_uint64; for (auto mt : bucket) { if (mt.pos_in_seq != prev_pos_in_seq) { + // Protect compact_vector writes - multiple threads can write to + // positions that share the same 64-bit word + std::lock_guard lock(offsets_mutex); offsets_builder.set(begin + pos++, mt.pos_in_seq); prev_pos_in_seq = mt.pos_in_seq; } From a01f0f8bc1fc0554ec76600f9614faa6216e19ae Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Mon, 6 Oct 2025 18:57:33 +0200 Subject: [PATCH 2/5] fix kmer_t processing for uint_kmer_bits > 64 --- include/builder/parse_file.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/builder/parse_file.hpp b/include/builder/parse_file.hpp index f269788..ef13e01 100644 --- a/include/builder/parse_file.hpp +++ b/include/builder/parse_file.hpp @@ -173,7 +173,10 @@ void parse_file(std::istream& is, parse_data& data, /* Push a final sentinel (dummy) value to avoid bounds' checking in kmer_iterator::fill_buff(). */ - bvb_strings.append_bits(0, kmer_t::uint_kmer_bits); + for (int i = 0; i < kmer_t::uint_kmer_bits / 64; i++) { + bvb_strings.append_bits(0, 64); + } + bvb_strings.append_bits(0, kmer_t::uint_kmer_bits % 64); bvb_strings.build(data.strings); From 6578dddf0a72617f5fc01abb1a929df5de3d7ab7 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Tue, 7 Oct 2025 18:17:23 +0200 Subject: [PATCH 3/5] Single-threaded build_sparse_index --- include/builder/build_sparse_index.hpp | 90 ++++++-------------------- 1 file changed, 18 insertions(+), 72 deletions(-) diff --git a/include/builder/build_sparse_index.hpp b/include/builder/build_sparse_index.hpp index 17beff9..32f32f6 100644 --- a/include/builder/build_sparse_index.hpp +++ b/include/builder/build_sparse_index.hpp @@ -29,17 +29,12 @@ struct bucket_size_iterator { template buckets_statistics build_sparse_index(parse_data& data, buckets& m_buckets, - build_configuration const& build_config) // + build_configuration const& /*build_config*/) // { const uint64_t num_kmers = data.num_kmers; const uint64_t num_minimizer_positions = data.minimizers.num_minimizer_positions(); - const uint64_t num_super_kmers = data.minimizers.num_super_kmers(); const uint64_t num_buckets = data.minimizers.num_minimizers(); - const uint64_t num_threads = build_config.num_threads; - bits::compact_vector::builder offsets_builder; - offsets_builder.resize(num_minimizer_positions, - std::ceil(std::log2(data.strings.num_bits() / kmer_t::bits_per_char))); std::cout << "bits_per_offset = ceil(log2(" << data.strings.num_bits() / kmer_t::bits_per_char << ")) = " << std::ceil(std::log2(data.strings.num_bits() / kmer_t::bits_per_char)) @@ -67,74 +62,25 @@ buckets_statistics build_sparse_index(parse_data& data, buckets& buckets_statistics buckets_stats(num_buckets, num_kmers, num_minimizer_positions); timer.start(); - const uint64_t block_size = (num_super_kmers + num_threads - 1) / num_threads; - std::vector offsets; - offsets.reserve(num_threads + 1); - for (uint64_t offset = -1; offset != num_super_kmers;) { - offsets.push_back(offset + 1); - offset = std::min((offset + 1) + block_size, num_super_kmers); - minimizer_tuple const* b = begin + offset; - uint64_t curr_minimizer = (*b).minimizer; - while (b + 1 < end) { // adjust offset - uint64_t next_minimizer = (*(b + 1)).minimizer; - if (curr_minimizer != next_minimizer) break; - b += 1; - offset += 1; + + bits::compact_vector::builder offsets_builder; + offsets_builder.resize(num_minimizer_positions, + std::ceil(std::log2(data.strings.num_bits() / kmer_t::bits_per_char))); + uint64_t prev_minimizer = constants::invalid_uint64, prev_pos_in_seq = constants::invalid_uint64, bucket_size = 0; + for (auto mt : input) { + if (mt.minimizer != prev_minimizer) { + auto [bucket_begin, bucket_end] = m_buckets.locate_bucket(mt.minimizer); + bucket_size = bucket_end - bucket_begin; + buckets_stats.add_bucket_size(bucket_size); + prev_minimizer = mt.minimizer; + prev_pos_in_seq = constants::invalid_uint64; } - } - offsets.push_back(num_super_kmers); - - std::vector threads_buckets_stats; - threads_buckets_stats.reserve(num_threads); - - // Mutex to protect concurrent writes to offsets_builder - // Multiple threads can write to the same bucket positions, and compact_vector - // packs values into 64-bit words, so nearby writes can race on the same word - std::mutex offsets_mutex; - - auto exe = [&](const uint64_t thread_id) { - assert(thread_id + 1 < offsets.size()); - const uint64_t offset_begin = offsets[thread_id]; - const uint64_t offset_end = offsets[thread_id + 1]; - auto& tbs = threads_buckets_stats[thread_id]; - for (minimizers_tuples_iterator it(begin + offset_begin, begin + offset_end); // - it.has_next(); // - it.next()) // - { - const uint64_t bucket_id = it.minimizer(); - const auto [begin, end] = m_buckets.locate_bucket(bucket_id); - assert(end > begin); - const uint64_t bucket_size = end - begin; - assert(bucket_size == it.bucket().size()); - tbs.add_bucket_size(bucket_size); - uint64_t pos = 0; - auto bucket = it.bucket(); - uint64_t prev_pos_in_seq = constants::invalid_uint64; - for (auto mt : bucket) { - if (mt.pos_in_seq != prev_pos_in_seq) { - // Protect compact_vector writes - multiple threads can write to - // positions that share the same 64-bit word - std::lock_guard lock(offsets_mutex); - offsets_builder.set(begin + pos++, mt.pos_in_seq); - prev_pos_in_seq = mt.pos_in_seq; - } - tbs.add_num_kmers_in_super_kmer(bucket_size, mt.num_kmers_in_super_kmer); - } - assert(pos == bucket_size); + buckets_stats.add_num_kmers_in_super_kmer(bucket_size, mt.num_kmers_in_super_kmer); + if (mt.pos_in_seq != prev_pos_in_seq) { + offsets_builder.push_back(mt.pos_in_seq); + prev_pos_in_seq = mt.pos_in_seq; } - }; - - std::vector threads; - threads.reserve(num_threads); - assert(offsets.size() <= num_threads + 1); - for (uint64_t thread_id = 0; thread_id + 1 < size(offsets); ++thread_id) { - threads_buckets_stats.emplace_back(num_buckets, num_kmers, num_minimizer_positions); - threads.emplace_back(exe, thread_id); - } - for (auto& t : threads) { - if (t.joinable()) t.join(); } - for (auto const& tbs : threads_buckets_stats) buckets_stats += tbs; input.close(); timer.stop(); @@ -154,4 +100,4 @@ buckets_statistics build_sparse_index(parse_data& data, buckets& return buckets_stats; } -} // namespace sshash \ No newline at end of file +} // namespace sshash From b47bec1ebc8f1794f6a6a9e89dde549a2aef8af7 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Tue, 7 Oct 2025 20:40:15 +0200 Subject: [PATCH 4/5] fixes per review --- include/builder/build_sparse_index.hpp | 2 -- include/builder/parse_file.hpp | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/builder/build_sparse_index.hpp b/include/builder/build_sparse_index.hpp index 32f32f6..302f50f 100644 --- a/include/builder/build_sparse_index.hpp +++ b/include/builder/build_sparse_index.hpp @@ -1,6 +1,4 @@ #pragma once - -#include #include "include/buckets_statistics.hpp" namespace sshash { diff --git a/include/builder/parse_file.hpp b/include/builder/parse_file.hpp index ef13e01..94114a5 100644 --- a/include/builder/parse_file.hpp +++ b/include/builder/parse_file.hpp @@ -173,10 +173,10 @@ void parse_file(std::istream& is, parse_data& data, /* Push a final sentinel (dummy) value to avoid bounds' checking in kmer_iterator::fill_buff(). */ - for (int i = 0; i < kmer_t::uint_kmer_bits / 64; i++) { + static_assert(kmer_t::uint_kmer_bits % 64 == 0); + for (int dummy_bits = kmer_t::uint_kmer_bits; dummy_bits; dummy_bits -= 64) { bvb_strings.append_bits(0, 64); } - bvb_strings.append_bits(0, kmer_t::uint_kmer_bits % 64); bvb_strings.build(data.strings); From 96cecad837dad32b1386e682502c2679fb083ade Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Tue, 7 Oct 2025 20:42:53 +0200 Subject: [PATCH 5/5] return newline after pragma once --- include/builder/build_sparse_index.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/builder/build_sparse_index.hpp b/include/builder/build_sparse_index.hpp index 302f50f..dd3530a 100644 --- a/include/builder/build_sparse_index.hpp +++ b/include/builder/build_sparse_index.hpp @@ -1,4 +1,5 @@ #pragma once + #include "include/buckets_statistics.hpp" namespace sshash {