Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 18 additions & 64 deletions include/builder/build_sparse_index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,17 +28,12 @@ struct bucket_size_iterator {

template <class kmer_t>
buckets_statistics build_sparse_index(parse_data<kmer_t>& data, buckets<kmer_t>& m_buckets,
build_configuration const& build_config) //
build_configuration const& /*build_config*/) //
{
const uint64_t num_kmers = data.num_kmers;
const uint64_t num_minimizer_positions = data.minimizers.num_minimizer_positions();
const uint64_t num_super_kmers = data.minimizers.num_super_kmers();
const uint64_t num_buckets = data.minimizers.num_minimizers();
const uint64_t num_threads = build_config.num_threads;

bits::compact_vector::builder offsets_builder;
offsets_builder.resize(num_minimizer_positions,
std::ceil(std::log2(data.strings.num_bits() / kmer_t::bits_per_char)));

std::cout << "bits_per_offset = ceil(log2(" << data.strings.num_bits() / kmer_t::bits_per_char
<< ")) = " << std::ceil(std::log2(data.strings.num_bits() / kmer_t::bits_per_char))
Expand Down Expand Up @@ -66,66 +61,25 @@ buckets_statistics build_sparse_index(parse_data<kmer_t>& data, buckets<kmer_t>&
buckets_statistics buckets_stats(num_buckets, num_kmers, num_minimizer_positions);

timer.start();
const uint64_t block_size = (num_super_kmers + num_threads - 1) / num_threads;
std::vector<uint64_t> offsets;
offsets.reserve(num_threads + 1);
for (uint64_t offset = -1; offset != num_super_kmers;) {
offsets.push_back(offset + 1);
offset = std::min<uint64_t>((offset + 1) + block_size, num_super_kmers);
minimizer_tuple const* b = begin + offset;
uint64_t curr_minimizer = (*b).minimizer;
while (b + 1 < end) { // adjust offset
uint64_t next_minimizer = (*(b + 1)).minimizer;
if (curr_minimizer != next_minimizer) break;
b += 1;
offset += 1;

bits::compact_vector::builder offsets_builder;
offsets_builder.resize(num_minimizer_positions,
std::ceil(std::log2(data.strings.num_bits() / kmer_t::bits_per_char)));
uint64_t prev_minimizer = constants::invalid_uint64, prev_pos_in_seq = constants::invalid_uint64, bucket_size = 0;
for (auto mt : input) {
if (mt.minimizer != prev_minimizer) {
auto [bucket_begin, bucket_end] = m_buckets.locate_bucket(mt.minimizer);
bucket_size = bucket_end - bucket_begin;
buckets_stats.add_bucket_size(bucket_size);
prev_minimizer = mt.minimizer;
prev_pos_in_seq = constants::invalid_uint64;
}
}
offsets.push_back(num_super_kmers);

std::vector<buckets_statistics> threads_buckets_stats;
threads_buckets_stats.reserve(num_threads);

auto exe = [&](const uint64_t thread_id) {
assert(thread_id + 1 < offsets.size());
const uint64_t offset_begin = offsets[thread_id];
const uint64_t offset_end = offsets[thread_id + 1];
auto& tbs = threads_buckets_stats[thread_id];
for (minimizers_tuples_iterator it(begin + offset_begin, begin + offset_end); //
it.has_next(); //
it.next()) //
{
const uint64_t bucket_id = it.minimizer();
const auto [begin, end] = m_buckets.locate_bucket(bucket_id);
assert(end > begin);
const uint64_t bucket_size = end - begin;
assert(bucket_size == it.bucket().size());
tbs.add_bucket_size(bucket_size);
uint64_t pos = 0;
auto bucket = it.bucket();
uint64_t prev_pos_in_seq = constants::invalid_uint64;
for (auto mt : bucket) {
if (mt.pos_in_seq != prev_pos_in_seq) {
offsets_builder.set(begin + pos++, mt.pos_in_seq);
prev_pos_in_seq = mt.pos_in_seq;
}
tbs.add_num_kmers_in_super_kmer(bucket_size, mt.num_kmers_in_super_kmer);
}
assert(pos == bucket_size);
buckets_stats.add_num_kmers_in_super_kmer(bucket_size, mt.num_kmers_in_super_kmer);
if (mt.pos_in_seq != prev_pos_in_seq) {
offsets_builder.push_back(mt.pos_in_seq);
prev_pos_in_seq = mt.pos_in_seq;
}
};

std::vector<std::thread> threads;
threads.reserve(num_threads);
assert(offsets.size() <= num_threads + 1);
for (uint64_t thread_id = 0; thread_id + 1 < size(offsets); ++thread_id) {
threads_buckets_stats.emplace_back(num_buckets, num_kmers, num_minimizer_positions);
threads.emplace_back(exe, thread_id);
}
for (auto& t : threads) {
if (t.joinable()) t.join();
}
for (auto const& tbs : threads_buckets_stats) buckets_stats += tbs;

input.close();
timer.stop();
Expand All @@ -145,4 +99,4 @@ buckets_statistics build_sparse_index(parse_data<kmer_t>& data, buckets<kmer_t>&
return buckets_stats;
}

} // namespace sshash
} // namespace sshash
5 changes: 4 additions & 1 deletion include/builder/parse_file.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,10 @@ void parse_file(std::istream& is, parse_data<kmer_t>& data,

/* Push a final sentinel (dummy) value to avoid bounds' checking in
kmer_iterator::fill_buff(). */
bvb_strings.append_bits(0, kmer_t::uint_kmer_bits);
static_assert(kmer_t::uint_kmer_bits % 64 == 0);
for (int dummy_bits = kmer_t::uint_kmer_bits; dummy_bits; dummy_bits -= 64) {
bvb_strings.append_bits(0, 64);
}

bvb_strings.build(data.strings);

Expand Down
Loading