From da322e1c4a37a8e7584ca983ca2e8b88a49880b4 Mon Sep 17 00:00:00 2001 From: Dan Stahlke Date: Fri, 28 Nov 2025 16:42:57 -0800 Subject: [PATCH] EXPERIMENT: expression templates --- src/bitscan/bbexpr.h | 316 ++++++++++++++++++++++++++++++ src/bitscan/bbset.cpp | 43 ---- src/bitscan/bbset.h | 66 +++---- src/bitscan/tests/CMakeLists.txt | 1 + src/bitscan/tests/test_bbexpr.cpp | 276 ++++++++++++++++++++++++++ 5 files changed, 616 insertions(+), 86 deletions(-) create mode 100644 src/bitscan/bbexpr.h create mode 100644 src/bitscan/tests/test_bbexpr.cpp diff --git a/src/bitscan/bbexpr.h b/src/bitscan/bbexpr.h new file mode 100644 index 0000000..93aac55 --- /dev/null +++ b/src/bitscan/bbexpr.h @@ -0,0 +1,316 @@ +#pragma once + +#include "bbtypes.h" +#include "bbconfig.h" +#include "bitblock.h" + +#include +#include +#include +#include + +namespace bitgraph { + +////////////////////////////////////////////////////////////////////// +/// Forward declarations +////////////////////////////////////////////////////////////////////// + +struct BBIterBeginTag { }; +struct BBIterEndTag { }; + +template +class BBIter; + +////////////////////////////////////////////////////////////////////// +/// BBExpr +////////////////////////////////////////////////////////////////////// + +template +struct BBExpr { + E const &bbexpr_cast() const { return static_cast(*this); } + E &bbexpr_cast() { return static_cast(*this); } + + int capacity() const { return bbexpr_cast().bbexpr_num_blocks()*64; } + + BBIter begin() const { + return BBIter{*this, BBIterBeginTag{}}; + } + + BBIter end() const { + return BBIter{*this, BBIterEndTag{}}; + } +}; + +////////////////////////////////////////////////////////////////////// +/// BBExprMutable +////////////////////////////////////////////////////////////////////// + +template +struct BBExprMutable : public BBExpr { + E const &bbexpr_cast() const { return static_cast(*this); } + E &bbexpr_cast() { return static_cast(*this); } + + E &erase_bit() { + E &self = bbexpr_cast(); + for (int i=0; i= 0 && i < capacity()); + self.bbexpr_get_block(WDIV(i)) &= ~(BITBOARD{1} << WMOD(i)); + return self; + } + + E &set_bit(int firstBit, int lastBit) { + E &self = bbexpr_cast(); + assert(firstBit >= 0 && firstBit <= lastBit); + assert(lastBit < capacity()); + + int bbl = WDIV(firstBit); + int bbh = WDIV(lastBit); + + if (bbl == bbh) + { + self.bbexpr_get_block(bbh) |= bblock::MASK_1(firstBit - WMUL(bbl), lastBit - WMUL(bbh)); + } + else + { + //set to one the intermediate blocks + for (int i = bbl + 1; i < bbh; ++i) { + self.bbexpr_get_block(i) = ONE; + } + + //sets the first and last blocks + self.bbexpr_get_block(bbh) |= bblock::MASK_1_LOW(lastBit - WMUL(bbh)); + self.bbexpr_get_block(bbl) |= bblock::MASK_1_HIGH(firstBit - WMUL(bbl)); + + } + + return self; + } + + /** + * @brief Overwrites this bitstring with @bb_add (equivalent to operator=) + * + * Note: The bitblock size of bb_add must be at least as large as this bitstring. + * + * @param bb_add: input bitstring whose bits are copied + * returns reference to the modified bitstring + **/ + template + E &assign_bit(BBExpr const &rhs) { + E &self = bbexpr_cast(); + RHS const &rhsCast = rhs.bbexpr_cast(); + assert(self.bbexpr_num_blocks() <= rhsCast.bbexpr_num_blocks()); + for (int i=0; i + E &operator&=(BBExpr const &rhs) { + return assign_bit(*this & rhs); + } + + template + E &operator|=(BBExpr const &rhs) { + return assign_bit(*this | rhs); + } + + template + E &operator^=(BBExpr const &rhs) { + return assign_bit(*this ^ rhs); + } +}; + +////////////////////////////////////////////////////////////////////// +/// BBIter +////////////////////////////////////////////////////////////////////// + +template +class BBIter { +public: + using value_type = int; + using difference_type = ptrdiff_t; + using pointer = void; + using reference = const int &; + using iterator_category = std::forward_iterator_tag; + +private: + std::conditional_t obj_; + int bbi_; + int pos_; + +public: + BBIter(BBExpr const &bbdata, BBIterBeginTag) : + obj_(bbdata.bbexpr_cast()), + bbi_(0) + { + BITBOARD x; + while (bbi_ < obj_.bbexpr_num_blocks() && !(x=obj_.bbexpr_get_block(bbi_))) + ++bbi_; + if (bbi_ < obj_.bbexpr_num_blocks()) { + pos_ = lowest_bit_assuming_nonzero(x); + } else { + pos_ = 0; + } + } + + BBIter(BBExpr const &bbdata, BBIterEndTag) : + obj_(bbdata.bbexpr_cast()), + bbi_(bbdata.bbexpr_cast().bbexpr_num_blocks()), + pos_(0) + { } + + bool operator==(BBIter const &o) const { + return bbi_ == o.bbi_ && pos_ == o.pos_; + } + + bool operator!=(BBIter const &o) const { + return !(*this == o); + } + + BBIter &operator++() { + BITBOARD x = obj_.bbexpr_get_block(bbi_) & ((~BITBOARD{1}) << pos_); + if (x) { + pos_ = lowest_bit_assuming_nonzero(x); + } else { + const int n = obj_.bbexpr_num_blocks(); + ++bbi_; + while (bbi_ < n && !(x=obj_.bbexpr_get_block(bbi_))) + ++bbi_; + if (bbi_ < n) { + pos_ = lowest_bit_assuming_nonzero(x); + } else { + pos_ = 0; + } + } + return *this; + } + + int operator*() const { + return pos_ + WMUL(bbi_); + } + + int block_index() const { return bbi_; } + int bit_index() const { return pos_; } + +private: + static int lowest_bit_assuming_nonzero(BITBOARD x) { + // FIXME use countr_zero if C++20 + //__attribute__((assume(x != 0))); + //return std::countr_zero(x); + return __builtin_ctzll(x); + } +}; + +template +BBIter begin(BBExpr const &obj) { + return BBIter{obj, BBIterBeginTag{}}; +} + +template +BBIter end(BBExpr const &obj) { + return BBIter{obj, BBIterEndTag{}}; +} + +////////////////////////////////////////////////////////////////////// +/// Operations +////////////////////////////////////////////////////////////////////// + +template +struct BBBinaryOp : public BBExpr> { + static constexpr bool bbexpr_use_ref = false; + std::conditional_t lhs_; + std::conditional_t rhs_; + Op op_; + + BBBinaryOp(BBExpr const &lhs, BBExpr const &rhs, Op const &op=Op{}) : + lhs_(lhs.bbexpr_cast()), rhs_(rhs.bbexpr_cast()), op_(op) + { + assert(lhs_.bbexpr_num_blocks() == lhs_.bbexpr_num_blocks()); + } + + int bbexpr_num_blocks() const { return lhs_.bbexpr_num_blocks(); } + + BITBOARD bbexpr_get_block(int i) const { + return op_(lhs_.bbexpr_get_block(i), rhs_.bbexpr_get_block(i)); + } +}; + +template +struct BBUnaryOp : public BBExpr> { + static constexpr bool bbexpr_use_ref = false; + std::conditional_t obj_; + Op op_; + + BBUnaryOp(BBExpr const &obj, Op const &op=Op{}) : + obj_(obj.bbexpr_cast()), op_(op) + { } + + int bbexpr_num_blocks() const { return obj_.bbexpr_num_blocks(); } + + BITBOARD bbexpr_get_block(int i) const { return ~obj_.bbexpr_get_block(i); } +}; + +template +BBBinaryOp> operator&(BBExpr const &lhs, BBExpr const &rhs) { + return {lhs, rhs}; +} + +template +BBBinaryOp> operator|(BBExpr const &lhs, BBExpr const &rhs) { + return {lhs, rhs}; +} + +template +BBBinaryOp> operator^(BBExpr const &lhs, BBExpr const &rhs) { + return {lhs, rhs}; +} + +template +BBUnaryOp> operator~(BBExpr const &obj) { + return {obj}; +} + +////////////////////////////////////////////////////////////////////// +/// Containers +////////////////////////////////////////////////////////////////////// + +// FIXME should have const and non-const, with or without constexpr length +struct BBSpan : public BBExpr { + BITBOARD const *vBB_; + int nBB_; + + BBSpan(BITBOARD const *vBB, int nBB) : + vBB_(vBB), + nBB_(nBB) + { } + + static constexpr bool bbexpr_use_ref = false; + int bbexpr_num_blocks() const { return nBB_; } + BITBOARD bbexpr_get_block(int i) const { return vBB_[i]; } +}; + +template +struct BBStatic : public BBExprMutable> { + static_assert(N >= 0); + static constexpr int nBB_ = (N+63)/64; + + //alignas(64) // For AVX512 instructions. Doesn't seem to make it faster. + std::array vBB_; + + BBStatic() : + vBB_{} /*zero-initialized*/ + { } + + static constexpr bool bbexpr_use_ref = true; + static constexpr int bbexpr_num_blocks() { return nBB_; } + BITBOARD bbexpr_get_block(int i) const { assert(i >= 0 && i < nBB_); return vBB_[i]; } + BITBOARD &bbexpr_get_block(int i) { assert(i >= 0 && i < nBB_); return vBB_[i]; } +}; + +} // namespace bitgraph diff --git a/src/bitscan/bbset.cpp b/src/bitscan/bbset.cpp index 1c29fe5..e06fa61 100644 --- a/src/bitscan/bbset.cpp +++ b/src/bitscan/bbset.cpp @@ -168,49 +168,6 @@ void BitSet::reset(int popsize, const vint& lv) noexcept { -////////////////////////// -// -// BITSET OPERATORS -// (size is determined by *this) -///////////////////////// - -BitSet& BitSet::operator &= (const BitSet& bbn){ - - for (auto i = 0; i < nBB_; ++i) { - vBB_[i] &= bbn.vBB_[i]; - } - - return *this; -} - -BitSet& BitSet::operator |= (const BitSet& bbn){ - - for (auto i = 0; i < nBB_; ++i) { - vBB_[i] |= bbn.vBB_[i]; - } - - return *this; -} - -BitSet& BitSet::operator ^= (const BitSet& bbn) { - - for (auto i = 0; i < nBB_; ++i) { - vBB_[i] ^= bbn.vBB_[i]; - } - - return *this; -} - - -BitSet& BitSet::flip (){ - - for (auto i = 0; i < nBB_; ++i) { - vBB_[i] = ~vBB_[i]; - } - - return *this; -} - BitSet& BitSet::flip_block(int firstBlock, int lastBlock) { diff --git a/src/bitscan/bbset.h b/src/bitscan/bbset.h index e5b1b62..f61be12 100644 --- a/src/bitscan/bbset.h +++ b/src/bitscan/bbset.h @@ -16,6 +16,7 @@ #include "bbobject.h" #include "bitblock.h" +#include "bbexpr.h" #include "utils/common.h" //for the primitive stack type #include #include @@ -38,9 +39,23 @@ namespace bitgraph { // @details Does not use HW dependent instructions (intrinsics), nor does it cache information for very fast bitscanning // /////////////////////////////////// - class BitSet :public BBObject { + class BitSet :public BBObject, public BBExprMutable { public: + template + explicit BitSet(const BBExpr &rhs) noexcept : BitSet() { + *this = rhs; + } + + template + BitSet& operator=(const BBExpr &rhs) noexcept { + const E &rhsCast = rhs.bbexpr_cast(); + if (rhsCast.bbexpr_num_blocks() != nBB_) { + nBB_ = rhsCast.bbexpr_num_blocks(); + vBB_.resize(nBB_); + } + return assign_bit(rhs); + } ///////////////////////////// // Independent operators / masks @@ -409,16 +424,6 @@ namespace bitgraph { **/ inline BitSet& set_bit(const BitSet& bb_add); - /** - * @brief Overwrites this bitstring with @bb_add (equivalent to operator=) - * - * Note: The bitblock size of bb_add must be at least as large as this bitstring. - * - * @param bb_add: input bitstring whose bits are copied - * returns reference to the modified bitstring - **/ - inline BitSet& assign_bit(const BitSet& bb_add); - /** * @brief Adds the bits from the bitstring bb_add in the range [0, lastBit] * @param lastBit : the last bit in the range to be copied @@ -551,24 +556,6 @@ namespace bitgraph { //////////////////////// // operators - /** - * @brief Bitwise AND operator with bbn - * @details For set intersection - **/ - BitSet& operator &= (const BitSet& bbn); - - /** - * @brief Bitwise OR operator with bbn - * @details For set union - **/ - BitSet& operator |= (const BitSet& bbn); - - /** - * @brief Bitwise XOR operator with bbn - * @details For symmetric_difference - **/ - BitSet& operator ^= (const BitSet& bbn); - friend bool operator == (const BitSet& lhs, const BitSet& rhs); friend bool operator != (const BitSet& lhs, const BitSet& rhs); @@ -580,7 +567,7 @@ namespace bitgraph { /** * @brief flips 1-bits to 0 and 0-bits to 1 **/ - BitSet& flip(); + BitSet& flip() { return assign_bit(~*this); } /** * @brief flips 1-bits to 0 and 0-bits to 1 in the @@ -805,6 +792,12 @@ namespace bitgraph { virtual int* to_C_array(int* lv, std::size_t& size, bool rev = false); + // BBExpr interface + static constexpr bool bbexpr_use_ref = true; + int bbexpr_num_blocks() const { return nBB_; } + BITBOARD bbexpr_get_block(int i) const { return vBB_[i]; } + BITBOARD &bbexpr_get_block(int i) { return vBB_[i]; } + //////////////////////// //data members @@ -1253,19 +1246,6 @@ namespace bitgraph{ return *this; } - inline BitSet& _impl::BitSet::assign_bit(const BitSet& bb_add) - { - ///////////////////////////////// - assert(nBB_ <= bb_add.nBB_); - ///////////////////////////////// - - for (auto i = 0; i < nBB_; ++i) { - vBB_[i] = bb_add.vBB_[i]; - } - - return *this; - } - BitSet& BitSet::set_block(int firstBlock, int lastBlock, const BitSet& bb_add) { diff --git a/src/bitscan/tests/CMakeLists.txt b/src/bitscan/tests/CMakeLists.txt index c1fef31..430279b 100644 --- a/src/bitscan/tests/CMakeLists.txt +++ b/src/bitscan/tests/CMakeLists.txt @@ -21,6 +21,7 @@ add_executable (test_bitscan test_bitset_sparse.cpp test_bbscan_sparse_nested.cpp + test_bbexpr.cpp ) set_target_properties(test_bitscan diff --git a/src/bitscan/tests/test_bbexpr.cpp b/src/bitscan/tests/test_bbexpr.cpp new file mode 100644 index 0000000..351c5c2 --- /dev/null +++ b/src/bitscan/tests/test_bbexpr.cpp @@ -0,0 +1,276 @@ +#include "bitscan/bbexpr.h" +#include "bitscan/bbscan.h" +#include "bitscan/bbalgorithm.h" + +#include "gtest/gtest.h" + +#include +#include +#include + +using namespace bitgraph; + +TEST(BBExpr, basic) { + const int N = 301; + BitSet a(N); + BitSet b(N); + std::vector baseline_a; + std::vector baseline_b; + std::vector baseline_a_and_b; + + std::mt19937 rng{0}; + std::bernoulli_distribution dist; + for(int i = 0; i <= N; ++i){ + const bool val_a = dist(rng); + const bool val_b = dist(rng); + if (val_a) { + a.set_bit(i); + baseline_a.emplace_back(i); + } + if (val_b) { + b.set_bit(i); + baseline_b.emplace_back(i); + } + if (val_a && val_b) { + baseline_a_and_b.emplace_back(i); + } + } + + std::vector test_a; + for (int i : a) + test_a.emplace_back(i); + EXPECT_EQ(baseline_a, test_a); + + std::vector test_b; + for (int i : b) + test_b.emplace_back(i); + EXPECT_EQ(baseline_b, test_b); + + std::vector test_a_and_b; + for (int i : a & b) + test_a_and_b.emplace_back(i); + EXPECT_EQ(baseline_a_and_b, test_a_and_b); + + EXPECT_EQ(baseline_a, std::vector(begin(a), end(a))); + EXPECT_EQ(baseline_b, std::vector(begin(b), end(b))); + EXPECT_EQ(baseline_a_and_b, std::vector(begin(a & b), end(a & b))); +} + +template +static std::vector random_graph(int N, double p, RngT &rng) { + std::vector g(N, BitSetT(N)); + std::bernoulli_distribution dist{p}; + for (int i=0; i +static int iseq_bbscan(const std::vector &g, std::vector &ub) { + const int N = g.size(); + ub.assign(N, 0); + int pc = N; + int col = 1, v = bbo::noBit; + BBScan bb_unsel(N, true); + BBScan bb_sel(N); + while (true) { + bb_sel = bb_unsel; + bb_sel.init_scan(bbo::DESTRUCTIVE); + // FIXME using countr_zero in _BitScanForward64 makes this 10% faster + while ((v = bb_sel.next_bit_del(bb_unsel)) != bbo::noBit) { + ub[v] = col; + if ((--pc) == 0) { return col; } + bb_sel.erase_block(WDIV(v), -1, g[v]); + } + ++col; + } +} + +// Just like iseq_bbscan but using iterators for the inner loop. +template +static int iseq_bbexpr_v1(const std::vector &g, std::vector &ub) { + const int N = g.size(); + ub.assign(N, 0); + int pc = N; + int col = 1; + BitSet bb_unsel(N, true); + BitSet bb_sel(N); + while (true) { + bb_sel = bb_unsel; + for (int v : bb_sel) { + bb_unsel.erase_bit(v); + ub[v] = col; + if ((--pc) == 0) { return col; } + bb_sel.erase_block(WDIV(v), -1, g[v]); + } + ++col; + } +} + +// Using block and bit indices from the iterator. +template +static int iseq_bbexpr_v2(const std::vector &g, std::vector &ub) { + const int N = g.size(); + ub.assign(N, 0); + int pc = N; + int col = 1; + BitSet bb_unsel(N, true); + BitSet bb_sel(N); + while (true) { + bb_sel = bb_unsel; + for (auto it=begin(bb_sel), itEnd=end(bb_sel); it != itEnd; ++it) { + int v = *it; + //bb_unsel.erase_bit(v); + bb_unsel.bitset().data()[it.block_index()] &= ~(BITBOARD{1} << it.bit_index()); + ub[v] = col; + if ((--pc) == 0) { return col; } + bb_sel.erase_block(it.block_index(), -1, g[v]); + } + ++col; + } +} + +// Using a boolean expression as the object of the for loop. +// This is the fastest version. +template +static int iseq_bbexpr_v3(const std::vector &g, std::vector &ub) { + const int N = g.size(); + ub.assign(N, 0); + int pc = N; + int col = 1; + BitSet bb_unsel(N, true); + BitSet bb_neighbors(N); + while (true) { + bb_neighbors.erase_bit(); + for (int v : (bb_unsel & ~bb_neighbors)) { + bb_unsel.erase_bit(v); + bb_neighbors |= g[v]; + ub[v] = col; + if ((--pc) == 0) { return col; } + } + ++col; + } +} + +// Local bit vector variables are on stack with constexpr size. The hope was +// this would allow vectorizing the loop in `bb_neighbors |= g[v]` using AVX +// instructions. But for some reason it's slower. +// i7-1370P no-turbo benchmark: 200ms +template +static int iseq_bbexpr_v4(const std::vector &g, std::vector &ub) { + const int N = g.size(); + ub.assign(N, 0); + int pc = N; + int col = 1; + assert(N <= 512); + BBStatic<512> bb_unsel; + bb_unsel.set_bit(0, N-1); + BBStatic<512> bb_neighbors; + while (true) { + bb_neighbors.erase_bit(); + for (int v : (bb_unsel & ~bb_neighbors)) { + bb_unsel.erase_bit(v); + bb_neighbors |= g[v]; + ub[v] = col; + if ((--pc) == 0) { return col; } + } + ++col; + } +} + +// Returns nanoseconds per iteration. +template +static int microBenchmark(int niter, Callback const &cb) { + using ClockT = std::chrono::high_resolution_clock; + auto t0 = ClockT::now(); + for (int iter=0; iter(t1 - t0).count() / niter; +} + +TEST(BBExpr, benchmark_iset) { + std::mt19937 rng{0}; // constant seed for benchmark timing consistency + const int N = 500; + const std::vector g = random_graph(N, 0.5, rng); + + // Compare that the results match. + std::vector colors_baseline(N); + const int nc_baseline = iseq_bbscan(g, colors_baseline); + { + std::vector colors(N); + const int nc = iseq_bbexpr_v1(g, colors); + EXPECT_EQ(colors, colors_baseline); + EXPECT_EQ(nc, nc_baseline); + } + { + std::vector colors(N); + const int nc = iseq_bbexpr_v2(g, colors); + EXPECT_EQ(colors, colors_baseline); + EXPECT_EQ(nc, nc_baseline); + } + { + std::vector colors(N); + const int nc = iseq_bbexpr_v3(g, colors); + EXPECT_EQ(colors, colors_baseline); + EXPECT_EQ(nc, nc_baseline); + } + { + std::vector colors(N); + const int nc = iseq_bbexpr_v4(g, colors); + EXPECT_EQ(colors, colors_baseline); + EXPECT_EQ(nc, nc_baseline); + } + + std::vector colors(N); + double totalTimeBBScan = 0; + double totalTimeBBExpr1 = 0; + double totalTimeBBExpr2 = 0; + double totalTimeBBExpr3 = 0; + double totalTimeBBExpr4 = 0; + // Run all versions round-robin several times so they will all be equally + // affected if CPU frequency changes. + const int numOuterIter = 20; + for (int iter=0; iter