create cusparse spgemm reuse

yhmtsai · yhmtsai · commit e076309f69d5 · 2025-06-05T18:23:33.000+02:00
diff --git a/include/spblas/vendor/cusparse/multiply_spgemm.hpp b/include/spblas/vendor/cusparse/multiply_spgemm.hpp
@@ -22,9 +22,10 @@ class spgemm_state_t {
   spgemm_state_t() : spgemm_state_t(cusparse::cuda_allocator<char>{}) {}
 
   spgemm_state_t(cusparse::cuda_allocator<char> alloc)
-      : alloc_(alloc), buffer_size_1_(0), buffer_size_2_(0),
-        workspace_1_(nullptr), workspace_2_(nullptr), result_nnz_(0),
-        result_shape_(0, 0) {
+      : alloc_(alloc), buffer_size_1_(0), buffer_size_2_(0), buffer_size_3_(0),
+        buffer_size_4_(0), buffer_size_5_(0), workspace_1_(nullptr),
+        workspace_2_(nullptr), workspace_3_(nullptr), workspace_4_(nullptr),
+        workspace_5_(nullptr), result_nnz_(0), result_shape_(0, 0) {
     cusparseHandle_t handle;
     __cusparse::throw_if_error(cusparseCreate(&handle));
     if (auto stream = alloc.stream()) {
@@ -157,6 +158,156 @@ class spgemm_state_t {
         to_cuda_datatype<value_type>(), CUSPARSE_SPGEMM_DEFAULT, this->descr_));
   }
 
+  // Symbolic phase, step 1 of the cuSPARSE SpGEMMreuse flow for C = A * B.
+  //
+  // Recreates the A/B/C descriptors, runs work estimation and the nnz pass
+  // (growing workspaces 1-4 when a requested size exceeds the current one),
+  // then stores the shape and nnz reported for C in result_shape_ and
+  // result_nnz_. None of these calls takes alpha/beta, so no scaling factor is
+  // read here.
+  // Every cuSPARSE status is passed to __cusparse::throw_if_error.
+  template <matrix A, matrix B, matrix C>
+    requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+             __detail::is_csr_view_v<C>
+  void multiply_symbolic_compute(A&& a, B&& b, C&& c) {
+    auto a_base = __detail::get_ultimate_base(a);
+    auto b_base = __detail::get_ultimate_base(b);
+    auto handle = this->handle_.get();
+    __cusparse::throw_if_error(cusparseDestroySpMat(mat_a_));
+    __cusparse::throw_if_error(cusparseDestroySpMat(mat_b_));
+    __cusparse::throw_if_error(cusparseDestroySpMat(mat_c_));
+    mat_a_ = __cusparse::create_matrix_descr(a_base);
+    mat_b_ = __cusparse::create_matrix_descr(b_base);
+    mat_c_ = __cusparse::create_matrix_descr(c);
+
+    // ask buffer_size_1 bytes for external memory
+    size_t buffer_size_1 = 0;
+    __cusparse::throw_if_error(cusparseSpGEMMreuse_workEstimation(
+        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, mat_c_,
+        CUSPARSE_SPGEMM_DEFAULT, this->descr_, &buffer_size_1, NULL));
+    if (buffer_size_1 > this->buffer_size_1_) {
+      this->alloc_.deallocate(this->workspace_1_, buffer_size_1_);
+      this->buffer_size_1_ = buffer_size_1;
+      this->workspace_1_ = this->alloc_.allocate(buffer_size_1);
+    }
+    // inspect the matrices A and B to understand the memory requirement for
+    // the next step
+    __cusparse::throw_if_error(cusparseSpGEMMreuse_workEstimation(
+        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, mat_c_,
+        CUSPARSE_SPGEMM_DEFAULT, this->descr_, &buffer_size_1,
+        this->workspace_1_));
+
+    // ask buffer_size_2/3/4 bytes for external memory
+    size_t buffer_size_2 = 0;
+    size_t buffer_size_3 = 0;
+    size_t buffer_size_4 = 0;
+    __cusparse::throw_if_error(cusparseSpGEMMreuse_nnz(
+        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, mat_c_,
+        CUSPARSE_SPGEMM_DEFAULT, this->descr_, &buffer_size_2, NULL,
+        &buffer_size_3, NULL, &buffer_size_4, NULL));
+    if (buffer_size_2 > this->buffer_size_2_) {
+      this->alloc_.deallocate(this->workspace_2_, buffer_size_2_);
+      this->buffer_size_2_ = buffer_size_2;
+      this->workspace_2_ = this->alloc_.allocate(buffer_size_2);
+    }
+    if (buffer_size_3 > this->buffer_size_3_) {
+      this->alloc_.deallocate(this->workspace_3_, buffer_size_3_);
+      this->buffer_size_3_ = buffer_size_3;
+      this->workspace_3_ = this->alloc_.allocate(buffer_size_3);
+    }
+    if (buffer_size_4 > this->buffer_size_4_) {
+      this->alloc_.deallocate(this->workspace_4_, buffer_size_4_);
+      this->buffer_size_4_ = buffer_size_4;
+      this->workspace_4_ = this->alloc_.allocate(buffer_size_4);
+    }
+
+    // compute nnz
+    __cusparse::throw_if_error(cusparseSpGEMMreuse_nnz(
+        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, mat_c_,
+        CUSPARSE_SPGEMM_DEFAULT, this->descr_, &buffer_size_2,
+        this->workspace_2_, &buffer_size_3, this->workspace_3_, &buffer_size_4,
+        this->workspace_4_));
+    // get matrix C non-zero entries c_nnz
+    int64_t c_num_rows, c_num_cols, c_nnz;
+    __cusparse::throw_if_error(
+        cusparseSpMatGetSize(mat_c_, &c_num_rows, &c_num_cols, &c_nnz));
+    this->result_nnz_ = c_nnz;
+    this->result_shape_ = index<index_t>(c_num_rows, c_num_cols);
+  }
+
+  // Symbolic phase, step 2 of the cuSPARSE SpGEMMreuse flow.
+  //
+  // Points the stored C descriptor at c's rowptr/colind/values arrays and runs
+  // cusparseSpGEMMreuse_copy, growing workspace 5 when needed. Uses the
+  // descriptors created by multiply_symbolic_compute(); a and b are not read.
+  //
+  // Every cuSPARSE status is passed to __cusparse::throw_if_error.
+  template <matrix A, matrix B, matrix C>
+    requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+             __detail::is_csr_view_v<C>
+  void multiply_symbolic_fill([[maybe_unused]] A&& a, [[maybe_unused]] B&& b,
+                              C&& c) {
+    __cusparse::throw_if_error(cusparseCsrSetPointers(
+        this->mat_c_, c.rowptr().data(), c.colind().data(), c.values().data()));
+
+    auto handle = this->handle_.get();
+    // ask buffer_size_5 bytes for external memory
+    size_t buffer_size_5 = 0;
+    __cusparse::throw_if_error(cusparseSpGEMMreuse_copy(
+        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, mat_c_,
+        CUSPARSE_SPGEMM_DEFAULT, this->descr_, &buffer_size_5, NULL));
+    if (buffer_size_5 > this->buffer_size_5_) {
+      this->alloc_.deallocate(this->workspace_5_, buffer_size_5_);
+      this->buffer_size_5_ = buffer_size_5;
+      this->workspace_5_ = this->alloc_.allocate(buffer_size_5);
+    }
+    // second call runs the copy step with workspace 5 supplied
+    __cusparse::throw_if_error(cusparseSpGEMMreuse_copy(
+        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        CUSPARSE_OPERATION_NON_TRANSPOSE, mat_a_, mat_b_, mat_c_,
+        CUSPARSE_SPGEMM_DEFAULT, this->descr_, &buffer_size_5,
+        this->workspace_5_));
+  }
+
+  // Numeric phase of the cuSPARSE SpGEMMreuse flow: C = alpha * A * B, where
+  // alpha is the scaling factor found on a/b (1 if none). Every cuSPARSE
+  // status is passed to __cusparse::throw_if_error.
+  template <matrix A, matrix B, matrix C>
+    requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+             __detail::is_csr_view_v<C>
+  void multiply_numeric(A&& a, B&& b, C&& c) {
+    auto a_base = __detail::get_ultimate_base(a);
+    auto b_base = __detail::get_ultimate_base(b);
+    using matrix_type = decltype(a_base);
+    using value_type = typename matrix_type::scalar_type;
+    auto alpha_optional = __detail::get_scaling_factor(a, b);
+    tensor_scalar_t<A> alpha = alpha_optional.value_or(1);
+    // alpha/beta are handed over as value_type, matching the compute type.
+    value_type alpha_val = alpha;
+    value_type beta = 0;
+    auto handle = this->handle_.get();
+    // Update the pointers from the matrices; they must contain the same
+    // sparsity as in the symbolic phase.
+    __cusparse::throw_if_error(
+        cusparseCsrSetPointers(this->mat_a_, a_base.rowptr().data(),
+                               a_base.colind().data(), a_base.values().data()));
+    __cusparse::throw_if_error(
+        cusparseCsrSetPointers(this->mat_b_, b_base.rowptr().data(),
+                               b_base.colind().data(), b_base.values().data()));
+    __cusparse::throw_if_error(cusparseCsrSetPointers(
+        this->mat_c_, c.rowptr().data(), c.colind().data(), c.values().data()));
+    __cusparse::throw_if_error(cusparseSpGEMMreuse_compute(
+        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha_val, mat_a_, mat_b_, &beta,
+        mat_c_, to_cuda_datatype<value_type>(), CUSPARSE_SPGEMM_DEFAULT,
+        this->descr_));
+  }
+
 private:
   using handle_manager =
       std::unique_ptr<std::pointer_traits<cusparseHandle_t>::element_type,
@@ -165,8 +316,14 @@ class spgemm_state_t {
   cusparse::cuda_allocator<char> alloc_;
   size_t buffer_size_1_;
   size_t buffer_size_2_;
+  size_t buffer_size_3_;
+  size_t buffer_size_4_;
+  size_t buffer_size_5_;
   char* workspace_1_;
   char* workspace_2_;
+  char* workspace_3_;
+  char* workspace_4_;
+  char* workspace_5_;
   index<index_t> result_shape_;
   index_t result_nnz_;
   cusparseSpMatDescr_t mat_a_ = nullptr;
@@ -194,4 +351,27 @@ void multiply_fill(spgemm_state_t& spgemm_handle, A&& a, B&& b, C&& c) {
   spgemm_handle.multiply_fill(a, b, c);
 }
 
+// Free-function entry points for the cuSPARSE SpGEMMreuse flow. Each one
+// passes its arguments on to the spgemm_state_t member of the same name.
+template <matrix A, matrix B, matrix C>
+  requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+           __detail::is_csr_view_v<C>
+void multiply_symbolic_compute(spgemm_state_t& state, A&& a, B&& b, C&& c) {
+  state.multiply_symbolic_compute(a, b, c);
+}
+
+template <matrix A, matrix B, matrix C>
+  requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+           __detail::is_csr_view_v<C>
+void multiply_symbolic_fill(spgemm_state_t& state, A&& a, B&& b, C&& c) {
+  state.multiply_symbolic_fill(a, b, c);
+}
+
+template <matrix A, matrix B, matrix C>
+  requires __detail::has_csr_base<A> && __detail::has_csr_base<B> &&
+           __detail::is_csr_view_v<C>
+void multiply_numeric(spgemm_state_t& state, A&& a, B&& b, C&& c) {
+  state.multiply_numeric(a, b, c);
+}
+
 } // namespace spblas
diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt
@@ -25,7 +25,7 @@ else()
   if(ENABLE_ROCSPARSE)
     set(TEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/spgemm_reuse_test.cpp device/rocsparse/spgemm_4args_test.cpp)
   else()
-    set(TEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp)
+    set(TEST_SOURCES device/spmv_test.cpp device/spgemm_test.cpp device/spgemm_reuse_test.cpp)
   endif()
   add_device_test(TEST_SOURCES)
 endif()
diff --git a/test/gtest/device/spgemm_reuse_test.cpp b/test/gtest/device/spgemm_reuse_test.cpp
@@ -34,16 +34,19 @@ TEST(CsrView, SpGEMMReuse) {
       spblas::csr_view<value_t, index_t, offset_t> b(b_values, b_rowptr,
                                                      b_colind, b_shape, b_nnz);
 
-      thrust::device_vector<offset_t> d_c_rowptr(m + 1);
+      std::vector<offset_t> c_rowptr(m + 1);
+      thrust::device_vector<offset_t> d_c_rowptr(c_rowptr);
 
       spblas::csr_view<value_t, index_t, offset_t> d_c(
           nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0);
 
       spblas::spgemm_state_t state;
       spblas::multiply_symbolic_compute(state, d_a, d_b, d_c);
       auto nnz = state.result_nnz();
-      thrust::device_vector<value_t> d_c_values(nnz);
-      thrust::device_vector<index_t> d_c_colind(nnz);
+      std::vector<value_t> c_values(nnz);
+      std::vector<index_t> c_colind(nnz);
+      thrust::device_vector<value_t> d_c_values(c_values);
+      thrust::device_vector<index_t> d_c_colind(c_colind);
       std::span<value_t> d_c_values_span(d_c_values.data().get(), nnz);
       std::span<offset_t> d_c_rowptr_span(d_c_rowptr.data().get(), m + 1);
       std::span<index_t> d_c_colind_span(d_c_colind.data().get(), nnz);
@@ -68,9 +71,6 @@ TEST(CsrView, SpGEMMReuse) {
           thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin());
         }
         spblas::multiply_numeric(state, d_a, d_b, d_c);
-        std::vector<value_t> c_values(nnz);
-        std::vector<offset_t> c_rowptr(m + 1);
-        std::vector<index_t> c_colind(nnz);
         thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin());
         thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin());
         thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());
@@ -138,17 +138,19 @@ TEST(CsrView, SpGEMMReuse_AScaled) {
           d_b_colind.data().get(), b_shape, b_nnz);
       spblas::csr_view<value_t, index_t, offset_t> b(b_values, b_rowptr,
                                                      b_colind, b_shape, b_nnz);
-
-      thrust::device_vector<offset_t> d_c_rowptr(m + 1);
+      std::vector<offset_t> c_rowptr(m + 1);
+      thrust::device_vector<offset_t> d_c_rowptr(c_rowptr);
 
       spblas::csr_view<value_t, index_t, offset_t> d_c(
           nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0);
 
       spblas::spgemm_state_t state;
       spblas::multiply_symbolic_compute(state, scaled(alpha, d_a), d_b, d_c);
       auto nnz = state.result_nnz();
-      thrust::device_vector<value_t> d_c_values(nnz);
-      thrust::device_vector<index_t> d_c_colind(nnz);
+      std::vector<value_t> c_values(nnz);
+      std::vector<index_t> c_colind(nnz);
+      thrust::device_vector<value_t> d_c_values(c_values);
+      thrust::device_vector<index_t> d_c_colind(c_colind);
       std::span<value_t> d_c_values_span(d_c_values.data().get(), nnz);
       std::span<offset_t> d_c_rowptr_span(d_c_rowptr.data().get(), m + 1);
       std::span<index_t> d_c_colind_span(d_c_colind.data().get(), nnz);
@@ -173,9 +175,6 @@ TEST(CsrView, SpGEMMReuse_AScaled) {
           thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin());
         }
         spblas::multiply_numeric(state, scaled(alpha, d_a), d_b, d_c);
-        std::vector<value_t> c_values(nnz);
-        std::vector<offset_t> c_rowptr(m + 1);
-        std::vector<index_t> c_colind(nnz);
         thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin());
         thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin());
         thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());
@@ -243,17 +242,19 @@ TEST(CsrView, SpGEMMReuse_BScaled) {
           d_b_colind.data().get(), b_shape, b_nnz);
       spblas::csr_view<value_t, index_t, offset_t> b(b_values, b_rowptr,
                                                      b_colind, b_shape, b_nnz);
-
-      thrust::device_vector<offset_t> d_c_rowptr(m + 1);
+      std::vector<offset_t> c_rowptr(m + 1);
+      thrust::device_vector<offset_t> d_c_rowptr(c_rowptr);
 
       spblas::csr_view<value_t, index_t, offset_t> d_c(
           nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0);
 
       spblas::spgemm_state_t state;
       spblas::multiply_symbolic_compute(state, d_a, scaled(alpha, d_b), d_c);
       auto nnz = state.result_nnz();
-      thrust::device_vector<value_t> d_c_values(nnz);
-      thrust::device_vector<index_t> d_c_colind(nnz);
+      std::vector<value_t> c_values(nnz);
+      std::vector<index_t> c_colind(nnz);
+      thrust::device_vector<value_t> d_c_values(c_values);
+      thrust::device_vector<index_t> d_c_colind(c_colind);
       std::span<value_t> d_c_values_span(d_c_values.data().get(), nnz);
       std::span<offset_t> d_c_rowptr_span(d_c_rowptr.data().get(), m + 1);
       std::span<index_t> d_c_colind_span(d_c_colind.data().get(), nnz);
@@ -278,9 +279,6 @@ TEST(CsrView, SpGEMMReuse_BScaled) {
           thrust::copy(b_values.begin(), b_values.end(), d_b_values.begin());
         }
         spblas::multiply_numeric(state, d_a, scaled(alpha, d_b), d_c);
-        std::vector<value_t> c_values(nnz);
-        std::vector<offset_t> c_rowptr(m + 1);
-        std::vector<index_t> c_colind(nnz);
         thrust::copy(d_c_values.begin(), d_c_values.end(), c_values.begin());
         thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin());
         thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());
@@ -348,23 +346,29 @@ TEST(CsrView, SpGEMMReuseAndChangePointer) {
       spblas::csr_view<value_t, index_t, offset_t> b(b_values, b_rowptr,
                                                      b_colind, b_shape, b_nnz);
 
-      thrust::device_vector<offset_t> d_c_rowptr(m + 1);
+      std::vector<offset_t> c_rowptr(m + 1);
+      thrust::device_vector<offset_t> d_c_rowptr(c_rowptr);
 
       spblas::csr_view<value_t, index_t, offset_t> d_c(
           nullptr, d_c_rowptr.data().get(), nullptr, {m, n}, 0);
 
       spblas::spgemm_state_t state;
       spblas::multiply_symbolic_compute(state, d_a, d_b, d_c);
       auto nnz = state.result_nnz();
-      thrust::device_vector<value_t> d_c_values(nnz);
-      thrust::device_vector<index_t> d_c_colind(nnz);
+      std::vector<value_t> c_values(nnz);
+      std::vector<index_t> c_colind(nnz);
+      thrust::device_vector<value_t> d_c_values(c_values);
+      thrust::device_vector<index_t> d_c_colind(c_colind);
       std::span<value_t> d_c_values_span(d_c_values.data().get(), nnz);
       std::span<offset_t> d_c_rowptr_span(d_c_rowptr.data().get(), m + 1);
       std::span<index_t> d_c_colind_span(d_c_colind.data().get(), nnz);
       d_c.update(d_c_values_span, d_c_rowptr_span, d_c_colind_span, {m, n},
                  nnz);
 
       spblas::multiply_symbolic_fill(state, d_a, d_b, d_c);
+      // move the sparsity back to host for later copy
+      thrust::copy(d_c_rowptr.begin(), d_c_rowptr.end(), c_rowptr.begin());
+      thrust::copy(d_c_colind.begin(), d_c_colind.end(), c_colind.begin());
       std::mt19937 g(0);
       for (int i = 0; i < 3; i++) {
         // regenerate value of a and b;
@@ -376,16 +380,17 @@ TEST(CsrView, SpGEMMReuseAndChangePointer) {
           v = val_dist(g);
         }
         // create different pointers than the symbolic phase, but they still
-        // hold the same sparsity
+        // hold the same sparsity.
+        // Note: CUDA without nvcc can only copy from host to device.
         thrust::device_vector<value_t> d_a_values_new(a_values);
-        thrust::device_vector<index_t> d_a_colind_new(d_a_colind);
-        thrust::device_vector<index_t> d_a_rowptr_new(d_a_rowptr);
+        thrust::device_vector<index_t> d_a_colind_new(a_colind);
+        thrust::device_vector<index_t> d_a_rowptr_new(a_rowptr);
         thrust::device_vector<value_t> d_b_values_new(b_values);
-        thrust::device_vector<index_t> d_b_colind_new(d_b_colind);
-        thrust::device_vector<index_t> d_b_rowptr_new(d_b_rowptr);
-        thrust::device_vector<value_t> d_c_values_new(d_c_values);
-        thrust::device_vector<index_t> d_c_colind_new(d_c_colind);
-        thrust::device_vector<index_t> d_c_rowptr_new(d_c_rowptr);
+        thrust::device_vector<index_t> d_b_colind_new(b_colind);
+        thrust::device_vector<index_t> d_b_rowptr_new(b_rowptr);
+        thrust::device_vector<value_t> d_c_values_new(c_values);
+        thrust::device_vector<index_t> d_c_colind_new(c_colind);
+        thrust::device_vector<index_t> d_c_rowptr_new(c_rowptr);
         spblas::csr_view<value_t, index_t, offset_t> d_a(
             d_a_values_new.data().get(), d_a_rowptr_new.data().get(),
             d_a_colind_new.data().get(), a_shape, a_nnz);
@@ -398,9 +403,6 @@ TEST(CsrView, SpGEMMReuseAndChangePointer) {
         // call numeric on new data
         spblas::multiply_numeric(state, d_a, d_b, d_c);
         // move c back to host memory
-        std::vector<value_t> c_values(nnz);
-        std::vector<offset_t> c_rowptr(m + 1);
-        std::vector<index_t> c_colind(nnz);
         thrust::copy(d_c_values_new.begin(), d_c_values_new.end(),
                      c_values.begin());
         thrust::copy(d_c_rowptr_new.begin(), d_c_rowptr_new.end(),