diff --git a/CMakeLists.txt b/CMakeLists.txt index 5734ab3..0644cae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,8 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_FLAGS "-O3 -march=native") +option(ENABLE_MINIAPPS "Also build the miniapps. Default: OFF." OFF) + # Get includes, which declares the `spblas` library add_subdirectory(include) @@ -89,4 +91,8 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) add_subdirectory(examples) add_subdirectory(test) + + if (ENABLE_MINIAPPS) + add_subdirectory(miniapps) + endif() endif() diff --git a/include/spblas/vendor/onemkl_sycl/algorithms.hpp b/include/spblas/vendor/onemkl_sycl/algorithms.hpp index 9f582a8..cafcbcf 100644 --- a/include/spblas/vendor/onemkl_sycl/algorithms.hpp +++ b/include/spblas/vendor/onemkl_sycl/algorithms.hpp @@ -1,5 +1,6 @@ #pragma once +#include "gemm.hpp" #include "spgemm_impl.hpp" #include "spmm_impl.hpp" #include "spmv_impl.hpp" diff --git a/include/spblas/vendor/onemkl_sycl/gemm.hpp b/include/spblas/vendor/onemkl_sycl/gemm.hpp new file mode 100644 index 0000000..e949059 --- /dev/null +++ b/include/spblas/vendor/onemkl_sycl/gemm.hpp @@ -0,0 +1,51 @@ +#pragma once + +#include + +#include +#include + +namespace spblas { + +template + requires __detail::has_mdspan_matrix_base && + __detail::has_mdspan_matrix_base && + __detail::is_matrix_instantiation_of_mdspan_v +void multiply(A&& a, B&& b, C&& c) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(a); + + auto alpha_optional = __detail::get_scaling_factor(a, b); + tensor_scalar_t alpha = alpha_optional.value_or(1); + + sycl::queue q(sycl::cpu_selector_v); + + /* + namespace oneapi::mkl::blas::row_major { + sycl::event gemm(sycl::queue &queue, + onemkl::transpose transa, + onemkl::transpose transb, + std::int64_t m, + std::int64_t n, + std::int64_t k, + Ts alpha, + const Ta *a, + std::int64_t lda, + const Tb *b, + std::int64_t ldb, + Ts beta, + Tc *c, + std::int64_t ldc, + const std::vector &dependencies = {}) + } +*/ + + oneapi::mkl::blas::row_major::gemm( + q, oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, + __backend::shape(a)[0], __backend::shape(c)[1], __backend::shape(a)[1], + alpha, a_base.data_handle(), __backend::shape(a)[1], b_base.data_handle(), + __backend::shape(b)[1], 0, c.data_handle(), __backend::shape(c)[1]) + .wait(); +} + +} // namespace spblas diff --git a/miniapps/CMake/miniappsConfig.cmake.in b/miniapps/CMake/miniappsConfig.cmake.in new file mode 100644 index 0000000..6e89b67 --- /dev/null +++ b/miniapps/CMake/miniappsConfig.cmake.in @@ -0,0 +1,22 @@ +include(CMakeFindDependencyMacro) +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}") + +set(miniapps_VERSION "@miniapps_VERSION@") +set(miniapps_VERSION_MAJOR "@miniapps_VERSION_MAJOR@") +set(miniapps_VERSION_MINOR "@miniapps_VERSION_MINOR@") +set(miniapps_VERSION_PATCH "@miniapps_VERSION_PATCH@") +set(miniapps_VERSION_DEVEL "@miniapps_VERSION_DEVEL@") + +# blas++ +if (NOT blaspp_DIR) + set(blaspp_DIR @blaspp_DIR@) +endif() +find_dependency(blaspp) + +# lapack++ +if (NOT lapackpp_DIR) + set(lapackpp_DIR @lapackpp_DIR@) +endif() +find_dependency(lapackpp) + +include(miniapps) diff --git a/miniapps/CMake/miniappsConfigVersion.cmake.in b/miniapps/CMake/miniappsConfigVersion.cmake.in new file mode 100644 index 0000000..582661d --- /dev/null +++ b/miniapps/CMake/miniappsConfigVersion.cmake.in @@ -0,0 +1,11 @@ +set(PACKAGE_VERSION "@miniapps_VERSION@") + +# Check whether the requested PACKAGE_FIND_VERSION is compatible 
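As a point of reference for the new dense `multiply` overload added in `include/spblas/vendor/onemkl_sycl/gemm.hpp` above, here is a minimal sketch of how its arguments line up with the `oneapi::mkl::blas::row_major::gemm` signature quoted in that file's comment. The sizes, scalars, and `a_ptr`/`b_ptr`/`c_ptr` names are placeholders, and the pointers are assumed to be USM allocations reachable from the queue.

```cpp
// Sketch (not part of the patch): argument mapping for C = alpha * A * B with
// contiguous row-major operands, i.e. A is m x k, B is k x n, C is m x n, and
// each leading dimension equals that matrix's number of columns.
std::int64_t m = 0, n = 0, k = 0;        // placeholder sizes
double alpha = 1.0, beta = 0.0;          // placeholder scalars
double *a_ptr = nullptr, *b_ptr = nullptr, *c_ptr = nullptr;  // placeholder USM pointers

sycl::queue q(sycl::cpu_selector_v);
oneapi::mkl::blas::row_major::gemm(
    q, oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans,
    m, n, k, alpha,
    a_ptr, /*lda=*/k,
    b_ptr, /*ldb=*/n,
    beta,
    c_ptr, /*ldc=*/n)
    .wait();
```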
+if("${PACKAGE_VERSION}" VERSION_LESS "${PACKAGE_FIND_VERSION}") + set(PACKAGE_VERSION_COMPATIBLE FALSE) +else() + set(PACKAGE_VERSION_COMPATIBLE TRUE) + if ("${PACKAGE_VERSION}" VERSION_EQUAL "${PACKAGE_FIND_VERSION}") + set(PACKAGE_VERSION_EXACT TRUE) + endif() +endif() diff --git a/miniapps/CMake/rl_build_options.cmake b/miniapps/CMake/rl_build_options.cmake new file mode 100644 index 0000000..ed85878 --- /dev/null +++ b/miniapps/CMake/rl_build_options.cmake @@ -0,0 +1,28 @@ +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +option(BUILD_SHARED_LIBS OFF "Configure to build shared or static libraries") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release" + CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel." FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") +endif() + +set(SANITIZE_ADDRESS OFF CACHE BOOL "Add address sanitizer flags to the library") + +message(STATUS "Checking for OpenMP ... ") +find_package(OpenMP COMPONENTS CXX) + +set(tmp FALSE) +if (OpenMP_CXX_FOUND) + set(tmp TRUE) +endif() + +set(miniapps_HAS_OpenMP ${tmp} CACHE BOOL "Set if we have a working OpenMP") + +include(GNUInstallDirs) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}") +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_BINDIR}") diff --git a/miniapps/CMake/rl_config.cmake b/miniapps/CMake/rl_config.cmake new file mode 100644 index 0000000..8df22e8 --- /dev/null +++ b/miniapps/CMake/rl_config.cmake @@ -0,0 +1,11 @@ + +configure_file(CMake/miniappsConfig.cmake.in + ${CMAKE_INSTALL_LIBDIR}/cmake/miniappsConfig.cmake @ONLY) + +configure_file(CMake/miniappsConfigVersion.cmake.in + ${CMAKE_INSTALL_LIBDIR}/cmake/miniappsConfigVersion.cmake @ONLY) + +install(FILES + ${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}/cmake/miniappsConfig.cmake + ${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}/cmake/miniappsConfigVersion.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake) diff --git a/miniapps/CMakeLists.txt b/miniapps/CMakeLists.txt new file mode 100644 index 0000000..e931875 --- /dev/null +++ b/miniapps/CMakeLists.txt @@ -0,0 +1,45 @@ +cmake_minimum_required(VERSION 3.24) + +project(miniapps VERSION 0.1.0) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMake") + +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +option(SPBLAS_MINIAPPS_ENABLE_CQRRPT "Enable the CQRRPT miniapp. Default ON." ON) +option(SPBLAS_MINIAPPS_ENABLE_SOLVERS "Enable the solvers miniapp. Default ON." 
ON) + +enable_testing() + +include(rl_build_options) + +# Find dependencies +find_package(blaspp 2024.10.26 QUIET) +if(NOT blaspp_FOUND) + FetchContent_Declare( + blaspp + GIT_REPOSITORY https://github.com/icl-utk-edu/blaspp + GIT_TAG v2024.10.26) + FetchContent_MakeAvailable(blaspp) +endif() + +set(blas "auto") +set(lapack "auto") + +find_package(lapackpp 2024.10.26 QUIET) +if(NOT lapackpp_FOUND) + FetchContent_Declare( + lapackpp + GIT_REPOSITORY https://github.com/icl-utk-edu/lapackpp + GIT_TAG v2024.10.26) + FetchContent_MakeAvailable(lapackpp) +endif() + +# compile sources +add_subdirectory(include) + +add_subdirectory(test) + +# export the configuration +include(rl_config) diff --git a/miniapps/README.md b/miniapps/README.md new file mode 100644 index 0000000..e69de29 diff --git a/miniapps/include/CMakeLists.txt b/miniapps/include/CMakeLists.txt new file mode 100644 index 0000000..b6f2c82 --- /dev/null +++ b/miniapps/include/CMakeLists.txt @@ -0,0 +1,52 @@ +set(miniapps_libs + lapackpp + blaspp + spblas +) +if (miniapps_HAS_OpenMP) + list(APPEND miniapps_libs OpenMP::OpenMP_CXX) +endif() + +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/rl_config.hh.in rl_config.hh) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/rl_config.hh + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/miniapps) + +add_library(miniapps INTERFACE) +target_include_directories(miniapps INTERFACE + $ + $ + $ +) + +set(miniapps_cxx_opts -Wall -Wextra) +if (SANITIZE_ADDRESS) + list(APPEND miniapps_cxx_opts -fsanitize=address) + target_link_options(miniapps INTERFACE -fsanitize=address) +endif() +target_compile_options(miniapps INTERFACE ${miniapps_cxx_opts}) + +target_link_libraries(miniapps INTERFACE ${miniapps_libs}) + +install( + DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" + DESTINATION include FILES_MATCHING PATTERN "*.hh" +) + +install( + DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" + DESTINATION include FILES_MATCHING PATTERN "*.hh" +) + +install( + TARGETS miniapps + EXPORT miniapps + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/miniapps + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} +) + +# install( +# EXPORT miniapps +# DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake +# EXPORT_LINK_INTERFACE_LIBRARIES +# ) diff --git a/miniapps/include/bicgstab.hh b/miniapps/include/bicgstab.hh new file mode 100644 index 0000000..7d6e751 --- /dev/null +++ b/miniapps/include/bicgstab.hh @@ -0,0 +1,161 @@ +#ifndef MINIAPPS_BICGSTAB_HH +#define MINIAPPS_BICGSTAB_HH + +#include +#include +#include +#include + +#include "krylov.hh" +#include "rl_blaspp.hh" +#include "util.hh" +#include + +namespace miniapps { + +template +class BiCGSTAB : public Krylov { +public: + BiCGSTAB(T eps, int max_iters) : Krylov{eps, max_iters} {} + + /** + * BiCGSTAB or the Bi-Conjugate Gradient-Stabilized is a Krylov subspace + * solver. + * + * Being a generic solver, it is capable of solving general matrices, + * including non-s.p.d matrices. Though, the memory and the computational + * requirement of the BiCGSTAB solver are higher than of its s.p.d solver + * counterpart, it has the capability to solve generic systems. It was + * developed by stabilizing the BiCG method. 
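As a quick orientation for the `apply` interface described in this comment, a minimal usage sketch that mirrors the solver tests added later in this diff. The template arguments, CSR arrays, tolerance, and iteration cap are illustrative placeholders.

```cpp
// values/rowptr/colind/shape/nnz are assumed to describe a CSR matrix, e.g.
// built via miniapps::matrix_data and convert_rowind_to_rowptr as in the tests.
spblas::csr_view<double> a(values, rowptr, colind, shape, nnz);
std::vector<double> b(shape[0], 1.0);  // right-hand side
std::vector<double> x(shape[0], 0.0);  // initial guess

miniapps::BiCGSTAB<double> solver(/*eps=*/1e-10, /*max_iters=*/100);
auto [error, iters] = solver.apply(a, b, x);  // {residual norm, iteration count}
```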
+ * + * We implement the BiCGSTAB variant from Section 2.3.8 of "Templates for t + * e Solution of Linear Systems: Building Blocks for Iterative Methods" + * + * @param[in] A the matrix in CSR format + * @param[in] b the right hand side vector + * @param[out] x the solution vector + */ + std::tuple apply(const spblas::csr_view a, + const std::vector b, + std::vector x) override; +}; + +// ----------------------------------------------------------------------------- +template +std::tuple BiCGSTAB::apply(const spblas::csr_view a, + const std::vector b, + std::vector x) { + int iters = 0; + double error = 1.0; + + // r = dense_b + // prev_rho = rho = omega = alpha = beta = gamma = 1.0 + // rr = v = s = t = z = y = p = 0 + std::vector r(b); + std::vector rr(b.size(), 0.0); + std::vector v(b.size(), 0.0); + std::vector s(b.size(), 0.0); + std::vector t(b.size(), 0.0); + std::vector z(b.size(), 0.0); + std::vector y(b.size(), 0.0); + std::vector p(b.size(), 0.0); + std::vector tmp(b.size(), 0.0); + + double prev_rho, rho, omega, alpha, beta, gamma; + prev_rho = rho = omega = alpha = beta = gamma = 1.0; + + // r = b - Ax + spblas::multiply(a, x, tmp); + blas::axpy(r.size(), -1.0, tmp.data(), 1, r.data(), 1); + // rr = r + rr = r; + while (true) { + iters++; + + // rho = dot(rr, r) + rho = blas::dot(r.size(), rr.data(), 1, r.data(), 1); + + error = blas::nrm2(r.size(), r.data(), 1); + if (iters >= this->max_iters || error < this->eps) { + std::cout << "error = " << error << "\n"; + std::cout << "iters = " << iters << "\n"; + std::cout << "eps = " << this->eps << "\n"; + std::cout << "max_iters = " << this->max_iters << "\n"; + break; + } + + if (prev_rho * omega == 0.0) { + p = r; + } else { + // beta = (rho / prev_rho) * (alpha / omega) + beta = (rho / prev_rho) * (alpha / omega); + // p = r + beta * (p - omega * v) + tmp = p; + blas::axpy(tmp.size(), -omega, v.data(), 1, tmp.data(), 1); + blas::scal(tmp.size(), beta, tmp.data(), 1); + blas::axpy(tmp.size(), 1.0, r.data(), 1, tmp.data(), 1); + p = tmp; + } + + // y = preconditioner * p + y = p; + // v = A * y + spblas::multiply(a, y, v); + // beta = dot(rr, v) + beta = blas::dot(v.size(), rr.data(), 1, v.data(), 1); + if (beta == 0.0) { + s = r; + } else { + // alpha = rho / beta + alpha = rho / beta; + // s = r - alpha * v + tmp = v; + blas::scal(tmp.size(), -alpha, tmp.data(), 1); + blas::axpy(tmp.size(), 1.0, r.data(), 1, tmp.data(), 1); + s = tmp; + } + + error = blas::nrm2(s.size(), s.data(), 1); + if (iters >= this->max_iters || error < this->eps) { + std::cout << "error = " << error << "\n"; + std::cout << "iters = " << iters << "\n"; + std::cout << "eps = " << this->eps << "\n"; + std::cout << "max_iters = " << this->max_iters << "\n"; + blas::axpy(x.size(), alpha, y.data(), 1, x.data(), 1); + break; + } + + // z = preconditioner * s + z = s; + // t = A * z + spblas::multiply(a, z, t); + // gamma = dot(t, s) + gamma = blas::dot(s.size(), t.data(), 1, s.data(), 1); + // beta = dot(t, t) + beta = blas::dot(t.size(), t.data(), 1, t.data(), 1); + // omega = gamma / beta + if (beta == 0.0) { + omega = 0.0; + } else { + omega = gamma / beta; + } + // x = x + alpha * y + omega * z + tmp = z; + blas::scal(tmp.size(), omega, tmp.data(), 1); + blas::axpy(tmp.size(), alpha, y.data(), 1, tmp.data(), 1); + blas::axpy(x.size(), 1.0, tmp.data(), 1, x.data(), 1); + + // r = s - omega * t + tmp = t; + blas::scal(tmp.size(), -omega, tmp.data(), 1); + blas::axpy(tmp.size(), 1.0, s.data(), 1, tmp.data(), 1); + r = tmp; + + std::swap(prev_rho, rho); + } + 
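For readers less familiar with BLAS++, a short refresher on the level-1 routines these solver bodies are built from, together with how one representative update is composed. This is explanatory only and not part of the patch.

```cpp
// Standard BLAS semantics of the BLAS++ calls used above:
//   blas::axpy(n, alpha, x, 1, y, 1)  // y := alpha * x + y
//   blas::scal(n, alpha, x, 1)        // x := alpha * x
//   blas::dot (n, x, 1, y, 1)         // returns x . y
//   blas::nrm2(n, x, 1)               // returns ||x||_2
//
// e.g. "p = r + beta * (p - omega * v)" is composed as:
//   tmp = p; axpy(-omega, v, tmp); scal(beta, tmp); axpy(1.0, r, tmp); p = tmp;
```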
return {error, iters}; +} + +} // namespace miniapps + +#endif // MINIAPPS_BICGSTAB_HH diff --git a/miniapps/include/cg.hh b/miniapps/include/cg.hh new file mode 100644 index 0000000..7dbbea1 --- /dev/null +++ b/miniapps/include/cg.hh @@ -0,0 +1,108 @@ +#ifndef MINIAPPS_CG_HH +#define MINIAPPS_CG_HH + +#include +#include +#include +#include +#include + +#include "krylov.hh" +#include "rl_blaspp.hh" +#include "util.hh" +#include + +namespace miniapps { + +template +class CG : public Krylov { +public: + CG(T eps, int max_iters) : Krylov{eps, max_iters} {} + + /** + * CG or the conjugate gradient method is an iterative type Krylov subspace + * method which is suitable for symmetric positive definite methods. + * + * Though this method performs very well for symmetric positive definite + * matrices, it is in general not suitable for general matrices. + * + * We implement the CG variant from Section 2.3.1 of "Templates for the + * Solution of Linear Systems: Building Blocks for Iterative Methods" + * + * @param[in] A the matrix in CSR format + * @param[in] b the right hand side vector + * @param[out] x the solution vector + */ + std::tuple apply(const spblas::csr_view a, + const std::vector b, + std::vector x) override; +}; + +// ----------------------------------------------------------------------------- +template +std::tuple CG::apply(const spblas::csr_view a, + const std::vector b, std::vector x) { + int iters = 0; + double error = 1.0; + + // r = b + // rho = 0.0 + // prev_rho = 1.0 + // p = q = 0 + std::vector r(b); + std::vector z(r); + std::vector p(b.size(), 0); + std::vector q(b.size(), 0); + std::vector tmp(b.size(), 0); + + double alpha, beta, rho; + alpha = beta = rho = 0.0; + double prev_rho = 1.0; + + // r = b-Ax + spblas::multiply(a, x, tmp); + blas::axpy(r.size(), -1.0, tmp.data(), 1, r.data(), 1); + + while (true) { + // z = preconditioner*r + z = r; + // rho = dot(r, z) + rho = blas::dot(r.size(), r.data(), 1, z.data(), 1); + + iters++; + error = blas::nrm2(r.size(), r.data(), 1); + if (iters >= this->max_iters || error < this->eps) { + std::cout << "error = " << error << "\n"; + std::cout << "iters = " << iters << "\n"; + std::cout << "eps = " << this->eps << "\n"; + std::cout << "max_iters = " << this->max_iters << "\n"; + break; + } + + // beta = rho / prev_rho; + // p = z + beta * p + beta = rho / prev_rho; + tmp = p; + blas::scal(tmp.size(), beta, tmp.data(), 1); + blas::axpy(tmp.size(), 1.0, z.data(), 1, tmp.data(), 1); + p = tmp; + + // q = A * p + spblas::multiply(a, p, q); + + // alpha = rho / dot(p, q) + alpha = rho / blas::dot(q.size(), p.data(), 1, q.data(), 1); + + // x = x + alpha * p + // r = r - alpha * q + blas::axpy(x.size(), alpha, p.data(), 1, x.data(), 1); + blas::axpy(r.size(), -alpha, q.data(), 1, r.data(), 1); + + std::swap(rho, prev_rho); + } + return {error, iters}; +} + +} // namespace miniapps + +#endif // MINIAPPS_CG_HH diff --git a/miniapps/include/cqrrpt.hh b/miniapps/include/cqrrpt.hh new file mode 100644 index 0000000..26e3f32 --- /dev/null +++ b/miniapps/include/cqrrpt.hh @@ -0,0 +1,208 @@ +#ifndef miniapps_cqrrpt_h +#define miniapps_cqrrpt_h + +#include +#include +#include +#include + +#include "rl_blaspp.hh" +#include "rl_lapackpp.hh" +#include "util.hh" +#include + +namespace miniapps { + +template +class CQRRPTalg { +public: + virtual ~CQRRPTalg() {} + + virtual int call(int64_t m, int64_t n, T* A, int64_t lda, T* R, int64_t ldr, + int64_t* J, T d_factor) = 0; +}; + +template +class CQRRPT : public CQRRPTalg { +public: + CQRRPT(T ep, 
int64_t nz) { + eps = ep; + nnz = nz; + } + + /// Computes a QR factorization with column pivots of the form: + /// A[:, J] = QR, + /// where Q and R are of size m-by-k and k-by-n, with rank(A) = k. + /// Detailed description of this algorithm may be found in Section 5.1.2. + /// of "the RandLAPACK book". + /// + /// @param[in] m + /// The number of rows in the matrix A. + /// + /// @param[in] n + /// The number of columns in the matrix A. + /// + /// @param[in] A + /// The m-by-n matrix A, stored in a column-major format. + /// + /// @param[in] d + /// Embedding dimension of a sketch, m >= d >= n. + /// + /// @param[in] R + /// Represents the upper-triangular R factor of QR factorization. + /// On entry, is empty and may not have any space allocated for it. + /// + /// @param[out] A + /// Overwritten by an m-by-k orthogonal Q factor. + /// Matrix is stored explicitly. + /// + /// @param[out] R + /// Stores k-by-n matrix with upper-triangular R factor. + /// Zero entries are not compressed. + /// + /// @param[out] J + /// Stores k integer type pivot index extries. + /// + /// @return = 0: successful exit + /// + /// @return = 1: cholesky factorization failed + /// + + int call(int64_t m, int64_t n, T* A, int64_t lda, T* R, int64_t ldr, + int64_t* J, T d_factor) override; + +public: + T eps; + int64_t nnz; + int64_t rank; +}; + +// ----------------------------------------------------------------------------- +template +int CQRRPT::call(int64_t m, int64_t n, T* A, int64_t lda, T* R, int64_t ldr, + int64_t* J, T d_factor) { + int i; + int64_t k = n; + int64_t d = d_factor * n; + // A constant for initial rank estimation. + T eps_initial_rank_estimation = + 2 * std::pow(std::numeric_limits::epsilon(), 0.95); + // Variables for a posteriori rank estimation. + int64_t new_rank; + T running_max, running_min, curr_entry; + + T* A_hat = (T*) calloc(d * n, sizeof(T)); + T* tau = (T*) calloc(n, sizeof(T)); + // Buffer for column pivoting. + std::vector J_buf(n, 0); + + /* RandBLAS style + /// Generating a SASO + RandBLAS::SparseDist DS = {.n_rows = d, .n_cols = m, .vec_nnz = this->nnz}; + RandBLAS::SparseSkOp S(DS, state); + state = RandBLAS::fill_sparse(S); + + /// Applying a SASO + RandBLAS::sketch_general( + Layout::ColMajor, Op::NoTrans, Op::NoTrans, + d, n, m, 1.0, S, 0, 0, A, lda, 0.0, A_hat, d + ); + */ + + /// SparseBLAS style + /// Our sketching operators must be in COO format. + // auto&& [values, rowptr, colind, shape, _] = spblas::generate_coo(d, m, + // nnz); + /// TODO: add COO view + // spblas::coo_view s(values, rowptr, colind, shape, nnz); + + /// Perform dense sketching for the sake of tests passing. + /// This is not the intended behavior for this algorithm, as dense sketching + /// tanks the performance of CQRRPT. + auto [buf, a_shape] = spblas::generate_gaussian(d, m); + spblas::__mdspan::mdspan s(buf.data(), d, m); + + spblas::__mdspan::mdspan a(A, m, n); + spblas::__mdspan::mdspan ahat(A_hat, d, n); + + spblas::multiply(s, a, ahat); + A_hat = ahat.data_handle(); + + /// Performing QRCP on a sketch + lapack::geqp3(d, n, A_hat, d, J, tau); + + /// Using naive rank estimation to ensure that R used for preconditioning is + /// invertible. The actual rank estimate k will be computed a posteriori. + /// Using R[i,i] to approximate the i-th singular value of A_hat. + /// Truncate at the largest i where R[i,i] / R[0,0] >= eps. 
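Stepping outside the implementation for a moment, a minimal usage sketch of the `call()` interface documented above; it mirrors the randnla test added later in this diff. The template argument, sizes, tolerance, and sketching parameters are illustrative, and `A` is assumed to already hold the column-major input matrix.

```cpp
int64_t m = 50, n = 20;
std::vector<double> A(m * n);        // column-major input; overwritten with Q
std::vector<double> R(n * n, 0.0);   // receives the upper-triangular R factor
std::vector<int64_t> J(n, 0);        // receives the pivot indices
double tol = std::pow(std::numeric_limits<double>::epsilon(), 0.85);
int64_t nnz = 2;                     // sketching-operator sparsity parameter
double d_factor = 1.25;              // embedding dimension d = d_factor * n

miniapps::CQRRPT<double> cqrrpt(tol, nnz);
cqrrpt.call(m, n, A.data(), /*lda=*/m, R.data(), /*ldr=*/n, J.data(), d_factor);
int64_t k = cqrrpt.rank;             // numerical rank estimated a posteriori
```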
+ for (i = 0; i < n; ++i) { + if (std::abs(A_hat[i * d + i]) / std::abs(A_hat[0]) < + eps_initial_rank_estimation) { + k = i; + break; + } + } + this->rank = k; + + // Allocating space for a preconditioner buffer. + T* R_sp = (T*) calloc(k * k, sizeof(T)); + /// Extracting a k by k upper-triangular R. + lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R_sp, k); + /// Extracting a k by n R representation (k by k upper-triangular, rest - + /// general) + lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R, ldr); + lapack::lacpy(MatrixType::General, k, n - k, &A_hat[d * k], d, &R[n * k], + ldr); + + // Swap k columns of A with pivots from J + blas::copy(n, J, 1, J_buf.data(), 1); + util::col_swap(m, n, k, A, lda, J_buf); + + // A_pre * R_sp = AP + blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, + Diag::NonUnit, m, k, 1.0, R_sp, k, A, lda); + + // Do Cholesky QR + blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, k, m, 1.0, A, lda, 0.0, + R_sp, k); + lapack::potrf(Uplo::Upper, k, R_sp, k); + + // Re-estimate rank after we have the R-factor form Cholesky QR. + // The strategy here is the same as in naive rank estimation. + // This also automatically takes care of any potentical failures in Cholesky + // factorization. Note that the diagonal of R_sp may not be sorted, so we need + // to keep the running max/min We expect the loss in the orthogonality of Q to + // be approximately equal to u * cond(R_sp)^2, where u is the unit roundoff + // for the numerical type T. + new_rank = k; + running_max = R_sp[0]; + running_min = R_sp[0]; + + for (i = 0; i < k; ++i) { + curr_entry = std::abs(R_sp[i * k + i]); + running_max = std::max(running_max, curr_entry); + running_min = std::min(running_min, curr_entry); + if (running_max / running_min >= + std::sqrt(this->eps / std::numeric_limits::epsilon())) { + new_rank = i - 1; + break; + } + } + + blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, + Diag::NonUnit, m, new_rank, 1.0, R_sp, k, A, lda); + // Get the final R-factor. + blas::trmm(Layout::ColMajor, Side::Left, Uplo::Upper, Op::NoTrans, + Diag::NonUnit, new_rank, n, 1.0, R_sp, k, R, ldr); + + // Set the rank parameter to the value comuted a posteriori. + this->rank = k; + + free(A_hat); + free(R_sp); + free(tau); + + return 0; +} +} // end namespace miniapps +#endif diff --git a/miniapps/include/krylov.hh b/miniapps/include/krylov.hh new file mode 100644 index 0000000..e24f3a5 --- /dev/null +++ b/miniapps/include/krylov.hh @@ -0,0 +1,40 @@ +#ifndef MINIAPPS_KRYLOV_HH +#define MINIAPPS_KRYLOV_HH + +#include + +#include + +namespace miniapps { + +/** + * A very simple Krylov Solver interface for convenience. 
+ * + * @tparam T the value type + */ +template +class Krylov { +public: + Krylov(T eps, int max_iters) : eps{eps}, max_iters{max_iters} {} + + virtual ~Krylov() {} + + virtual std::tuple apply(const spblas::csr_view a, + const std::vector b, + std::vector x) = 0; + + T get_eps() { + return eps; + } + + int get_max_iters() { + return max_iters; + } + + const T eps; + const int max_iters; +}; + +} // namespace miniapps + +#endif // MINIAPPS_KRYLOV_HH diff --git a/miniapps/include/matrix_utils.hh b/miniapps/include/matrix_utils.hh new file mode 100644 index 0000000..50adaa2 --- /dev/null +++ b/miniapps/include/matrix_utils.hh @@ -0,0 +1,151 @@ +#ifndef SPBLAS_MINIAPPS_MATRIX_UTILS_HH +#define SPBLAS_MINIAPPS_MATRIX_UTILS_HH + +#include + +#include +#include +#include +#include + +namespace miniapps { + +template +struct matrix_data_entry { + std::size_t row; + std::size_t col; + T value; +}; + +template +struct matrix_data { + using nonzero_type = matrix_data_entry; + + I num_rows; + I num_cols; + std::vector nonzeros; + + matrix_data(std::vector rowind, std::vector colind, + std::vector values, spblas::index shape) + : num_rows(shape[0]), num_cols(shape[1]) { + assert(rowind.size() == colind.size() && rowind.size() == values.size()); + for (std::size_t i = 0; i < values.size(); i++) { + nonzeros.emplace_back(rowind[i], colind[i], values[i]); + } + } + + auto convert_to_coo() { + auto nnz = nonzeros.size(); + std::vector values(nnz); + std::vector rowind(nnz); + std::vector colind(nnz); + + for (std::size_t i = 0; i < nnz; i++) { + values[i] = nonzeros[i].value; + rowind[i] = nonzeros[i].row; + colind[i] = nonzeros[i].col; + } + return std::tuple(values, rowind, colind, nnz); + } + + void sort_row_major() { + std::sort(begin(nonzeros), end(nonzeros), + [](nonzero_type x, nonzero_type y) { + return std::tie(x.row, x.col) < std::tie(y.row, y.col); + }); + } + + void sum_duplicates() { + this->sort_row_major(); + std::vector new_nonzeros; + if (!nonzeros.empty()) { + new_nonzeros.emplace_back(nonzeros.front().row, nonzeros.front().col, + 0.0); + for (auto entry : nonzeros) { + if (entry.row != new_nonzeros.back().row || + entry.col != new_nonzeros.back().col) { + new_nonzeros.emplace_back(entry.row, entry.col, 0.0); + } + new_nonzeros.back().value += entry.value; + } + nonzeros = std::move(new_nonzeros); + } + } + + void make_symmetric() { + const auto nnz = nonzeros.size(); + // compute (A + op(A^T)) / 2 + for (std::size_t i = 0; i < nnz; i++) { + nonzeros[i].value /= 2.0; + auto entry = nonzeros[i]; + nonzeros.emplace_back(entry.col, entry.row, entry.value); + } + // combine duplicates + this->sum_duplicates(); + } + + void make_diag_dominant(T ratio = 1.0) { + std::vector norms(num_rows); + std::vector diag_positions(num_rows, -1); + std::int64_t i{}; + for (auto entry : nonzeros) { + if (entry.row == entry.col) { + diag_positions[entry.row] = i; + } else { + norms[entry.row] += std::abs(entry.value); + } + i++; + } + for (i = 0; i < num_rows; i++) { + if (norms[i] == 0.0) { + // make sure empty rows don't make the matrix singular + norms[i] = 1.0; + } + if (diag_positions[i] < 0) { + nonzeros.emplace_back(i, i, norms[i] * ratio); + } else { + auto& diag_value = nonzeros[diag_positions[i]].value; + const auto diag_magnitude = std::abs(diag_value); + const auto offdiag_magnitude = norms[i]; + if (diag_magnitude < offdiag_magnitude * ratio) { + const auto scaled_value = + diag_value * (offdiag_magnitude * ratio / diag_magnitude); + if (std::isfinite(scaled_value)) { + diag_value = 
scaled_value; + } else { + diag_value = offdiag_magnitude * ratio; + } + } + } + } + this->sort_row_major(); + } +}; + +template +auto convert_rowind_to_rowptr(std::vector rowind, std::size_t nnz, + spblas::index shape) { + auto num_rows = shape[0]; + std::vector rowptr(num_rows + 1, 0); + assert(rowind.size() == nnz); + + for (I i = 0; i < nnz; i++) { + rowptr[rowind[i]]++; + } + constexpr auto max = std::numeric_limits::max(); + std::size_t partial_sum{}; + for (std::size_t i = 0; i < num_rows + 1; ++i) { + auto this_nnz = i < num_rows ? rowptr[i] : 0; + rowptr[i] = partial_sum; + if (max - partial_sum < this_nnz) { + throw std::exception(); + } + partial_sum += this_nnz; + } + + return rowptr; +} + +} // namespace miniapps + +#endif // SPBLAS_MINIAPPS_MATRIX_UTILS_HH diff --git a/miniapps/include/miniapps.hh b/miniapps/include/miniapps.hh new file mode 100644 index 0000000..291c427 --- /dev/null +++ b/miniapps/include/miniapps.hh @@ -0,0 +1,19 @@ +#ifndef SPBLAS_MINIAPPS_HH +#define SPBLAS_MINIAPPS_HH + +// misc +#include "util.hh" + +// misc +#include "matrix_utils.hh" + +// randnla +#include "cqrrpt.hh" + +// cg +#include "cg.hh" + +// bicgstab +#include "bicgstab.hh" + +#endif // SPBLAS_MINIAPPS_HH diff --git a/miniapps/include/rl_blaspp.hh b/miniapps/include/rl_blaspp.hh new file mode 100644 index 0000000..1ac915e --- /dev/null +++ b/miniapps/include/rl_blaspp.hh @@ -0,0 +1,12 @@ +#ifndef randlapack_blaspp_h +#define randlapack_blaspp_h + +#include + +using Layout = blas::Layout; +using Op = blas::Op; +using Side = blas::Side; +using Diag = blas::Diag; +using Uplo = blas::Uplo; + +#endif diff --git a/miniapps/include/rl_config.hh.in b/miniapps/include/rl_config.hh.in new file mode 100644 index 0000000..68e2991 --- /dev/null +++ b/miniapps/include/rl_config.hh.in @@ -0,0 +1,11 @@ +#ifndef miniapps_config_h +#define miniapps_config_h + +#define miniapps_VERSION "@miniapps_VERSION@" +#define miniapps_VERSION_MAJOR @miniapps_VERSION_MAJOR@ +#define miniapps_VERSION_MINOR @miniapps_VERSION_MINOR@ +#define miniapps_VERSION_PATCH @miniapps_VERSION_PATCH@ + +#CMAKEDEFINE miniapps_HAS_OpenMP + +#endif diff --git a/miniapps/include/rl_lapackpp.hh b/miniapps/include/rl_lapackpp.hh new file mode 100644 index 0000000..ba62260 --- /dev/null +++ b/miniapps/include/rl_lapackpp.hh @@ -0,0 +1,12 @@ +#ifndef randlapack_lapackpp_h +#define randlapack_lapackpp_h + +#define LAPACK_COMPLEX_CPP 1 + +#include + +using Job = lapack::Job; +using MatrixType = lapack::MatrixType; +using Norm = lapack::Norm; + +#endif diff --git a/miniapps/include/util.hh b/miniapps/include/util.hh new file mode 100644 index 0000000..0066136 --- /dev/null +++ b/miniapps/include/util.hh @@ -0,0 +1,36 @@ +#ifndef miniapps_util_h +#define miniapps_util_h + +#include "rl_blaspp.hh" +#include "rl_lapackpp.hh" +#include + +#include +#include +#include +#include +#include + +namespace miniapps::util { + +/// A version of the above function to be used on a vector of integers +template +void col_swap(int64_t m, int64_t n, int64_t k, T* A, int64_t lda, + std::vector idx) { + if (k > n) + throw std::runtime_error("Invalid rank parameter."); + + int64_t i, j; //, l; + for (i = 0, j = 0; i < k; ++i) { + j = idx[i] - 1; + blas::swap(m, &A[i * lda], 1, &A[j * lda], 1); + + // swap idx array elements + // Find idx element with value i and assign it to j + auto it = std::find(idx.begin() + i, idx.begin() + k, i + 1); + idx[it - (idx.begin())] = j + 1; + } +} + +} // namespace miniapps::util +#endif diff --git a/miniapps/test/CMakeLists.txt 
b/miniapps/test/CMakeLists.txt new file mode 100644 index 0000000..733220a --- /dev/null +++ b/miniapps/test/CMakeLists.txt @@ -0,0 +1,2 @@ +add_subdirectory(randnla) +add_subdirectory(solvers) diff --git a/miniapps/test/randnla/CMakeLists.txt b/miniapps/test/randnla/CMakeLists.txt new file mode 100644 index 0000000..8696f44 --- /dev/null +++ b/miniapps/test/randnla/CMakeLists.txt @@ -0,0 +1,7 @@ +add_executable(miniapps_randnla_tests + test_cqrrpt.cc) + +include(GoogleTest) +target_link_libraries(miniapps_randnla_tests miniapps GTest::gtest_main) + +gtest_discover_tests(miniapps_randnla_tests) diff --git a/miniapps/test/randnla/test_cqrrpt.cc b/miniapps/test/randnla/test_cqrrpt.cc new file mode 100644 index 0000000..676a256 --- /dev/null +++ b/miniapps/test/randnla/test_cqrrpt.cc @@ -0,0 +1,142 @@ +#include "miniapps.hh" +#include "rl_blaspp.hh" +#include "rl_lapackpp.hh" +#include + +#include +#include + +class TestCQRRPT : public ::testing::Test { +protected: + virtual void SetUp(){}; + + virtual void TearDown(){}; + + template + struct CQRRPTTestData { + int64_t row; + int64_t col; + int64_t rank; + std::vector A; + std::vector R; + std::vector J; + std::vector A_cpy1; + std::vector A_cpy2; + std::vector I_ref; + + CQRRPTTestData(int64_t m, int64_t n, int64_t k) + : A(m * n, 0.0), R(n * n, 0.0), J(n, 0), A_cpy1(m * n, 0.0), + A_cpy2(m * n, 0.0), I_ref(k * k, 0.0) { + row = m; + col = n; + rank = k; + } + }; + + template + static void norm_and_copy_computational_helper(T& norm_A, + CQRRPTTestData& all_data) { + auto m = all_data.row; + auto n = all_data.col; + + lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, + all_data.A_cpy1.data(), m); + lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, + all_data.A_cpy2.data(), m); + norm_A = lapack::lange(Norm::Fro, m, n, all_data.A.data(), m); + } + + template + static void error_check(T& norm_A, CQRRPTTestData& all_data) { + + auto m = all_data.row; + auto n = all_data.col; + auto k = all_data.rank; + + all_data.I_ref.resize(k * k, 0.0); + for (int i = 0; i < k; ++i) + all_data.I_ref[(k + 1) * i] = 1.0; + + T* A_dat = all_data.A_cpy1.data(); + T const* A_cpy_dat = all_data.A_cpy2.data(); + T const* Q_dat = all_data.A.data(); + T const* R_dat = all_data.R.data(); + T* I_ref_dat = all_data.I_ref.data(); + + // Check orthogonality of Q + // Q' * Q - I = 0 + blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, k, m, 1.0, Q_dat, m, + -1.0, I_ref_dat, k); + T norm_QTQ = lapack::lansy(lapack::Norm::Fro, Uplo::Upper, k, I_ref_dat, k); + + // A - QR + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, n, k, 1.0, Q_dat, + m, R_dat, n, -1.0, A_dat, m); + + // Implementing max col norm metric + T max_col_norm = 0.0; + T col_norm = 0.0; + int max_idx = 0; + for (int i = 0; i < n; ++i) { + col_norm = blas::nrm2(m, &A_dat[m * i], 1); + if (max_col_norm < col_norm) { + max_col_norm = col_norm; + max_idx = i; + } + } + T col_norm_A = blas::nrm2(n, &A_cpy_dat[m * max_idx], 1); + T norm_AQR = lapack::lange(Norm::Fro, m, n, A_dat, m); + + printf("REL NORM OF AP - QR: %15e\n", norm_AQR / norm_A); + printf("MAX COL NORM METRIC: %15e\n", max_col_norm / col_norm_A); + printf("FRO NORM OF (Q'Q - I): %2e\n\n", norm_QTQ); + + T atol = std::pow(std::numeric_limits::epsilon(), 0.75); + ASSERT_NEAR(norm_AQR / norm_A, 0.0, atol); + ASSERT_NEAR(max_col_norm / col_norm_A, 0.0, atol); + ASSERT_NEAR(norm_QTQ, 0.0, atol); + } + + /// General test for CQRRPT: + /// Computes QR factorzation, and computes A[:, J] - QR. 
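For reference, the three acceptance checks performed by `error_check` above, written out informally (with `atol = eps^0.75` and `P` the column permutation given by the pivot vector `J`). These restate the code and are not part of the patch.

```cpp
//   ||A P - Q R||_F / ||A||_F                   <= atol   (factorization residual)
//   max_i ||(A P - Q R)(:, i)||_2 / col_norm_A  <= atol   (worst-column metric,
//                                                          relative to that column of A P)
//   ||Q^T Q - I_k||_F                           <= atol   (orthogonality of Q)
```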
+ template + static void test_CQRRPT_general(T d_factor, T norm_A, + CQRRPTTestData& all_data, + alg_type& CQRRPT) { + + auto m = all_data.row; + auto n = all_data.col; + + CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, + all_data.J.data(), d_factor); + all_data.rank = CQRRPT.rank; + + printf("RANK AS RETURNED BY CQRRPT %ld\n", all_data.rank); + + miniapps::util::col_swap(m, n, n, all_data.A_cpy1.data(), m, all_data.J); + miniapps::util::col_swap(m, n, n, all_data.A_cpy2.data(), m, all_data.J); + + error_check(norm_A, all_data); + } +}; + +TEST_F(TestCQRRPT, CQRRPT_full_rank_no_hqrrp) { + int64_t m = 50; + int64_t n = 20; + int64_t k = 20; + double d_factor = 1.25; + double norm_A = 0; + int64_t nnz = 2; + double tol = std::pow(std::numeric_limits::epsilon(), 0.85); + + CQRRPTTestData all_data(m, n, k); + miniapps::CQRRPT CQRRPT(tol, nnz); + // Generate dense matrix + auto [buf, a_shape] = spblas::generate_dense(m, n); + + lapack::lacpy(MatrixType::General, m, n, buf.data(), m, all_data.A.data(), m); + + norm_and_copy_computational_helper(norm_A, all_data); + test_CQRRPT_general>(d_factor, norm_A, + all_data, CQRRPT); +} diff --git a/miniapps/test/solvers/CMakeLists.txt b/miniapps/test/solvers/CMakeLists.txt new file mode 100644 index 0000000..70a99d3 --- /dev/null +++ b/miniapps/test/solvers/CMakeLists.txt @@ -0,0 +1,8 @@ +add_executable(miniapps_solvers_tests + test_cg.cpp + test_bicgstab.cpp) + +target_link_libraries(miniapps_solvers_tests miniapps GTest::gtest_main) + +include(GoogleTest) +gtest_discover_tests(miniapps_solvers_tests) diff --git a/miniapps/test/solvers/test_bicgstab.cpp b/miniapps/test/solvers/test_bicgstab.cpp new file mode 100644 index 0000000..999702c --- /dev/null +++ b/miniapps/test/solvers/test_bicgstab.cpp @@ -0,0 +1,73 @@ +#include "miniapps.hh" +#include + +#include +#include +#include + +#include + +template +class TestBiCGSTAB : public testing::Test { +public: + auto generate_problem(std::size_t m, std::size_t n, std::size_t nnz_input, + std::size_t seed = 0) { + auto&& [values_orig, rowind_orig, colind_orig, shape, nnz_orig] = + spblas::generate_coo(m, n, nnz_input, seed); + miniapps::matrix_data data(rowind_orig, colind_orig, values_orig, shape); + auto&& [values, rowind, colind, nnz] = data.convert_to_coo(); + auto rowptr = miniapps::convert_rowind_to_rowptr(colind, nnz, shape); + return std::tuple(values, rowptr, colind, shape, nnz); + } + + auto generate_spd_problem(std::size_t m, std::size_t n, std::size_t nnz_input, + std::size_t seed = 0) { + auto&& [values_orig, rowind_orig, colind_orig, shape, nnz_orig] = + spblas::generate_coo(m, n, nnz_input, seed); + miniapps::matrix_data data(rowind_orig, colind_orig, values_orig, shape); + data.sort_row_major(); + data.make_symmetric(); + data.make_diag_dominant(); + auto&& [values, rowind, colind, nnz] = data.convert_to_coo(); + auto rowptr = miniapps::convert_rowind_to_rowptr(colind, nnz, shape); + return std::tuple(values, rowptr, colind, shape, nnz); + } +}; + +using BiCGSTABTestTypes = ::testing::Types; +TYPED_TEST_SUITE(TestBiCGSTAB, BiCGSTABTestTypes); + +TYPED_TEST(TestBiCGSTAB, ConvergesForSmallSystem) { + using T = TypeParam; + constexpr double tol = std::is_same::value ? 
1e-14 : 1e-7; + constexpr int max_iters = 100; + std::tuple res; + auto&& [values, rowptr, colind, shape, nnz] = + this->generate_problem(10, 10, 42, 75); + spblas::csr_view a(values, rowptr, colind, shape, nnz); + std::vector b(10, 1.0); + std::vector x(10, 0.0); + + miniapps::BiCGSTAB BiCGSTAB(tol, max_iters); + auto [error, iters] = BiCGSTAB.apply(a, b, x); + + ASSERT_LE(error, tol); + ASSERT_LE(iters, max_iters); +} + +TYPED_TEST(TestBiCGSTAB, ConvergesForLargeSystem) { + using T = TypeParam; + constexpr double tol = std::is_same::value ? 1e-14 : 1e-7; + constexpr int max_iters = 100; + std::vector b(1000, 1.0); + std::vector x(1000, 0.0); + auto&& [values, rowptr, colind, shape, nnz] = + this->generate_spd_problem(1000, 1000, 12345, 75); + spblas::csr_view a(values, rowptr, colind, shape, nnz); + + miniapps::BiCGSTAB BiCGSTAB(tol, max_iters); + auto [error, iters] = BiCGSTAB.apply(a, b, x); + + ASSERT_LE(error, tol); + ASSERT_LE(iters, max_iters); +} diff --git a/miniapps/test/solvers/test_cg.cpp b/miniapps/test/solvers/test_cg.cpp new file mode 100644 index 0000000..0e159ca --- /dev/null +++ b/miniapps/test/solvers/test_cg.cpp @@ -0,0 +1,66 @@ +#include "miniapps.hh" +#include + +#include +#include +#include + +#include + +template +class TestCG : public testing::Test { +public: + using ValueType = T; + + auto generate_spd_problem(std::size_t m, std::size_t n, std::size_t nnz_input, + std::size_t seed = 0) { + auto&& [values_orig, rowind_orig, colind_orig, shape, nnz_orig] = + spblas::generate_coo(m, n, nnz_input, seed); + miniapps::matrix_data data(rowind_orig, colind_orig, values_orig, shape); + data.sort_row_major(); + data.make_symmetric(); + data.make_diag_dominant(); + auto&& [values, rowind, colind, nnz] = data.convert_to_coo(); + auto rowptr = miniapps::convert_rowind_to_rowptr(colind, nnz, shape); + return std::tuple(values, rowptr, colind, shape, nnz); + } +}; + +using CGTestTypes = ::testing::Types; +TYPED_TEST_SUITE(TestCG, CGTestTypes); + +TYPED_TEST(TestCG, ConvergesForSmallSystem) { + using T = TypeParam; + constexpr double tol = std::is_same::value ? 1e-14 : 1e-7; + constexpr int max_iters = 100; + std::vector b(10, 1.0); + std::vector x(10, 0.0); + + auto&& [values, rowptr, colind, shape, nnz] = + this->generate_spd_problem(10, 10, 42, 75); + spblas::csr_view a(values, rowptr, colind, shape, nnz); + + miniapps::CG CG(tol, max_iters); + auto [error, iters] = CG.apply(a, b, x); + + ASSERT_LE(error, tol); + ASSERT_LE(iters, max_iters); +} + +TYPED_TEST(TestCG, ConvergesForLargeSystem) { + using T = TypeParam; + constexpr double tol = std::is_same::value ? 1e-14 : 1e-7; + constexpr int max_iters = 100; + std::vector b(1000, 1.0); + std::vector x(1000, 0.0); + + auto&& [values, rowptr, colind, shape, nnz] = + this->generate_spd_problem(1000, 1000, 12345, 75); + spblas::csr_view a(values, rowptr, colind, shape, nnz); + + miniapps::CG CG(tol, max_iters); + auto [error, iters] = CG.apply(a, b, x); + + ASSERT_LE(error, tol); + ASSERT_LE(iters, max_iters); +}
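Finally, a small worked example of the `rowind` to `rowptr` conversion from `matrix_utils.hh` that these tests rely on; the numbers are illustrative only.

```cpp
// convert_rowind_to_rowptr counts nonzeros per row and takes an exclusive
// prefix sum.  For a 4-row matrix with nnz = 4:
//   rowind         = {0, 0, 1, 3}
//   per-row counts = {2, 1, 0, 1}
//   rowptr         = {0, 2, 3, 3, 4}   // rowptr[i+1] - rowptr[i] = nnz in row i
```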