From 23d26bc1ad79e9d1ef1f45ded7e7425b921027a2 Mon Sep 17 00:00:00 2001 From: Terry Cojean Date: Wed, 26 Feb 2025 23:23:14 +0100 Subject: [PATCH 1/5] Merge the miniapps as an optional This facilitates contribution, keeping the repositories in sync, and exapnds from simple examples. Cleanup is necessary to make this more convenient for CI and more consistent. Co-authored-by: Max Melnichenko --- CMakeLists.txt | 6 + miniapps/CMake/blaspp.cmake | 14 ++ miniapps/CMake/lapackpp.cmake | 14 ++ miniapps/CMake/miniappsConfig.cmake.in | 22 ++ miniapps/CMake/miniappsConfigVersion.cmake.in | 11 + miniapps/CMake/rl_build_options.cmake | 29 +++ miniapps/CMake/rl_config.cmake | 11 + miniapps/CMake/rl_version.cmake | 27 +++ miniapps/CMakeLists.txt | 30 +++ miniapps/README.md | 0 miniapps/include/CMakeLists.txt | 52 +++++ miniapps/include/bicgstab.hh | 160 +++++++++++++ miniapps/include/cg.hh | 107 +++++++++ miniapps/include/cqrrpt.hh | 221 ++++++++++++++++++ miniapps/include/krylov.hh | 36 +++ miniapps/include/matrix_utils.hh | 149 ++++++++++++ miniapps/include/miniapps.hh | 19 ++ miniapps/include/rl_blaspp.hh | 12 + miniapps/include/rl_config.hh.in | 11 + miniapps/include/rl_lapackpp.hh | 12 + miniapps/include/util.hh | 42 ++++ miniapps/test/CMakeLists.txt | 2 + miniapps/test/randnla/CMakeLists.txt | 7 + miniapps/test/randnla/test_cqrrpt.cc | 147 ++++++++++++ miniapps/test/solvers/CMakeLists.txt | 8 + miniapps/test/solvers/test_bicgstab.cpp | 72 ++++++ miniapps/test/solvers/test_cg.cpp | 65 ++++++ 27 files changed, 1286 insertions(+) create mode 100644 miniapps/CMake/blaspp.cmake create mode 100644 miniapps/CMake/lapackpp.cmake create mode 100644 miniapps/CMake/miniappsConfig.cmake.in create mode 100644 miniapps/CMake/miniappsConfigVersion.cmake.in create mode 100644 miniapps/CMake/rl_build_options.cmake create mode 100644 miniapps/CMake/rl_config.cmake create mode 100644 miniapps/CMake/rl_version.cmake create mode 100644 miniapps/CMakeLists.txt create mode 100644 miniapps/README.md create mode 100644 miniapps/include/CMakeLists.txt create mode 100644 miniapps/include/bicgstab.hh create mode 100644 miniapps/include/cg.hh create mode 100644 miniapps/include/cqrrpt.hh create mode 100644 miniapps/include/krylov.hh create mode 100644 miniapps/include/matrix_utils.hh create mode 100644 miniapps/include/miniapps.hh create mode 100644 miniapps/include/rl_blaspp.hh create mode 100644 miniapps/include/rl_config.hh.in create mode 100644 miniapps/include/rl_lapackpp.hh create mode 100644 miniapps/include/util.hh create mode 100644 miniapps/test/CMakeLists.txt create mode 100644 miniapps/test/randnla/CMakeLists.txt create mode 100644 miniapps/test/randnla/test_cqrrpt.cc create mode 100644 miniapps/test/solvers/CMakeLists.txt create mode 100644 miniapps/test/solvers/test_bicgstab.cpp create mode 100644 miniapps/test/solvers/test_cg.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 5734ab3..c439e1f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,8 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_FLAGS "-O3 -march=native") +option(SPBLAS_BUILD_MINIAPPS "Also build the miniapps. Default: OFF." OFF) + # Get includes, which declares the `spblas` library add_subdirectory(include) @@ -90,3 +92,7 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) add_subdirectory(examples) add_subdirectory(test) endif() + +if (SPBLAS_BUILD_MINIAPPS) + add_subdirectory(miniapps) +endif() diff --git a/miniapps/CMake/blaspp.cmake b/miniapps/CMake/blaspp.cmake new file mode 100644 index 0000000..986763a --- /dev/null +++ b/miniapps/CMake/blaspp.cmake @@ -0,0 +1,14 @@ +message(STATUS "\n\n-- miniapps checking for blaspp ... ") +find_package(blaspp REQUIRED) +message(STATUS "miniapps found blaspp ${blaspp_VERSION}\n") + +# interface library for use elsewhere in the project +add_library(miniapps_blaspp INTERFACE) + +target_link_libraries(miniapps_blaspp INTERFACE blaspp) + +install(TARGETS miniapps_blaspp EXPORT miniapps_blaspp) + +install(EXPORT miniapps_blaspp + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake" + EXPORT_LINK_INTERFACE_LIBRARIES) \ No newline at end of file diff --git a/miniapps/CMake/lapackpp.cmake b/miniapps/CMake/lapackpp.cmake new file mode 100644 index 0000000..e7d9827 --- /dev/null +++ b/miniapps/CMake/lapackpp.cmake @@ -0,0 +1,14 @@ +message(STATUS "\n\n-- miniapps checking for lapackpp ... ") +find_package(lapackpp REQUIRED) +message(STATUS "miniapps found lapackpp ${lapackpp_VERSION}\n") + +# interface library for use elsewhere in the project +add_library(miniapps_lapackpp INTERFACE) + +target_link_libraries(miniapps_lapackpp INTERFACE lapackpp) + +install(TARGETS miniapps_lapackpp EXPORT miniapps_lapackpp) + +install(EXPORT miniapps_lapackpp + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake" + EXPORT_LINK_INTERFACE_LIBRARIES) diff --git a/miniapps/CMake/miniappsConfig.cmake.in b/miniapps/CMake/miniappsConfig.cmake.in new file mode 100644 index 0000000..6e89b67 --- /dev/null +++ b/miniapps/CMake/miniappsConfig.cmake.in @@ -0,0 +1,22 @@ +include(CMakeFindDependencyMacro) +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}") + +set(miniapps_VERSION "@miniapps_VERSION@") +set(miniapps_VERSION_MAJOR "@miniapps_VERSION_MAJOR@") +set(miniapps_VERSION_MINOR "@miniapps_VERSION_MINOR@") +set(miniapps_VERSION_PATCH "@miniapps_VERSION_PATCH@") +set(miniapps_VERSION_DEVEL "@miniapps_VERSION_DEVEL@") + +# blas++ +if (NOT blaspp_DIR) + set(blaspp_DIR @blaspp_DIR@) +endif() +find_dependency(blaspp) + +# lapack++ +if (NOT lapackpp_DIR) + set(lapackpp_DIR @lapackpp_DIR@) +endif() +find_dependency(lapackpp) + +include(miniapps) diff --git a/miniapps/CMake/miniappsConfigVersion.cmake.in b/miniapps/CMake/miniappsConfigVersion.cmake.in new file mode 100644 index 0000000..582661d --- /dev/null +++ b/miniapps/CMake/miniappsConfigVersion.cmake.in @@ -0,0 +1,11 @@ +set(PACKAGE_VERSION "@miniapps_VERSION@") + +# Check whether the requested PACKAGE_FIND_VERSION is compatible +if("${PACKAGE_VERSION}" VERSION_LESS "${PACKAGE_FIND_VERSION}") + set(PACKAGE_VERSION_COMPATIBLE FALSE) +else() + set(PACKAGE_VERSION_COMPATIBLE TRUE) + if ("${PACKAGE_VERSION}" VERSION_EQUAL "${PACKAGE_FIND_VERSION}") + set(PACKAGE_VERSION_EXACT TRUE) + endif() +endif() diff --git a/miniapps/CMake/rl_build_options.cmake b/miniapps/CMake/rl_build_options.cmake new file mode 100644 index 0000000..ff412d1 --- /dev/null +++ b/miniapps/CMake/rl_build_options.cmake @@ -0,0 +1,29 @@ +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +option(BUILD_SHARED_LIBS OFF "Configure to build shared or static libraries") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release" + CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel." FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") +endif() + +set(SANITIZE_ADDRESS OFF CACHE BOOL "Add address sanitizer flags to the library") + +message(STATUS "Checking for OpenMP ... ") +find_package(OpenMP COMPONENTS CXX) + +set(tmp FALSE) +if (OpenMP_CXX_FOUND) + set(tmp TRUE) +endif() + +set(miniapps_HAS_OpenMP ${tmp} CACHE BOOL "Set if we have a working OpenMP") + +include(GNUInstallDirs) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}") +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_BINDIR}") + diff --git a/miniapps/CMake/rl_config.cmake b/miniapps/CMake/rl_config.cmake new file mode 100644 index 0000000..8df22e8 --- /dev/null +++ b/miniapps/CMake/rl_config.cmake @@ -0,0 +1,11 @@ + +configure_file(CMake/miniappsConfig.cmake.in + ${CMAKE_INSTALL_LIBDIR}/cmake/miniappsConfig.cmake @ONLY) + +configure_file(CMake/miniappsConfigVersion.cmake.in + ${CMAKE_INSTALL_LIBDIR}/cmake/miniappsConfigVersion.cmake @ONLY) + +install(FILES + ${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}/cmake/miniappsConfig.cmake + ${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}/cmake/miniappsConfigVersion.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake) diff --git a/miniapps/CMake/rl_version.cmake b/miniapps/CMake/rl_version.cmake new file mode 100644 index 0000000..4dc2f3c --- /dev/null +++ b/miniapps/CMake/rl_version.cmake @@ -0,0 +1,27 @@ +set(tmp) +find_package(Git QUIET) +if(GIT_FOUND) + execute_process(COMMAND ${GIT_EXECUTABLE} + --git-dir=${CMAKE_SOURCE_DIR}/.git describe + --tags --match "[0-9]*.[0-9]*.[0-9]*" + OUTPUT_VARIABLE tmp OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET) +endif() +if(NOT tmp) + set(tmp "0.0.0") +endif() + +set(miniapps_VERSION ${tmp} CACHE STRING "miniapps version" FORCE) + +string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*$)" + "\\1" miniapps_VERSION_MAJOR ${miniapps_VERSION}) + +string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*$)" + "\\2" miniapps_VERSION_MINOR ${miniapps_VERSION}) + +string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*$)" + "\\3" miniapps_VERSION_PATCH ${miniapps_VERSION}) + +message(STATUS "miniapps_VERSION_MAJOR=${miniapps_VERSION_MAJOR}") +message(STATUS "miniapps_VERSION_MINOR=${miniapps_VERSION_MINOR}") +message(STATUS "miniapps_VERSION_PATCH=${miniapps_VERSION_PATCH}") diff --git a/miniapps/CMakeLists.txt b/miniapps/CMakeLists.txt new file mode 100644 index 0000000..b3cd1b4 --- /dev/null +++ b/miniapps/CMakeLists.txt @@ -0,0 +1,30 @@ +cmake_minimum_required(VERSION 3.20) + +project(miniapps) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMake") + +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# configure the build +enable_testing() + +include(rl_build_options) +include(rl_version) + +# find dependencies +find_package(lapackpp REQUIRED) + +# find dependencies +find_package(blaspp REQUIRED) + +# compile sources +add_subdirectory(include) + +if (ENABLE_TESTING) + add_subdirectory(test) +endif() + +# export the configuration +include(rl_config) diff --git a/miniapps/README.md b/miniapps/README.md new file mode 100644 index 0000000..e69de29 diff --git a/miniapps/include/CMakeLists.txt b/miniapps/include/CMakeLists.txt new file mode 100644 index 0000000..b6f2c82 --- /dev/null +++ b/miniapps/include/CMakeLists.txt @@ -0,0 +1,52 @@ +set(miniapps_libs + lapackpp + blaspp + spblas +) +if (miniapps_HAS_OpenMP) + list(APPEND miniapps_libs OpenMP::OpenMP_CXX) +endif() + +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/rl_config.hh.in rl_config.hh) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/rl_config.hh + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/miniapps) + +add_library(miniapps INTERFACE) +target_include_directories(miniapps INTERFACE + $ + $ + $ +) + +set(miniapps_cxx_opts -Wall -Wextra) +if (SANITIZE_ADDRESS) + list(APPEND miniapps_cxx_opts -fsanitize=address) + target_link_options(miniapps INTERFACE -fsanitize=address) +endif() +target_compile_options(miniapps INTERFACE ${miniapps_cxx_opts}) + +target_link_libraries(miniapps INTERFACE ${miniapps_libs}) + +install( + DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" + DESTINATION include FILES_MATCHING PATTERN "*.hh" +) + +install( + DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" + DESTINATION include FILES_MATCHING PATTERN "*.hh" +) + +install( + TARGETS miniapps + EXPORT miniapps + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/miniapps + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} +) + +# install( +# EXPORT miniapps +# DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake +# EXPORT_LINK_INTERFACE_LIBRARIES +# ) diff --git a/miniapps/include/bicgstab.hh b/miniapps/include/bicgstab.hh new file mode 100644 index 0000000..09b11d5 --- /dev/null +++ b/miniapps/include/bicgstab.hh @@ -0,0 +1,160 @@ +#ifndef MINIAPPS_BICGSTAB_HH +#define MINIAPPS_BICGSTAB_HH + +#include +#include +#include +#include + +#include "krylov.hh" +#include "rl_blaspp.hh" +#include "util.hh" +#include + +namespace miniapps { + +template class BiCGSTAB : public Krylov { +public: + BiCGSTAB(T eps, int max_iters) : Krylov{eps, max_iters} {} + + /** + * BiCGSTAB or the Bi-Conjugate Gradient-Stabilized is a Krylov subspace + * solver. + * + * Being a generic solver, it is capable of solving general matrices, + * including non-s.p.d matrices. Though, the memory and the computational + * requirement of the BiCGSTAB solver are higher than of its s.p.d solver + * counterpart, it has the capability to solve generic systems. It was + * developed by stabilizing the BiCG method. + * + * We implement the BiCGSTAB variant from Section 2.3.8 of "Templates for t + * e Solution of Linear Systems: Building Blocks for Iterative Methods" + * + * @param[in] A the matrix in CSR format + * @param[in] b the right hand side vector + * @param[out] x the solution vector + */ + std::tuple apply(const spblas::csr_view a, + const std::vector b, + std::vector x) override; +}; + +// ----------------------------------------------------------------------------- +template +std::tuple BiCGSTAB::apply(const spblas::csr_view a, + const std::vector b, + std::vector x) { + int iters = 0; + double error = 1.0; + + // r = dense_b + // prev_rho = rho = omega = alpha = beta = gamma = 1.0 + // rr = v = s = t = z = y = p = 0 + std::vector r(b); + std::vector rr(b.size(), 0.0); + std::vector v(b.size(), 0.0); + std::vector s(b.size(), 0.0); + std::vector t(b.size(), 0.0); + std::vector z(b.size(), 0.0); + std::vector y(b.size(), 0.0); + std::vector p(b.size(), 0.0); + std::vector tmp(b.size(), 0.0); + + double prev_rho, rho, omega, alpha, beta, gamma; + prev_rho = rho = omega = alpha = beta = gamma = 1.0; + + // r = b - Ax + spblas::multiply(a, x, tmp); + blas::axpy(r.size(), -1.0, tmp.data(), 1, r.data(), 1); + // rr = r + rr = r; + while (true) { + iters++; + + // rho = dot(rr, r) + rho = blas::dot(r.size(), rr.data(), 1, r.data(), 1); + + error = blas::nrm2(r.size(), r.data(), 1); + if (iters >= this->max_iters || error < this->eps) { + std::cout << "error = " << error << "\n"; + std::cout << "iters = " << iters << "\n"; + std::cout << "eps = " << this->eps << "\n"; + std::cout << "max_iters = " << this->max_iters << "\n"; + break; + } + + if (prev_rho * omega == 0.0) { + p = r; + } else { + // beta = (rho / prev_rho) * (alpha / omega) + beta = (rho / prev_rho) * (alpha / omega); + // p = r + beta * (p - omega * v) + tmp = p; + blas::axpy(tmp.size(), -omega, v.data(), 1, tmp.data(), 1); + blas::scal(tmp.size(), beta, tmp.data(), 1); + blas::axpy(tmp.size(), 1.0, r.data(), 1, tmp.data(), 1); + p = tmp; + } + + // y = preconditioner * p + y = p; + // v = A * y + spblas::multiply(a, y, v); + // beta = dot(rr, v) + beta = blas::dot(v.size(), rr.data(), 1, v.data(), 1); + if (beta == 0.0) { + s = r; + } else { + // alpha = rho / beta + alpha = rho / beta; + // s = r - alpha * v + tmp = v; + blas::scal(tmp.size(), -alpha, tmp.data(), 1); + blas::axpy(tmp.size(), 1.0, r.data(), 1, tmp.data(), 1); + s = tmp; + } + + error = blas::nrm2(s.size(), s.data(), 1); + if (iters >= this->max_iters || error < this->eps) { + std::cout << "error = " << error << "\n"; + std::cout << "iters = " << iters << "\n"; + std::cout << "eps = " << this->eps << "\n"; + std::cout << "max_iters = " << this->max_iters << "\n"; + blas::axpy(x.size(), alpha, y.data(), 1, x.data(), 1); + break; + } + + // z = preconditioner * s + z = s; + // t = A * z + spblas::multiply(a, z, t); + // gamma = dot(t, s) + gamma = blas::dot(s.size(), t.data(), 1, s.data(), 1); + // beta = dot(t, t) + beta = blas::dot(t.size(), t.data(), 1, t.data(), 1); + // omega = gamma / beta + if (beta == 0.0) { + omega = 0.0; + } else { + omega = gamma / beta; + } + // x = x + alpha * y + omega * z + tmp = z; + blas::scal(tmp.size(), omega, tmp.data(), 1); + blas::axpy(tmp.size(), alpha, y.data(), 1, tmp.data(), 1); + blas::axpy(x.size(), 1.0, tmp.data(), 1, x.data(), 1); + + // r = s - omega * t + tmp = t; + blas::scal(tmp.size(), -omega, tmp.data(), 1); + blas::axpy(tmp.size(), 1.0, s.data(), 1, tmp.data(), 1); + r = tmp; + + std::swap(prev_rho, rho); + } + return {error, iters}; +} + +} // namespace miniapps + +#endif // MINIAPPS_BICGSTAB_HH diff --git a/miniapps/include/cg.hh b/miniapps/include/cg.hh new file mode 100644 index 0000000..560ae56 --- /dev/null +++ b/miniapps/include/cg.hh @@ -0,0 +1,107 @@ +#ifndef MINIAPPS_CG_HH +#define MINIAPPS_CG_HH + +#include +#include +#include +#include +#include + +#include "krylov.hh" +#include "rl_blaspp.hh" +#include "util.hh" +#include + +namespace miniapps { + +template class CG : public Krylov { +public: + CG(T eps, int max_iters) : Krylov{eps, max_iters} {} + + /** + * CG or the conjugate gradient method is an iterative type Krylov subspace + * method which is suitable for symmetric positive definite methods. + * + * Though this method performs very well for symmetric positive definite + * matrices, it is in general not suitable for general matrices. + * + * We implement the CG variant from Section 2.3.1 of "Templates for the + * Solution of Linear Systems: Building Blocks for Iterative Methods" + * + * @param[in] A the matrix in CSR format + * @param[in] b the right hand side vector + * @param[out] x the solution vector + */ + std::tuple apply(const spblas::csr_view a, + const std::vector b, + std::vector x) override; +}; + +// ----------------------------------------------------------------------------- +template +std::tuple CG::apply(const spblas::csr_view a, + const std::vector b, std::vector x) { + int iters = 0; + double error = 1.0; + + // r = b + // rho = 0.0 + // prev_rho = 1.0 + // p = q = 0 + std::vector r(b); + std::vector z(r); + std::vector p(b.size(), 0); + std::vector q(b.size(), 0); + std::vector tmp(b.size(), 0); + + double alpha, beta, rho; + alpha = beta = rho = 0.0; + double prev_rho = 1.0; + + // r = b-Ax + spblas::multiply(a, x, tmp); + blas::axpy(r.size(), -1.0, tmp.data(), 1, r.data(), 1); + + while (true) { + // z = preconditioner*r + z = r; + // rho = dot(r, z) + rho = blas::dot(r.size(), r.data(), 1, z.data(), 1); + + iters++; + error = blas::nrm2(r.size(), r.data(), 1); + if (iters >= this->max_iters || error < this->eps) { + std::cout << "error = " << error << "\n"; + std::cout << "iters = " << iters << "\n"; + std::cout << "eps = " << this->eps << "\n"; + std::cout << "max_iters = " << this->max_iters << "\n"; + break; + } + + // beta = rho / prev_rho; + // p = z + beta * p + beta = rho / prev_rho; + tmp = p; + blas::scal(tmp.size(), beta, tmp.data(), 1); + blas::axpy(tmp.size(), 1.0, z.data(), 1, tmp.data(), 1); + p = tmp; + + // q = A * p + spblas::multiply(a, p, q); + + // alpha = rho / dot(p, q) + alpha = rho / blas::dot(q.size(), p.data(), 1, q.data(), 1); + + // x = x + alpha * p + // r = r - alpha * q + blas::axpy(x.size(), alpha, p.data(), 1, x.data(), 1); + blas::axpy(r.size(), -alpha, q.data(), 1, r.data(), 1); + + std::swap(rho, prev_rho); + } + return {error, iters}; +} + +} // namespace miniapps + +#endif // MINIAPPS_CG_HH diff --git a/miniapps/include/cqrrpt.hh b/miniapps/include/cqrrpt.hh new file mode 100644 index 0000000..cc1c380 --- /dev/null +++ b/miniapps/include/cqrrpt.hh @@ -0,0 +1,221 @@ +#ifndef miniapps_cqrrpt_h +#define miniapps_cqrrpt_h + +#include +#include +#include +#include + +#include "util.hh" +#include "rl_blaspp.hh" +#include "rl_lapackpp.hh" +#include + +namespace miniapps { + +template +class CQRRPTalg { + public: + + virtual ~CQRRPTalg() {} + + virtual int call( + int64_t m, + int64_t n, + T* A, + int64_t lda, + T* R, + int64_t ldr, + int64_t* J, + T d_factor + ) = 0; +}; + +template +class CQRRPT : public CQRRPTalg { + public: + CQRRPT(T ep, int64_t nz) { + eps = ep; + nnz = nz; + } + + /// Computes a QR factorization with column pivots of the form: + /// A[:, J] = QR, + /// where Q and R are of size m-by-k and k-by-n, with rank(A) = k. + /// Detailed description of this algorithm may be found in Section 5.1.2. + /// of "the RandLAPACK book". + /// + /// @param[in] m + /// The number of rows in the matrix A. + /// + /// @param[in] n + /// The number of columns in the matrix A. + /// + /// @param[in] A + /// The m-by-n matrix A, stored in a column-major format. + /// + /// @param[in] d + /// Embedding dimension of a sketch, m >= d >= n. + /// + /// @param[in] R + /// Represents the upper-triangular R factor of QR factorization. + /// On entry, is empty and may not have any space allocated for it. + /// + /// @param[out] A + /// Overwritten by an m-by-k orthogonal Q factor. + /// Matrix is stored explicitly. + /// + /// @param[out] R + /// Stores k-by-n matrix with upper-triangular R factor. + /// Zero entries are not compressed. + /// + /// @param[out] J + /// Stores k integer type pivot index extries. + /// + /// @return = 0: successful exit + /// + /// @return = 1: cholesky factorization failed + /// + + int call( + int64_t m, + int64_t n, + T* A, + int64_t lda, + T* R, + int64_t ldr, + int64_t* J, + T d_factor + ) override; + + public: + T eps; + int64_t nnz; + int64_t rank; +}; + +// ----------------------------------------------------------------------------- +template +int CQRRPT::call( + int64_t m, + int64_t n, + T* A, + int64_t lda, + T* R, + int64_t ldr, + int64_t* J, + T d_factor +){ + int i; + int64_t k = n; + int64_t d = d_factor * n; + // A constant for initial rank estimation. + T eps_initial_rank_estimation = 2 * std::pow(std::numeric_limits::epsilon(), 0.95); + // Variables for a posteriori rank estimation. + int64_t new_rank; + T running_max, running_min, curr_entry; + + T* A_hat = ( T * ) calloc( d * n, sizeof( T ) ); + T* tau = ( T * ) calloc( n, sizeof( T ) ); + // Buffer for column pivoting. + std::vector J_buf(n, 0); + + /* RandBLAS style + /// Generating a SASO + RandBLAS::SparseDist DS = {.n_rows = d, .n_cols = m, .vec_nnz = this->nnz}; + RandBLAS::SparseSkOp S(DS, state); + state = RandBLAS::fill_sparse(S); + + /// Applying a SASO + RandBLAS::sketch_general( + Layout::ColMajor, Op::NoTrans, Op::NoTrans, + d, n, m, 1.0, S, 0, 0, A, lda, 0.0, A_hat, d + ); + */ + + /// SparseBLAS style + /// Our sketching operators must be in COO format. + //auto&& [values, rowptr, colind, shape, _] = spblas::generate_coo(d, m, nnz); + /// TODO: add COO view + //spblas::coo_view s(values, rowptr, colind, shape, nnz); + + /// Perform dense sketching for the sake of tests passing. + /// This is not the intended behavior for this algorithm, as dense sketching + /// tanks the performance of CQRRPT. + auto [buf, a_shape] = spblas::generate_gaussian(d, m); + spblas::__mdspan::mdspan s(buf.data(), d, m); + + spblas::__mdspan::mdspan a(A, m, n); + spblas::__mdspan::mdspan ahat(A_hat, d, n); + + spblas::multiply(s, a, ahat); + A_hat = ahat.data_handle(); + + /// Performing QRCP on a sketch + lapack::geqp3(d, n, A_hat, d, J, tau); + + /// Using naive rank estimation to ensure that R used for preconditioning is invertible. + /// The actual rank estimate k will be computed a posteriori. + /// Using R[i,i] to approximate the i-th singular value of A_hat. + /// Truncate at the largest i where R[i,i] / R[0,0] >= eps. + for(i = 0; i < n; ++i) { + if(std::abs(A_hat[i * d + i]) / std::abs(A_hat[0]) < eps_initial_rank_estimation) { + k = i; + break; + } + } + this->rank = k; + + // Allocating space for a preconditioner buffer. + T* R_sp = ( T * ) calloc( k * k, sizeof( T ) ); + /// Extracting a k by k upper-triangular R. + lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R_sp, k); + /// Extracting a k by n R representation (k by k upper-triangular, rest - general) + lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R, ldr); + lapack::lacpy(MatrixType::General, k, n - k, &A_hat[d * k], d, &R[n * k], ldr); + + // Swap k columns of A with pivots from J + blas::copy(n, J, 1, J_buf.data(), 1); + util::col_swap(m, n, k, A, lda, J_buf); + + // A_pre * R_sp = AP + blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, k, 1.0, R_sp, k, A, lda); + + // Do Cholesky QR + blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, k, m, 1.0, A, lda, 0.0, R_sp, k); + lapack::potrf(Uplo::Upper, k, R_sp, k); + + // Re-estimate rank after we have the R-factor form Cholesky QR. + // The strategy here is the same as in naive rank estimation. + // This also automatically takes care of any potentical failures in Cholesky factorization. + // Note that the diagonal of R_sp may not be sorted, so we need to keep the running max/min + // We expect the loss in the orthogonality of Q to be approximately equal to u * cond(R_sp)^2, where u is the unit roundoff for the numerical type T. + new_rank = k; + running_max = R_sp[0]; + running_min = R_sp[0]; + + for(i = 0; i < k; ++i) { + curr_entry = std::abs(R_sp[i * k + i]); + running_max = std::max(running_max, curr_entry); + running_min = std::min(running_min, curr_entry); + if(running_max / running_min >= std::sqrt(this->eps / std::numeric_limits::epsilon())) { + new_rank = i - 1; + break; + } + } + + blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, new_rank, 1.0, R_sp, k, A, lda); + // Get the final R-factor. + blas::trmm(Layout::ColMajor, Side::Left, Uplo::Upper, Op::NoTrans, Diag::NonUnit, new_rank, n, 1.0, R_sp, k, R, ldr); + + // Set the rank parameter to the value comuted a posteriori. + this->rank = k; + + free(A_hat); + free(R_sp); + free(tau); + + return 0; +} +} // end namespace miniapps +#endif diff --git a/miniapps/include/krylov.hh b/miniapps/include/krylov.hh new file mode 100644 index 0000000..a7ca818 --- /dev/null +++ b/miniapps/include/krylov.hh @@ -0,0 +1,36 @@ +#ifndef MINIAPPS_KRYLOV_HH +#define MINIAPPS_KRYLOV_HH + +#include + +#include + +namespace miniapps { + +/** + * A very simple Krylov Solver interface for convenience. + * + * @tparam T the value type + */ +template +class Krylov { + public: + Krylov(T eps, int max_iters) : eps{eps}, max_iters{max_iters} {} + + virtual ~Krylov() {} + + virtual std::tuple apply(const spblas::csr_view a, + const std::vector b, + std::vector x) = 0; + + T get_eps() { return eps; } + + int get_max_iters() { return max_iters; } + + const T eps; + const int max_iters; +}; + +} + +#endif // MINIAPPS_KRYLOV_HH diff --git a/miniapps/include/matrix_utils.hh b/miniapps/include/matrix_utils.hh new file mode 100644 index 0000000..322459d --- /dev/null +++ b/miniapps/include/matrix_utils.hh @@ -0,0 +1,149 @@ +#ifndef SPBLAS_MINIAPPS_MATRIX_UTILS_HH +#define SPBLAS_MINIAPPS_MATRIX_UTILS_HH + +#include + +#include +#include +#include +#include + +namespace miniapps { + +template struct matrix_data_entry { + std::size_t row; + std::size_t col; + T value; +}; + +template struct matrix_data { + using nonzero_type = matrix_data_entry; + + std::size_t num_rows; + std::size_t num_cols; + std::vector nonzeros; + + matrix_data(std::vector rowind, std::vector colind, + std::vector values, spblas::index shape) + : num_rows{shape[0]}, num_cols{shape[1]} { + assert(rowind.size() == colind.size() && rowind.size() == values.size()); + for (std::size_t i = 0; i < values.size(); i++) { + nonzeros.emplace_back(rowind[i], colind[i], values[i]); + } + } + + auto convert_to_coo() { + auto nnz = nonzeros.size(); + std::vector values(nnz); + std::vector rowind(nnz); + std::vector colind(nnz); + + for (std::size_t i = 0; i < nnz; i++) { + values[i] = nonzeros[i].value; + rowind[i] = nonzeros[i].row; + colind[i] = nonzeros[i].col; + } + return std::tuple(values, rowind, colind, nnz); + } + + void sort_row_major() { + std::sort(begin(nonzeros), end(nonzeros), + [](nonzero_type x, nonzero_type y) { + return std::tie(x.row, x.col) < std::tie(y.row, y.col); + }); + } + + void sum_duplicates() { + this->sort_row_major(); + std::vector new_nonzeros; + if (!nonzeros.empty()) { + new_nonzeros.emplace_back(nonzeros.front().row, nonzeros.front().col, + 0.0); + for (auto entry : nonzeros) { + if (entry.row != new_nonzeros.back().row || + entry.col != new_nonzeros.back().col) { + new_nonzeros.emplace_back(entry.row, entry.col, 0.0); + } + new_nonzeros.back().value += entry.value; + } + nonzeros = std::move(new_nonzeros); + } + } + + void make_symmetric() { + const auto nnz = nonzeros.size(); + // compute (A + op(A^T)) / 2 + for (std::size_t i = 0; i < nnz; i++) { + nonzeros[i].value /= 2.0; + auto entry = nonzeros[i]; + nonzeros.emplace_back(entry.col, entry.row, entry.value); + } + // combine duplicates + this->sum_duplicates(); + } + + void make_diag_dominant(T ratio = 1.0) { + std::vector norms(num_rows); + std::vector diag_positions(num_rows, -1); + std::int64_t i{}; + for (auto entry : nonzeros) { + if (entry.row == entry.col) { + diag_positions[entry.row] = i; + } else { + norms[entry.row] += std::abs(entry.value); + } + i++; + } + for (i = 0; i < num_rows; i++) { + if (norms[i] == 0.0) { + // make sure empty rows don't make the matrix singular + norms[i] = 1.0; + } + if (diag_positions[i] < 0) { + nonzeros.emplace_back(i, i, norms[i] * ratio); + } else { + auto &diag_value = nonzeros[diag_positions[i]].value; + const auto diag_magnitude = std::abs(diag_value); + const auto offdiag_magnitude = norms[i]; + if (diag_magnitude < offdiag_magnitude * ratio) { + const auto scaled_value = + diag_value * (offdiag_magnitude * ratio / diag_magnitude); + if (std::isfinite(scaled_value)) { + diag_value = scaled_value; + } else { + diag_value = offdiag_magnitude * ratio; + } + } + } + } + this->sort_row_major(); + } +}; + +template +auto convert_rowind_to_rowptr(std::vector rowind, std::size_t nnz, + spblas::index shape) { + auto num_rows = shape[0]; + std::vector rowptr(num_rows + 1, 0); + assert(rowind.size() == nnz); + + for (I i = 0; i < nnz; i++) { + rowptr[rowind[i]]++; + } + constexpr auto max = std::numeric_limits::max(); + std::size_t partial_sum{}; + for (std::size_t i = 0; i < num_rows + 1; ++i) { + auto this_nnz = i < num_rows ? rowptr[i] : 0; + rowptr[i] = partial_sum; + if (max - partial_sum < this_nnz) { + throw std::exception(); + } + partial_sum += this_nnz; + } + + return rowptr; +} + +} // namespace miniapps + +#endif // SPBLAS_MINIAPPS_MATRIX_UTILS_HH diff --git a/miniapps/include/miniapps.hh b/miniapps/include/miniapps.hh new file mode 100644 index 0000000..291c427 --- /dev/null +++ b/miniapps/include/miniapps.hh @@ -0,0 +1,19 @@ +#ifndef SPBLAS_MINIAPPS_HH +#define SPBLAS_MINIAPPS_HH + +// misc +#include "util.hh" + +// misc +#include "matrix_utils.hh" + +// randnla +#include "cqrrpt.hh" + +// cg +#include "cg.hh" + +// bicgstab +#include "bicgstab.hh" + +#endif // SPBLAS_MINIAPPS_HH diff --git a/miniapps/include/rl_blaspp.hh b/miniapps/include/rl_blaspp.hh new file mode 100644 index 0000000..1ac915e --- /dev/null +++ b/miniapps/include/rl_blaspp.hh @@ -0,0 +1,12 @@ +#ifndef randlapack_blaspp_h +#define randlapack_blaspp_h + +#include + +using Layout = blas::Layout; +using Op = blas::Op; +using Side = blas::Side; +using Diag = blas::Diag; +using Uplo = blas::Uplo; + +#endif diff --git a/miniapps/include/rl_config.hh.in b/miniapps/include/rl_config.hh.in new file mode 100644 index 0000000..68e2991 --- /dev/null +++ b/miniapps/include/rl_config.hh.in @@ -0,0 +1,11 @@ +#ifndef miniapps_config_h +#define miniapps_config_h + +#define miniapps_VERSION "@miniapps_VERSION@" +#define miniapps_VERSION_MAJOR @miniapps_VERSION_MAJOR@ +#define miniapps_VERSION_MINOR @miniapps_VERSION_MINOR@ +#define miniapps_VERSION_PATCH @miniapps_VERSION_PATCH@ + +#CMAKEDEFINE miniapps_HAS_OpenMP + +#endif diff --git a/miniapps/include/rl_lapackpp.hh b/miniapps/include/rl_lapackpp.hh new file mode 100644 index 0000000..ba62260 --- /dev/null +++ b/miniapps/include/rl_lapackpp.hh @@ -0,0 +1,12 @@ +#ifndef randlapack_lapackpp_h +#define randlapack_lapackpp_h + +#define LAPACK_COMPLEX_CPP 1 + +#include + +using Job = lapack::Job; +using MatrixType = lapack::MatrixType; +using Norm = lapack::Norm; + +#endif diff --git a/miniapps/include/util.hh b/miniapps/include/util.hh new file mode 100644 index 0000000..0462f13 --- /dev/null +++ b/miniapps/include/util.hh @@ -0,0 +1,42 @@ +#ifndef miniapps_util_h +#define miniapps_util_h + +#include "rl_blaspp.hh" +#include "rl_lapackpp.hh" +#include + +#include +#include +#include +#include +#include + +namespace miniapps::util { + +/// A version of the above function to be used on a vector of integers +template +void col_swap( + int64_t m, + int64_t n, + int64_t k, + T* A, + int64_t lda, + std::vector idx +) { + if(k > n) + throw std::runtime_error("Invalid rank parameter."); + + int64_t i, j; //, l; + for (i = 0, j = 0; i < k; ++i) { + j = idx[i] - 1; + blas::swap(m, &A[i * lda], 1, &A[j * lda], 1); + + // swap idx array elements + // Find idx element with value i and assign it to j + auto it = std::find(idx.begin() + i, idx.begin() + k, i + 1); + idx[it - (idx.begin())] = j + 1; + } +} + +} // end namespace util +#endif diff --git a/miniapps/test/CMakeLists.txt b/miniapps/test/CMakeLists.txt new file mode 100644 index 0000000..733220a --- /dev/null +++ b/miniapps/test/CMakeLists.txt @@ -0,0 +1,2 @@ +add_subdirectory(randnla) +add_subdirectory(solvers) diff --git a/miniapps/test/randnla/CMakeLists.txt b/miniapps/test/randnla/CMakeLists.txt new file mode 100644 index 0000000..8696f44 --- /dev/null +++ b/miniapps/test/randnla/CMakeLists.txt @@ -0,0 +1,7 @@ +add_executable(miniapps_randnla_tests + test_cqrrpt.cc) + +include(GoogleTest) +target_link_libraries(miniapps_randnla_tests miniapps GTest::gtest_main) + +gtest_discover_tests(miniapps_randnla_tests) diff --git a/miniapps/test/randnla/test_cqrrpt.cc b/miniapps/test/randnla/test_cqrrpt.cc new file mode 100644 index 0000000..cfed8f6 --- /dev/null +++ b/miniapps/test/randnla/test_cqrrpt.cc @@ -0,0 +1,147 @@ +#include "miniapps.hh" +#include "rl_blaspp.hh" +#include "rl_lapackpp.hh" +#include + +#include +#include + + +class TestCQRRPT : public ::testing::Test +{ + protected: + + virtual void SetUp() {}; + + virtual void TearDown() {}; + + template + struct CQRRPTTestData { + int64_t row; + int64_t col; + int64_t rank; + std::vector A; + std::vector R; + std::vector J; + std::vector A_cpy1; + std::vector A_cpy2; + std::vector I_ref; + + CQRRPTTestData(int64_t m, int64_t n, int64_t k) : + A(m * n, 0.0), + R(n * n, 0.0), + J(n, 0), + A_cpy1(m * n, 0.0), + A_cpy2(m * n, 0.0), + I_ref(k * k, 0.0) { + row = m; + col = n; + rank = k; + } + }; + + template + static void norm_and_copy_computational_helper(T &norm_A, CQRRPTTestData &all_data) { + auto m = all_data.row; + auto n = all_data.col; + + lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy1.data(), m); + lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy2.data(), m); + norm_A = lapack::lange(Norm::Fro, m, n, all_data.A.data(), m); + } + + template + static void + error_check(T &norm_A, CQRRPTTestData &all_data) { + + auto m = all_data.row; + auto n = all_data.col; + auto k = all_data.rank; + + all_data.I_ref.resize(k * k, 0.0); + for(int i = 0; i < k; ++i) + all_data.I_ref[(k + 1) * i] = 1.0; + + T* A_dat = all_data.A_cpy1.data(); + T const* A_cpy_dat = all_data.A_cpy2.data(); + T const* Q_dat = all_data.A.data(); + T const* R_dat = all_data.R.data(); + T* I_ref_dat = all_data.I_ref.data(); + + // Check orthogonality of Q + // Q' * Q - I = 0 + blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, k, m, 1.0, Q_dat, m, -1.0, I_ref_dat, k); + T norm_QTQ = lapack::lansy(lapack::Norm::Fro, Uplo::Upper, k, I_ref_dat, k); + + // A - QR + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, n, k, 1.0, Q_dat, m, R_dat, n, -1.0, A_dat, m); + + // Implementing max col norm metric + T max_col_norm = 0.0; + T col_norm = 0.0; + int max_idx = 0; + for(int i = 0; i < n; ++i) { + col_norm = blas::nrm2(m, &A_dat[m * i], 1); + if(max_col_norm < col_norm) { + max_col_norm = col_norm; + max_idx = i; + } + } + T col_norm_A = blas::nrm2(n, &A_cpy_dat[m * max_idx], 1); + T norm_AQR = lapack::lange(Norm::Fro, m, n, A_dat, m); + + printf("REL NORM OF AP - QR: %15e\n", norm_AQR / norm_A); + printf("MAX COL NORM METRIC: %15e\n", max_col_norm / col_norm_A); + printf("FRO NORM OF (Q'Q - I): %2e\n\n", norm_QTQ); + + T atol = std::pow(std::numeric_limits::epsilon(), 0.75); + ASSERT_NEAR(norm_AQR / norm_A, 0.0, atol); + ASSERT_NEAR(max_col_norm / col_norm_A, 0.0, atol); + ASSERT_NEAR(norm_QTQ, 0.0, atol); + } + + /// General test for CQRRPT: + /// Computes QR factorzation, and computes A[:, J] - QR. + template + static void test_CQRRPT_general( + T d_factor, + T norm_A, + CQRRPTTestData &all_data, + alg_type &CQRRPT) { + + auto m = all_data.row; + auto n = all_data.col; + + CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor); + all_data.rank = CQRRPT.rank; + + printf("RANK AS RETURNED BY CQRRPT %ld\n", all_data.rank); + + miniapps::util::col_swap(m, n, n, all_data.A_cpy1.data(), m, all_data.J); + miniapps::util::col_swap(m, n, n, all_data.A_cpy2.data(), m, all_data.J); + + error_check(norm_A, all_data); + } +}; + +TEST_F(TestCQRRPT, CQRRPT_full_rank_no_hqrrp) { + int64_t m = 50; + int64_t n = 20; + int64_t k = 20; + double d_factor = 1.25; + double norm_A = 0; + int64_t nnz = 2; + double tol = std::pow(std::numeric_limits::epsilon(), 0.85); + + CQRRPTTestData all_data(m, n, k); + miniapps::CQRRPT CQRRPT(tol, nnz); + // Generate dense matrix + auto [buf, a_shape] = spblas::generate_dense(m, n); + + lapack::lacpy(MatrixType::General, m, n, buf.data(), m, all_data.A.data(), m); + + norm_and_copy_computational_helper(norm_A, all_data); + test_CQRRPT_general>(d_factor, norm_A, all_data, CQRRPT); +} + + diff --git a/miniapps/test/solvers/CMakeLists.txt b/miniapps/test/solvers/CMakeLists.txt new file mode 100644 index 0000000..70a99d3 --- /dev/null +++ b/miniapps/test/solvers/CMakeLists.txt @@ -0,0 +1,8 @@ +add_executable(miniapps_solvers_tests + test_cg.cpp + test_bicgstab.cpp) + +target_link_libraries(miniapps_solvers_tests miniapps GTest::gtest_main) + +include(GoogleTest) +gtest_discover_tests(miniapps_solvers_tests) diff --git a/miniapps/test/solvers/test_bicgstab.cpp b/miniapps/test/solvers/test_bicgstab.cpp new file mode 100644 index 0000000..445dee3 --- /dev/null +++ b/miniapps/test/solvers/test_bicgstab.cpp @@ -0,0 +1,72 @@ +#include "miniapps.hh" +#include + +#include +#include +#include + +#include + +template class TestBiCGSTAB : public testing::Test { +public: + auto generate_problem(std::size_t m, std::size_t n, std::size_t nnz_input, + std::size_t seed = 0) { + auto &&[values_orig, rowind_orig, colind_orig, shape, nnz_orig] = + spblas::generate_coo(m, n, nnz_input, seed); + miniapps::matrix_data data(rowind_orig, colind_orig, values_orig, shape); + auto &&[values, rowind, colind, nnz] = data.convert_to_coo(); + auto rowptr = miniapps::convert_rowind_to_rowptr(colind, nnz, shape); + return std::tuple(values, rowptr, colind, shape, nnz); + } + + auto generate_spd_problem(std::size_t m, std::size_t n, std::size_t nnz_input, + std::size_t seed = 0) { + auto &&[values_orig, rowind_orig, colind_orig, shape, nnz_orig] = + spblas::generate_coo(m, n, nnz_input, seed); + miniapps::matrix_data data(rowind_orig, colind_orig, values_orig, shape); + data.sort_row_major(); + data.make_symmetric(); + data.make_diag_dominant(); + auto &&[values, rowind, colind, nnz] = data.convert_to_coo(); + auto rowptr = miniapps::convert_rowind_to_rowptr(colind, nnz, shape); + return std::tuple(values, rowptr, colind, shape, nnz); + } +}; + +using BiCGSTABTestTypes = ::testing::Types; +TYPED_TEST_SUITE(TestBiCGSTAB, BiCGSTABTestTypes); + +TYPED_TEST(TestBiCGSTAB, ConvergesForSmallSystem) { + using T = TypeParam; + constexpr double tol = std::is_same::value ? 1e-14 : 1e-7; + constexpr int max_iters = 100; + std::tuple res; + auto &&[values, rowptr, colind, shape, nnz] = + this->generate_problem(10, 10, 42, 75); + spblas::csr_view a(values, rowptr, colind, shape, nnz); + std::vector b(10, 1.0); + std::vector x(10, 0.0); + + miniapps::BiCGSTAB BiCGSTAB(tol, max_iters); + auto [error, iters] = BiCGSTAB.apply(a, b, x); + + ASSERT_LE(error, tol); + ASSERT_LE(iters, max_iters); +} + +TYPED_TEST(TestBiCGSTAB, ConvergesForLargeSystem) { + using T = TypeParam; + constexpr double tol = std::is_same::value ? 1e-14 : 1e-7; + constexpr int max_iters = 100; + std::vector b(1000, 1.0); + std::vector x(1000, 0.0); + auto &&[values, rowptr, colind, shape, nnz] = + this->generate_spd_problem(1000, 1000, 12345, 75); + spblas::csr_view a(values, rowptr, colind, shape, nnz); + + miniapps::BiCGSTAB BiCGSTAB(tol, max_iters); + auto [error, iters] = BiCGSTAB.apply(a, b, x); + + ASSERT_LE(error, tol); + ASSERT_LE(iters, max_iters); +} diff --git a/miniapps/test/solvers/test_cg.cpp b/miniapps/test/solvers/test_cg.cpp new file mode 100644 index 0000000..09984c3 --- /dev/null +++ b/miniapps/test/solvers/test_cg.cpp @@ -0,0 +1,65 @@ +#include "miniapps.hh" +#include + +#include +#include +#include + +#include + +template class TestCG : public testing::Test { +public: + using ValueType = T; + + auto generate_spd_problem(std::size_t m, std::size_t n, std::size_t nnz_input, + std::size_t seed = 0) { + auto &&[values_orig, rowind_orig, colind_orig, shape, nnz_orig] = + spblas::generate_coo(m, n, nnz_input, seed); + miniapps::matrix_data data(rowind_orig, colind_orig, values_orig, shape); + data.sort_row_major(); + data.make_symmetric(); + data.make_diag_dominant(); + auto &&[values, rowind, colind, nnz] = data.convert_to_coo(); + auto rowptr = miniapps::convert_rowind_to_rowptr(colind, nnz, shape); + return std::tuple(values, rowptr, colind, shape, nnz); + } +}; + +using CGTestTypes = ::testing::Types; +TYPED_TEST_SUITE(TestCG, CGTestTypes); + +TYPED_TEST(TestCG, ConvergesForSmallSystem) { + using T = TypeParam; + constexpr double tol = std::is_same::value ? 1e-14 : 1e-7; + constexpr int max_iters = 100; + std::vector b(10, 1.0); + std::vector x(10, 0.0); + + auto &&[values, rowptr, colind, shape, nnz] = + this->generate_spd_problem(10, 10, 42, 75); + spblas::csr_view a(values, rowptr, colind, shape, nnz); + + miniapps::CG CG(tol, max_iters); + auto [error, iters] = CG.apply(a, b, x); + + ASSERT_LE(error, tol); + ASSERT_LE(iters, max_iters); +} + +TYPED_TEST(TestCG, ConvergesForLargeSystem) { + using T = TypeParam; + constexpr double tol = std::is_same::value ? 1e-14 : 1e-7; + constexpr int max_iters = 100; + std::vector b(1000, 1.0); + std::vector x(1000, 0.0); + + auto &&[values, rowptr, colind, shape, nnz] = + this->generate_spd_problem(1000, 1000, 12345, 75); + spblas::csr_view a(values, rowptr, colind, shape, nnz); + + miniapps::CG CG(tol, max_iters); + auto [error, iters] = CG.apply(a, b, x); + + ASSERT_LE(error, tol); + ASSERT_LE(iters, max_iters); +} From 7aa5beea8c24c343ac9f9de8260ff0604785fde3 Mon Sep 17 00:00:00 2001 From: Terry Cojean Date: Thu, 27 Feb 2025 22:42:35 +0100 Subject: [PATCH 2/5] miniapps: auto compile BLAS++ and LAPACK++ --- CMakeLists.txt | 8 ++++--- miniapps/CMake/blaspp.cmake | 14 ------------- miniapps/CMake/lapackpp.cmake | 14 ------------- miniapps/CMake/rl_version.cmake | 27 ------------------------ miniapps/CMakeLists.txt | 37 +++++++++++++++++++++++---------- 5 files changed, 31 insertions(+), 69 deletions(-) delete mode 100644 miniapps/CMake/blaspp.cmake delete mode 100644 miniapps/CMake/lapackpp.cmake delete mode 100644 miniapps/CMake/rl_version.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index c439e1f..d4d1c0f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,8 @@ set(CMAKE_CXX_FLAGS "-O3 -march=native") option(SPBLAS_BUILD_MINIAPPS "Also build the miniapps. Default: OFF." OFF) +enable_testing() + # Get includes, which declares the `spblas` library add_subdirectory(include) @@ -91,8 +93,8 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) add_subdirectory(examples) add_subdirectory(test) -endif() -if (SPBLAS_BUILD_MINIAPPS) - add_subdirectory(miniapps) + if (SPBLAS_BUILD_MINIAPPS) + add_subdirectory(miniapps) + endif() endif() diff --git a/miniapps/CMake/blaspp.cmake b/miniapps/CMake/blaspp.cmake deleted file mode 100644 index 986763a..0000000 --- a/miniapps/CMake/blaspp.cmake +++ /dev/null @@ -1,14 +0,0 @@ -message(STATUS "\n\n-- miniapps checking for blaspp ... ") -find_package(blaspp REQUIRED) -message(STATUS "miniapps found blaspp ${blaspp_VERSION}\n") - -# interface library for use elsewhere in the project -add_library(miniapps_blaspp INTERFACE) - -target_link_libraries(miniapps_blaspp INTERFACE blaspp) - -install(TARGETS miniapps_blaspp EXPORT miniapps_blaspp) - -install(EXPORT miniapps_blaspp - DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake" - EXPORT_LINK_INTERFACE_LIBRARIES) \ No newline at end of file diff --git a/miniapps/CMake/lapackpp.cmake b/miniapps/CMake/lapackpp.cmake deleted file mode 100644 index e7d9827..0000000 --- a/miniapps/CMake/lapackpp.cmake +++ /dev/null @@ -1,14 +0,0 @@ -message(STATUS "\n\n-- miniapps checking for lapackpp ... ") -find_package(lapackpp REQUIRED) -message(STATUS "miniapps found lapackpp ${lapackpp_VERSION}\n") - -# interface library for use elsewhere in the project -add_library(miniapps_lapackpp INTERFACE) - -target_link_libraries(miniapps_lapackpp INTERFACE lapackpp) - -install(TARGETS miniapps_lapackpp EXPORT miniapps_lapackpp) - -install(EXPORT miniapps_lapackpp - DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake" - EXPORT_LINK_INTERFACE_LIBRARIES) diff --git a/miniapps/CMake/rl_version.cmake b/miniapps/CMake/rl_version.cmake deleted file mode 100644 index 4dc2f3c..0000000 --- a/miniapps/CMake/rl_version.cmake +++ /dev/null @@ -1,27 +0,0 @@ -set(tmp) -find_package(Git QUIET) -if(GIT_FOUND) - execute_process(COMMAND ${GIT_EXECUTABLE} - --git-dir=${CMAKE_SOURCE_DIR}/.git describe - --tags --match "[0-9]*.[0-9]*.[0-9]*" - OUTPUT_VARIABLE tmp OUTPUT_STRIP_TRAILING_WHITESPACE - ERROR_QUIET) -endif() -if(NOT tmp) - set(tmp "0.0.0") -endif() - -set(miniapps_VERSION ${tmp} CACHE STRING "miniapps version" FORCE) - -string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*$)" - "\\1" miniapps_VERSION_MAJOR ${miniapps_VERSION}) - -string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*$)" - "\\2" miniapps_VERSION_MINOR ${miniapps_VERSION}) - -string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*$)" - "\\3" miniapps_VERSION_PATCH ${miniapps_VERSION}) - -message(STATUS "miniapps_VERSION_MAJOR=${miniapps_VERSION_MAJOR}") -message(STATUS "miniapps_VERSION_MINOR=${miniapps_VERSION_MINOR}") -message(STATUS "miniapps_VERSION_PATCH=${miniapps_VERSION_PATCH}") diff --git a/miniapps/CMakeLists.txt b/miniapps/CMakeLists.txt index b3cd1b4..e931875 100644 --- a/miniapps/CMakeLists.txt +++ b/miniapps/CMakeLists.txt @@ -1,30 +1,45 @@ -cmake_minimum_required(VERSION 3.20) +cmake_minimum_required(VERSION 3.24) -project(miniapps) +project(miniapps VERSION 0.1.0) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMake") set(CMAKE_CXX_STANDARD 23) set(CMAKE_CXX_STANDARD_REQUIRED ON) -# configure the build +option(SPBLAS_MINIAPPS_ENABLE_CQRRPT "Enable the CQRRPT miniapp. Default ON." ON) +option(SPBLAS_MINIAPPS_ENABLE_SOLVERS "Enable the solvers miniapp. Default ON." ON) + enable_testing() include(rl_build_options) -include(rl_version) -# find dependencies -find_package(lapackpp REQUIRED) +# Find dependencies +find_package(blaspp 2024.10.26 QUIET) +if(NOT blaspp_FOUND) + FetchContent_Declare( + blaspp + GIT_REPOSITORY https://github.com/icl-utk-edu/blaspp + GIT_TAG v2024.10.26) + FetchContent_MakeAvailable(blaspp) +endif() -# find dependencies -find_package(blaspp REQUIRED) +set(blas "auto") +set(lapack "auto") + +find_package(lapackpp 2024.10.26 QUIET) +if(NOT lapackpp_FOUND) + FetchContent_Declare( + lapackpp + GIT_REPOSITORY https://github.com/icl-utk-edu/lapackpp + GIT_TAG v2024.10.26) + FetchContent_MakeAvailable(lapackpp) +endif() # compile sources add_subdirectory(include) -if (ENABLE_TESTING) - add_subdirectory(test) -endif() +add_subdirectory(test) # export the configuration include(rl_config) From d495be029fb93dc61f77a8b3d6275946362ed2c2 Mon Sep 17 00:00:00 2001 From: Benjamin Brock Date: Thu, 6 Mar 2025 09:57:35 -0800 Subject: [PATCH 3/5] Fix formatting for `clang-format`. --- miniapps/CMake/rl_build_options.cmake | 1 - miniapps/include/bicgstab.hh | 3 +- miniapps/include/cg.hh | 3 +- miniapps/include/cqrrpt.hh | 367 ++++++++++++------------ miniapps/include/krylov.hh | 12 +- miniapps/include/matrix_utils.hh | 8 +- miniapps/include/util.hh | 40 ++- miniapps/test/randnla/test_cqrrpt.cc | 249 ++++++++-------- miniapps/test/solvers/test_bicgstab.cpp | 19 +- miniapps/test/solvers/test_cg.cpp | 11 +- 10 files changed, 349 insertions(+), 364 deletions(-) diff --git a/miniapps/CMake/rl_build_options.cmake b/miniapps/CMake/rl_build_options.cmake index ff412d1..ed85878 100644 --- a/miniapps/CMake/rl_build_options.cmake +++ b/miniapps/CMake/rl_build_options.cmake @@ -26,4 +26,3 @@ include(GNUInstallDirs) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}") set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}") set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_BINDIR}") - diff --git a/miniapps/include/bicgstab.hh b/miniapps/include/bicgstab.hh index 09b11d5..7d6e751 100644 --- a/miniapps/include/bicgstab.hh +++ b/miniapps/include/bicgstab.hh @@ -13,7 +13,8 @@ namespace miniapps { -template class BiCGSTAB : public Krylov { +template +class BiCGSTAB : public Krylov { public: BiCGSTAB(T eps, int max_iters) : Krylov{eps, max_iters} {} diff --git a/miniapps/include/cg.hh b/miniapps/include/cg.hh index 560ae56..7dbbea1 100644 --- a/miniapps/include/cg.hh +++ b/miniapps/include/cg.hh @@ -14,7 +14,8 @@ namespace miniapps { -template class CG : public Krylov { +template +class CG : public Krylov { public: CG(T eps, int max_iters) : Krylov{eps, max_iters} {} diff --git a/miniapps/include/cqrrpt.hh b/miniapps/include/cqrrpt.hh index cc1c380..26e3f32 100644 --- a/miniapps/include/cqrrpt.hh +++ b/miniapps/include/cqrrpt.hh @@ -1,221 +1,208 @@ #ifndef miniapps_cqrrpt_h #define miniapps_cqrrpt_h -#include -#include #include +#include #include +#include -#include "util.hh" #include "rl_blaspp.hh" #include "rl_lapackpp.hh" +#include "util.hh" #include namespace miniapps { template class CQRRPTalg { - public: - - virtual ~CQRRPTalg() {} - - virtual int call( - int64_t m, - int64_t n, - T* A, - int64_t lda, - T* R, - int64_t ldr, - int64_t* J, - T d_factor - ) = 0; +public: + virtual ~CQRRPTalg() {} + + virtual int call(int64_t m, int64_t n, T* A, int64_t lda, T* R, int64_t ldr, + int64_t* J, T d_factor) = 0; }; template class CQRRPT : public CQRRPTalg { - public: - CQRRPT(T ep, int64_t nz) { - eps = ep; - nnz = nz; - } - - /// Computes a QR factorization with column pivots of the form: - /// A[:, J] = QR, - /// where Q and R are of size m-by-k and k-by-n, with rank(A) = k. - /// Detailed description of this algorithm may be found in Section 5.1.2. - /// of "the RandLAPACK book". - /// - /// @param[in] m - /// The number of rows in the matrix A. - /// - /// @param[in] n - /// The number of columns in the matrix A. - /// - /// @param[in] A - /// The m-by-n matrix A, stored in a column-major format. - /// - /// @param[in] d - /// Embedding dimension of a sketch, m >= d >= n. - /// - /// @param[in] R - /// Represents the upper-triangular R factor of QR factorization. - /// On entry, is empty and may not have any space allocated for it. - /// - /// @param[out] A - /// Overwritten by an m-by-k orthogonal Q factor. - /// Matrix is stored explicitly. - /// - /// @param[out] R - /// Stores k-by-n matrix with upper-triangular R factor. - /// Zero entries are not compressed. - /// - /// @param[out] J - /// Stores k integer type pivot index extries. - /// - /// @return = 0: successful exit - /// - /// @return = 1: cholesky factorization failed - /// - - int call( - int64_t m, - int64_t n, - T* A, - int64_t lda, - T* R, - int64_t ldr, - int64_t* J, - T d_factor - ) override; - - public: - T eps; - int64_t nnz; - int64_t rank; +public: + CQRRPT(T ep, int64_t nz) { + eps = ep; + nnz = nz; + } + + /// Computes a QR factorization with column pivots of the form: + /// A[:, J] = QR, + /// where Q and R are of size m-by-k and k-by-n, with rank(A) = k. + /// Detailed description of this algorithm may be found in Section 5.1.2. + /// of "the RandLAPACK book". + /// + /// @param[in] m + /// The number of rows in the matrix A. + /// + /// @param[in] n + /// The number of columns in the matrix A. + /// + /// @param[in] A + /// The m-by-n matrix A, stored in a column-major format. + /// + /// @param[in] d + /// Embedding dimension of a sketch, m >= d >= n. + /// + /// @param[in] R + /// Represents the upper-triangular R factor of QR factorization. + /// On entry, is empty and may not have any space allocated for it. + /// + /// @param[out] A + /// Overwritten by an m-by-k orthogonal Q factor. + /// Matrix is stored explicitly. + /// + /// @param[out] R + /// Stores k-by-n matrix with upper-triangular R factor. + /// Zero entries are not compressed. + /// + /// @param[out] J + /// Stores k integer type pivot index extries. + /// + /// @return = 0: successful exit + /// + /// @return = 1: cholesky factorization failed + /// + + int call(int64_t m, int64_t n, T* A, int64_t lda, T* R, int64_t ldr, + int64_t* J, T d_factor) override; + +public: + T eps; + int64_t nnz; + int64_t rank; }; // ----------------------------------------------------------------------------- template -int CQRRPT::call( - int64_t m, - int64_t n, - T* A, - int64_t lda, - T* R, - int64_t ldr, - int64_t* J, - T d_factor -){ - int i; - int64_t k = n; - int64_t d = d_factor * n; - // A constant for initial rank estimation. - T eps_initial_rank_estimation = 2 * std::pow(std::numeric_limits::epsilon(), 0.95); - // Variables for a posteriori rank estimation. - int64_t new_rank; - T running_max, running_min, curr_entry; - - T* A_hat = ( T * ) calloc( d * n, sizeof( T ) ); - T* tau = ( T * ) calloc( n, sizeof( T ) ); - // Buffer for column pivoting. - std::vector J_buf(n, 0); - - /* RandBLAS style - /// Generating a SASO - RandBLAS::SparseDist DS = {.n_rows = d, .n_cols = m, .vec_nnz = this->nnz}; - RandBLAS::SparseSkOp S(DS, state); - state = RandBLAS::fill_sparse(S); - - /// Applying a SASO - RandBLAS::sketch_general( - Layout::ColMajor, Op::NoTrans, Op::NoTrans, - d, n, m, 1.0, S, 0, 0, A, lda, 0.0, A_hat, d - ); - */ - - /// SparseBLAS style - /// Our sketching operators must be in COO format. - //auto&& [values, rowptr, colind, shape, _] = spblas::generate_coo(d, m, nnz); - /// TODO: add COO view - //spblas::coo_view s(values, rowptr, colind, shape, nnz); - - /// Perform dense sketching for the sake of tests passing. - /// This is not the intended behavior for this algorithm, as dense sketching - /// tanks the performance of CQRRPT. - auto [buf, a_shape] = spblas::generate_gaussian(d, m); - spblas::__mdspan::mdspan s(buf.data(), d, m); - - spblas::__mdspan::mdspan a(A, m, n); - spblas::__mdspan::mdspan ahat(A_hat, d, n); - - spblas::multiply(s, a, ahat); - A_hat = ahat.data_handle(); - - /// Performing QRCP on a sketch - lapack::geqp3(d, n, A_hat, d, J, tau); - - /// Using naive rank estimation to ensure that R used for preconditioning is invertible. - /// The actual rank estimate k will be computed a posteriori. - /// Using R[i,i] to approximate the i-th singular value of A_hat. - /// Truncate at the largest i where R[i,i] / R[0,0] >= eps. - for(i = 0; i < n; ++i) { - if(std::abs(A_hat[i * d + i]) / std::abs(A_hat[0]) < eps_initial_rank_estimation) { - k = i; - break; - } +int CQRRPT::call(int64_t m, int64_t n, T* A, int64_t lda, T* R, int64_t ldr, + int64_t* J, T d_factor) { + int i; + int64_t k = n; + int64_t d = d_factor * n; + // A constant for initial rank estimation. + T eps_initial_rank_estimation = + 2 * std::pow(std::numeric_limits::epsilon(), 0.95); + // Variables for a posteriori rank estimation. + int64_t new_rank; + T running_max, running_min, curr_entry; + + T* A_hat = (T*) calloc(d * n, sizeof(T)); + T* tau = (T*) calloc(n, sizeof(T)); + // Buffer for column pivoting. + std::vector J_buf(n, 0); + + /* RandBLAS style + /// Generating a SASO + RandBLAS::SparseDist DS = {.n_rows = d, .n_cols = m, .vec_nnz = this->nnz}; + RandBLAS::SparseSkOp S(DS, state); + state = RandBLAS::fill_sparse(S); + + /// Applying a SASO + RandBLAS::sketch_general( + Layout::ColMajor, Op::NoTrans, Op::NoTrans, + d, n, m, 1.0, S, 0, 0, A, lda, 0.0, A_hat, d + ); + */ + + /// SparseBLAS style + /// Our sketching operators must be in COO format. + // auto&& [values, rowptr, colind, shape, _] = spblas::generate_coo(d, m, + // nnz); + /// TODO: add COO view + // spblas::coo_view s(values, rowptr, colind, shape, nnz); + + /// Perform dense sketching for the sake of tests passing. + /// This is not the intended behavior for this algorithm, as dense sketching + /// tanks the performance of CQRRPT. + auto [buf, a_shape] = spblas::generate_gaussian(d, m); + spblas::__mdspan::mdspan s(buf.data(), d, m); + + spblas::__mdspan::mdspan a(A, m, n); + spblas::__mdspan::mdspan ahat(A_hat, d, n); + + spblas::multiply(s, a, ahat); + A_hat = ahat.data_handle(); + + /// Performing QRCP on a sketch + lapack::geqp3(d, n, A_hat, d, J, tau); + + /// Using naive rank estimation to ensure that R used for preconditioning is + /// invertible. The actual rank estimate k will be computed a posteriori. + /// Using R[i,i] to approximate the i-th singular value of A_hat. + /// Truncate at the largest i where R[i,i] / R[0,0] >= eps. + for (i = 0; i < n; ++i) { + if (std::abs(A_hat[i * d + i]) / std::abs(A_hat[0]) < + eps_initial_rank_estimation) { + k = i; + break; } - this->rank = k; - - // Allocating space for a preconditioner buffer. - T* R_sp = ( T * ) calloc( k * k, sizeof( T ) ); - /// Extracting a k by k upper-triangular R. - lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R_sp, k); - /// Extracting a k by n R representation (k by k upper-triangular, rest - general) - lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R, ldr); - lapack::lacpy(MatrixType::General, k, n - k, &A_hat[d * k], d, &R[n * k], ldr); - - // Swap k columns of A with pivots from J - blas::copy(n, J, 1, J_buf.data(), 1); - util::col_swap(m, n, k, A, lda, J_buf); - - // A_pre * R_sp = AP - blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, k, 1.0, R_sp, k, A, lda); - - // Do Cholesky QR - blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, k, m, 1.0, A, lda, 0.0, R_sp, k); - lapack::potrf(Uplo::Upper, k, R_sp, k); - - // Re-estimate rank after we have the R-factor form Cholesky QR. - // The strategy here is the same as in naive rank estimation. - // This also automatically takes care of any potentical failures in Cholesky factorization. - // Note that the diagonal of R_sp may not be sorted, so we need to keep the running max/min - // We expect the loss in the orthogonality of Q to be approximately equal to u * cond(R_sp)^2, where u is the unit roundoff for the numerical type T. - new_rank = k; - running_max = R_sp[0]; - running_min = R_sp[0]; - - for(i = 0; i < k; ++i) { - curr_entry = std::abs(R_sp[i * k + i]); - running_max = std::max(running_max, curr_entry); - running_min = std::min(running_min, curr_entry); - if(running_max / running_min >= std::sqrt(this->eps / std::numeric_limits::epsilon())) { - new_rank = i - 1; - break; - } + } + this->rank = k; + + // Allocating space for a preconditioner buffer. + T* R_sp = (T*) calloc(k * k, sizeof(T)); + /// Extracting a k by k upper-triangular R. + lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R_sp, k); + /// Extracting a k by n R representation (k by k upper-triangular, rest - + /// general) + lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R, ldr); + lapack::lacpy(MatrixType::General, k, n - k, &A_hat[d * k], d, &R[n * k], + ldr); + + // Swap k columns of A with pivots from J + blas::copy(n, J, 1, J_buf.data(), 1); + util::col_swap(m, n, k, A, lda, J_buf); + + // A_pre * R_sp = AP + blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, + Diag::NonUnit, m, k, 1.0, R_sp, k, A, lda); + + // Do Cholesky QR + blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, k, m, 1.0, A, lda, 0.0, + R_sp, k); + lapack::potrf(Uplo::Upper, k, R_sp, k); + + // Re-estimate rank after we have the R-factor form Cholesky QR. + // The strategy here is the same as in naive rank estimation. + // This also automatically takes care of any potentical failures in Cholesky + // factorization. Note that the diagonal of R_sp may not be sorted, so we need + // to keep the running max/min We expect the loss in the orthogonality of Q to + // be approximately equal to u * cond(R_sp)^2, where u is the unit roundoff + // for the numerical type T. + new_rank = k; + running_max = R_sp[0]; + running_min = R_sp[0]; + + for (i = 0; i < k; ++i) { + curr_entry = std::abs(R_sp[i * k + i]); + running_max = std::max(running_max, curr_entry); + running_min = std::min(running_min, curr_entry); + if (running_max / running_min >= + std::sqrt(this->eps / std::numeric_limits::epsilon())) { + new_rank = i - 1; + break; } + } - blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, new_rank, 1.0, R_sp, k, A, lda); - // Get the final R-factor. - blas::trmm(Layout::ColMajor, Side::Left, Uplo::Upper, Op::NoTrans, Diag::NonUnit, new_rank, n, 1.0, R_sp, k, R, ldr); + blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, + Diag::NonUnit, m, new_rank, 1.0, R_sp, k, A, lda); + // Get the final R-factor. + blas::trmm(Layout::ColMajor, Side::Left, Uplo::Upper, Op::NoTrans, + Diag::NonUnit, new_rank, n, 1.0, R_sp, k, R, ldr); - // Set the rank parameter to the value comuted a posteriori. - this->rank = k; + // Set the rank parameter to the value comuted a posteriori. + this->rank = k; - free(A_hat); - free(R_sp); - free(tau); + free(A_hat); + free(R_sp); + free(tau); - return 0; + return 0; } } // end namespace miniapps #endif diff --git a/miniapps/include/krylov.hh b/miniapps/include/krylov.hh index a7ca818..e24f3a5 100644 --- a/miniapps/include/krylov.hh +++ b/miniapps/include/krylov.hh @@ -14,7 +14,7 @@ namespace miniapps { */ template class Krylov { - public: +public: Krylov(T eps, int max_iters) : eps{eps}, max_iters{max_iters} {} virtual ~Krylov() {} @@ -23,14 +23,18 @@ class Krylov { const std::vector b, std::vector x) = 0; - T get_eps() { return eps; } + T get_eps() { + return eps; + } - int get_max_iters() { return max_iters; } + int get_max_iters() { + return max_iters; + } const T eps; const int max_iters; }; -} +} // namespace miniapps #endif // MINIAPPS_KRYLOV_HH diff --git a/miniapps/include/matrix_utils.hh b/miniapps/include/matrix_utils.hh index 322459d..ebb249c 100644 --- a/miniapps/include/matrix_utils.hh +++ b/miniapps/include/matrix_utils.hh @@ -10,13 +10,15 @@ namespace miniapps { -template struct matrix_data_entry { +template +struct matrix_data_entry { std::size_t row; std::size_t col; T value; }; -template struct matrix_data { +template +struct matrix_data { using nonzero_type = matrix_data_entry; std::size_t num_rows; @@ -102,7 +104,7 @@ template struct matrix_data { if (diag_positions[i] < 0) { nonzeros.emplace_back(i, i, norms[i] * ratio); } else { - auto &diag_value = nonzeros[diag_positions[i]].value; + auto& diag_value = nonzeros[diag_positions[i]].value; const auto diag_magnitude = std::abs(diag_value); const auto offdiag_magnitude = norms[i]; if (diag_magnitude < offdiag_magnitude * ratio) { diff --git a/miniapps/include/util.hh b/miniapps/include/util.hh index 0462f13..0066136 100644 --- a/miniapps/include/util.hh +++ b/miniapps/include/util.hh @@ -5,38 +5,32 @@ #include "rl_lapackpp.hh" #include -#include -#include #include -#include +#include #include +#include +#include namespace miniapps::util { /// A version of the above function to be used on a vector of integers template -void col_swap( - int64_t m, - int64_t n, - int64_t k, - T* A, - int64_t lda, - std::vector idx -) { - if(k > n) - throw std::runtime_error("Invalid rank parameter."); +void col_swap(int64_t m, int64_t n, int64_t k, T* A, int64_t lda, + std::vector idx) { + if (k > n) + throw std::runtime_error("Invalid rank parameter."); - int64_t i, j; //, l; - for (i = 0, j = 0; i < k; ++i) { - j = idx[i] - 1; - blas::swap(m, &A[i * lda], 1, &A[j * lda], 1); + int64_t i, j; //, l; + for (i = 0, j = 0; i < k; ++i) { + j = idx[i] - 1; + blas::swap(m, &A[i * lda], 1, &A[j * lda], 1); - // swap idx array elements - // Find idx element with value i and assign it to j - auto it = std::find(idx.begin() + i, idx.begin() + k, i + 1); - idx[it - (idx.begin())] = j + 1; - } + // swap idx array elements + // Find idx element with value i and assign it to j + auto it = std::find(idx.begin() + i, idx.begin() + k, i + 1); + idx[it - (idx.begin())] = j + 1; + } } -} // end namespace util +} // namespace miniapps::util #endif diff --git a/miniapps/test/randnla/test_cqrrpt.cc b/miniapps/test/randnla/test_cqrrpt.cc index cfed8f6..676a256 100644 --- a/miniapps/test/randnla/test_cqrrpt.cc +++ b/miniapps/test/randnla/test_cqrrpt.cc @@ -6,142 +6,137 @@ #include #include - -class TestCQRRPT : public ::testing::Test -{ - protected: - - virtual void SetUp() {}; - - virtual void TearDown() {}; - - template - struct CQRRPTTestData { - int64_t row; - int64_t col; - int64_t rank; - std::vector A; - std::vector R; - std::vector J; - std::vector A_cpy1; - std::vector A_cpy2; - std::vector I_ref; - - CQRRPTTestData(int64_t m, int64_t n, int64_t k) : - A(m * n, 0.0), - R(n * n, 0.0), - J(n, 0), - A_cpy1(m * n, 0.0), - A_cpy2(m * n, 0.0), - I_ref(k * k, 0.0) { - row = m; - col = n; - rank = k; - } - }; - - template - static void norm_and_copy_computational_helper(T &norm_A, CQRRPTTestData &all_data) { - auto m = all_data.row; - auto n = all_data.col; - - lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy1.data(), m); - lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy2.data(), m); - norm_A = lapack::lange(Norm::Fro, m, n, all_data.A.data(), m); +class TestCQRRPT : public ::testing::Test { +protected: + virtual void SetUp(){}; + + virtual void TearDown(){}; + + template + struct CQRRPTTestData { + int64_t row; + int64_t col; + int64_t rank; + std::vector A; + std::vector R; + std::vector J; + std::vector A_cpy1; + std::vector A_cpy2; + std::vector I_ref; + + CQRRPTTestData(int64_t m, int64_t n, int64_t k) + : A(m * n, 0.0), R(n * n, 0.0), J(n, 0), A_cpy1(m * n, 0.0), + A_cpy2(m * n, 0.0), I_ref(k * k, 0.0) { + row = m; + col = n; + rank = k; } - - template - static void - error_check(T &norm_A, CQRRPTTestData &all_data) { - - auto m = all_data.row; - auto n = all_data.col; - auto k = all_data.rank; - - all_data.I_ref.resize(k * k, 0.0); - for(int i = 0; i < k; ++i) - all_data.I_ref[(k + 1) * i] = 1.0; - - T* A_dat = all_data.A_cpy1.data(); - T const* A_cpy_dat = all_data.A_cpy2.data(); - T const* Q_dat = all_data.A.data(); - T const* R_dat = all_data.R.data(); - T* I_ref_dat = all_data.I_ref.data(); - - // Check orthogonality of Q - // Q' * Q - I = 0 - blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, k, m, 1.0, Q_dat, m, -1.0, I_ref_dat, k); - T norm_QTQ = lapack::lansy(lapack::Norm::Fro, Uplo::Upper, k, I_ref_dat, k); - - // A - QR - blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, n, k, 1.0, Q_dat, m, R_dat, n, -1.0, A_dat, m); - - // Implementing max col norm metric - T max_col_norm = 0.0; - T col_norm = 0.0; - int max_idx = 0; - for(int i = 0; i < n; ++i) { - col_norm = blas::nrm2(m, &A_dat[m * i], 1); - if(max_col_norm < col_norm) { - max_col_norm = col_norm; - max_idx = i; - } - } - T col_norm_A = blas::nrm2(n, &A_cpy_dat[m * max_idx], 1); - T norm_AQR = lapack::lange(Norm::Fro, m, n, A_dat, m); - - printf("REL NORM OF AP - QR: %15e\n", norm_AQR / norm_A); - printf("MAX COL NORM METRIC: %15e\n", max_col_norm / col_norm_A); - printf("FRO NORM OF (Q'Q - I): %2e\n\n", norm_QTQ); - - T atol = std::pow(std::numeric_limits::epsilon(), 0.75); - ASSERT_NEAR(norm_AQR / norm_A, 0.0, atol); - ASSERT_NEAR(max_col_norm / col_norm_A, 0.0, atol); - ASSERT_NEAR(norm_QTQ, 0.0, atol); + }; + + template + static void norm_and_copy_computational_helper(T& norm_A, + CQRRPTTestData& all_data) { + auto m = all_data.row; + auto n = all_data.col; + + lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, + all_data.A_cpy1.data(), m); + lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, + all_data.A_cpy2.data(), m); + norm_A = lapack::lange(Norm::Fro, m, n, all_data.A.data(), m); + } + + template + static void error_check(T& norm_A, CQRRPTTestData& all_data) { + + auto m = all_data.row; + auto n = all_data.col; + auto k = all_data.rank; + + all_data.I_ref.resize(k * k, 0.0); + for (int i = 0; i < k; ++i) + all_data.I_ref[(k + 1) * i] = 1.0; + + T* A_dat = all_data.A_cpy1.data(); + T const* A_cpy_dat = all_data.A_cpy2.data(); + T const* Q_dat = all_data.A.data(); + T const* R_dat = all_data.R.data(); + T* I_ref_dat = all_data.I_ref.data(); + + // Check orthogonality of Q + // Q' * Q - I = 0 + blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, k, m, 1.0, Q_dat, m, + -1.0, I_ref_dat, k); + T norm_QTQ = lapack::lansy(lapack::Norm::Fro, Uplo::Upper, k, I_ref_dat, k); + + // A - QR + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, n, k, 1.0, Q_dat, + m, R_dat, n, -1.0, A_dat, m); + + // Implementing max col norm metric + T max_col_norm = 0.0; + T col_norm = 0.0; + int max_idx = 0; + for (int i = 0; i < n; ++i) { + col_norm = blas::nrm2(m, &A_dat[m * i], 1); + if (max_col_norm < col_norm) { + max_col_norm = col_norm; + max_idx = i; + } } + T col_norm_A = blas::nrm2(n, &A_cpy_dat[m * max_idx], 1); + T norm_AQR = lapack::lange(Norm::Fro, m, n, A_dat, m); - /// General test for CQRRPT: - /// Computes QR factorzation, and computes A[:, J] - QR. - template - static void test_CQRRPT_general( - T d_factor, - T norm_A, - CQRRPTTestData &all_data, - alg_type &CQRRPT) { + printf("REL NORM OF AP - QR: %15e\n", norm_AQR / norm_A); + printf("MAX COL NORM METRIC: %15e\n", max_col_norm / col_norm_A); + printf("FRO NORM OF (Q'Q - I): %2e\n\n", norm_QTQ); - auto m = all_data.row; - auto n = all_data.col; + T atol = std::pow(std::numeric_limits::epsilon(), 0.75); + ASSERT_NEAR(norm_AQR / norm_A, 0.0, atol); + ASSERT_NEAR(max_col_norm / col_norm_A, 0.0, atol); + ASSERT_NEAR(norm_QTQ, 0.0, atol); + } - CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor); - all_data.rank = CQRRPT.rank; + /// General test for CQRRPT: + /// Computes QR factorzation, and computes A[:, J] - QR. + template + static void test_CQRRPT_general(T d_factor, T norm_A, + CQRRPTTestData& all_data, + alg_type& CQRRPT) { - printf("RANK AS RETURNED BY CQRRPT %ld\n", all_data.rank); + auto m = all_data.row; + auto n = all_data.col; - miniapps::util::col_swap(m, n, n, all_data.A_cpy1.data(), m, all_data.J); - miniapps::util::col_swap(m, n, n, all_data.A_cpy2.data(), m, all_data.J); + CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, + all_data.J.data(), d_factor); + all_data.rank = CQRRPT.rank; - error_check(norm_A, all_data); - } + printf("RANK AS RETURNED BY CQRRPT %ld\n", all_data.rank); + + miniapps::util::col_swap(m, n, n, all_data.A_cpy1.data(), m, all_data.J); + miniapps::util::col_swap(m, n, n, all_data.A_cpy2.data(), m, all_data.J); + + error_check(norm_A, all_data); + } }; TEST_F(TestCQRRPT, CQRRPT_full_rank_no_hqrrp) { - int64_t m = 50; - int64_t n = 20; - int64_t k = 20; - double d_factor = 1.25; - double norm_A = 0; - int64_t nnz = 2; - double tol = std::pow(std::numeric_limits::epsilon(), 0.85); - - CQRRPTTestData all_data(m, n, k); - miniapps::CQRRPT CQRRPT(tol, nnz); - // Generate dense matrix - auto [buf, a_shape] = spblas::generate_dense(m, n); - - lapack::lacpy(MatrixType::General, m, n, buf.data(), m, all_data.A.data(), m); - - norm_and_copy_computational_helper(norm_A, all_data); - test_CQRRPT_general>(d_factor, norm_A, all_data, CQRRPT); + int64_t m = 50; + int64_t n = 20; + int64_t k = 20; + double d_factor = 1.25; + double norm_A = 0; + int64_t nnz = 2; + double tol = std::pow(std::numeric_limits::epsilon(), 0.85); + + CQRRPTTestData all_data(m, n, k); + miniapps::CQRRPT CQRRPT(tol, nnz); + // Generate dense matrix + auto [buf, a_shape] = spblas::generate_dense(m, n); + + lapack::lacpy(MatrixType::General, m, n, buf.data(), m, all_data.A.data(), m); + + norm_and_copy_computational_helper(norm_A, all_data); + test_CQRRPT_general>(d_factor, norm_A, + all_data, CQRRPT); } - - diff --git a/miniapps/test/solvers/test_bicgstab.cpp b/miniapps/test/solvers/test_bicgstab.cpp index 445dee3..999702c 100644 --- a/miniapps/test/solvers/test_bicgstab.cpp +++ b/miniapps/test/solvers/test_bicgstab.cpp @@ -7,27 +7,28 @@ #include -template class TestBiCGSTAB : public testing::Test { +template +class TestBiCGSTAB : public testing::Test { public: auto generate_problem(std::size_t m, std::size_t n, std::size_t nnz_input, - std::size_t seed = 0) { - auto &&[values_orig, rowind_orig, colind_orig, shape, nnz_orig] = + std::size_t seed = 0) { + auto&& [values_orig, rowind_orig, colind_orig, shape, nnz_orig] = spblas::generate_coo(m, n, nnz_input, seed); miniapps::matrix_data data(rowind_orig, colind_orig, values_orig, shape); - auto &&[values, rowind, colind, nnz] = data.convert_to_coo(); + auto&& [values, rowind, colind, nnz] = data.convert_to_coo(); auto rowptr = miniapps::convert_rowind_to_rowptr(colind, nnz, shape); return std::tuple(values, rowptr, colind, shape, nnz); } auto generate_spd_problem(std::size_t m, std::size_t n, std::size_t nnz_input, - std::size_t seed = 0) { - auto &&[values_orig, rowind_orig, colind_orig, shape, nnz_orig] = + std::size_t seed = 0) { + auto&& [values_orig, rowind_orig, colind_orig, shape, nnz_orig] = spblas::generate_coo(m, n, nnz_input, seed); miniapps::matrix_data data(rowind_orig, colind_orig, values_orig, shape); data.sort_row_major(); data.make_symmetric(); data.make_diag_dominant(); - auto &&[values, rowind, colind, nnz] = data.convert_to_coo(); + auto&& [values, rowind, colind, nnz] = data.convert_to_coo(); auto rowptr = miniapps::convert_rowind_to_rowptr(colind, nnz, shape); return std::tuple(values, rowptr, colind, shape, nnz); } @@ -41,7 +42,7 @@ TYPED_TEST(TestBiCGSTAB, ConvergesForSmallSystem) { constexpr double tol = std::is_same::value ? 1e-14 : 1e-7; constexpr int max_iters = 100; std::tuple res; - auto &&[values, rowptr, colind, shape, nnz] = + auto&& [values, rowptr, colind, shape, nnz] = this->generate_problem(10, 10, 42, 75); spblas::csr_view a(values, rowptr, colind, shape, nnz); std::vector b(10, 1.0); @@ -60,7 +61,7 @@ TYPED_TEST(TestBiCGSTAB, ConvergesForLargeSystem) { constexpr int max_iters = 100; std::vector b(1000, 1.0); std::vector x(1000, 0.0); - auto &&[values, rowptr, colind, shape, nnz] = + auto&& [values, rowptr, colind, shape, nnz] = this->generate_spd_problem(1000, 1000, 12345, 75); spblas::csr_view a(values, rowptr, colind, shape, nnz); diff --git a/miniapps/test/solvers/test_cg.cpp b/miniapps/test/solvers/test_cg.cpp index 09984c3..780c514 100644 --- a/miniapps/test/solvers/test_cg.cpp +++ b/miniapps/test/solvers/test_cg.cpp @@ -7,19 +7,20 @@ #include -template class TestCG : public testing::Test { +template +class TestCG : public testing::Test { public: using ValueType = T; auto generate_spd_problem(std::size_t m, std::size_t n, std::size_t nnz_input, std::size_t seed = 0) { - auto &&[values_orig, rowind_orig, colind_orig, shape, nnz_orig] = + auto&& [values_orig, rowind_orig, colind_orig, shape, nnz_orig] = spblas::generate_coo(m, n, nnz_input, seed); miniapps::matrix_data data(rowind_orig, colind_orig, values_orig, shape); data.sort_row_major(); data.make_symmetric(); data.make_diag_dominant(); - auto &&[values, rowind, colind, nnz] = data.convert_to_coo(); + auto&& [values, rowind, colind, nnz] = data.convert_to_coo(); auto rowptr = miniapps::convert_rowind_to_rowptr(colind, nnz, shape); return std::tuple(values, rowptr, colind, shape, nnz); } @@ -35,7 +36,7 @@ TYPED_TEST(TestCG, ConvergesForSmallSystem) { std::vector b(10, 1.0); std::vector x(10, 0.0); - auto &&[values, rowptr, colind, shape, nnz] = + auto&& [values, rowptr, colind, shape, nnz] = this->generate_spd_problem(10, 10, 42, 75); spblas::csr_view a(values, rowptr, colind, shape, nnz); @@ -53,7 +54,7 @@ TYPED_TEST(TestCG, ConvergesForLargeSystem) { std::vector b(1000, 1.0); std::vector x(1000, 0.0); - auto &&[values, rowptr, colind, shape, nnz] = + auto&& [values, rowptr, colind, shape, nnz] = this->generate_spd_problem(1000, 1000, 12345, 75); spblas::csr_view a(values, rowptr, colind, shape, nnz); From 46c7ab9c28da42dfbdc19ae0b79aaf1a0352d8ad Mon Sep 17 00:00:00 2001 From: Benjamin Brock Date: Thu, 6 Mar 2025 10:59:42 -0800 Subject: [PATCH 4/5] `SPBLAS_BUILD_MINIAPPS` -> `ENABLE_MINIAPPS`, remove `enable_testing()`, since we're using GTest, not CTest. --- CMakeLists.txt | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d4d1c0f..0644cae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,9 +6,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_FLAGS "-O3 -march=native") -option(SPBLAS_BUILD_MINIAPPS "Also build the miniapps. Default: OFF." OFF) - -enable_testing() +option(ENABLE_MINIAPPS "Also build the miniapps. Default: OFF." OFF) # Get includes, which declares the `spblas` library add_subdirectory(include) @@ -94,7 +92,7 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) add_subdirectory(examples) add_subdirectory(test) - if (SPBLAS_BUILD_MINIAPPS) + if (ENABLE_MINIAPPS) add_subdirectory(miniapps) endif() endif() From 023f18e50f85834bd216aaced3ab59556bced806 Mon Sep 17 00:00:00 2001 From: Benjamin Brock Date: Thu, 6 Mar 2025 13:03:04 -0800 Subject: [PATCH 5/5] Updates to work with MKL backend. --- .../spblas/vendor/onemkl_sycl/algorithms.hpp | 1 + include/spblas/vendor/onemkl_sycl/gemm.hpp | 51 +++++++++++++++++++ miniapps/include/matrix_utils.hh | 16 +++--- miniapps/test/solvers/test_cg.cpp | 2 +- 4 files changed, 61 insertions(+), 9 deletions(-) create mode 100644 include/spblas/vendor/onemkl_sycl/gemm.hpp diff --git a/include/spblas/vendor/onemkl_sycl/algorithms.hpp b/include/spblas/vendor/onemkl_sycl/algorithms.hpp index 9f582a8..cafcbcf 100644 --- a/include/spblas/vendor/onemkl_sycl/algorithms.hpp +++ b/include/spblas/vendor/onemkl_sycl/algorithms.hpp @@ -1,5 +1,6 @@ #pragma once +#include "gemm.hpp" #include "spgemm_impl.hpp" #include "spmm_impl.hpp" #include "spmv_impl.hpp" diff --git a/include/spblas/vendor/onemkl_sycl/gemm.hpp b/include/spblas/vendor/onemkl_sycl/gemm.hpp new file mode 100644 index 0000000..e949059 --- /dev/null +++ b/include/spblas/vendor/onemkl_sycl/gemm.hpp @@ -0,0 +1,51 @@ +#pragma once + +#include + +#include +#include + +namespace spblas { + +template + requires __detail::has_mdspan_matrix_base && + __detail::has_mdspan_matrix_base && + __detail::is_matrix_instantiation_of_mdspan_v +void multiply(A&& a, B&& b, C&& c) { + auto a_base = __detail::get_ultimate_base(a); + auto b_base = __detail::get_ultimate_base(a); + + auto alpha_optional = __detail::get_scaling_factor(a, b); + tensor_scalar_t alpha = alpha_optional.value_or(1); + + sycl::queue q(sycl::cpu_selector_v); + + /* + namespace oneapi::mkl::blas::row_major { + sycl::event gemm(sycl::queue &queue, + onemkl::transpose transa, + onemkl::transpose transb, + std::int64_t m, + std::int64_t n, + std::int64_t k, + Ts alpha, + const Ta *a, + std::int64_t lda, + const Tb *b, + std::int64_t ldb, + Ts beta, + Tc *c, + std::int64_t ldc, + const std::vector &dependencies = {}) + } +*/ + + oneapi::mkl::blas::row_major::gemm( + q, oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, + __backend::shape(a)[0], __backend::shape(c)[1], __backend::shape(a)[1], + alpha, a_base.data_handle(), __backend::shape(a)[1], b_base.data_handle(), + __backend::shape(b)[1], 0, c.data_handle(), __backend::shape(c)[1]) + .wait(); +} + +} // namespace spblas diff --git a/miniapps/include/matrix_utils.hh b/miniapps/include/matrix_utils.hh index ebb249c..50adaa2 100644 --- a/miniapps/include/matrix_utils.hh +++ b/miniapps/include/matrix_utils.hh @@ -17,17 +17,17 @@ struct matrix_data_entry { T value; }; -template +template struct matrix_data { using nonzero_type = matrix_data_entry; - std::size_t num_rows; - std::size_t num_cols; + I num_rows; + I num_cols; std::vector nonzeros; - matrix_data(std::vector rowind, std::vector colind, - std::vector values, spblas::index shape) - : num_rows{shape[0]}, num_cols{shape[1]} { + matrix_data(std::vector rowind, std::vector colind, + std::vector values, spblas::index shape) + : num_rows(shape[0]), num_cols(shape[1]) { assert(rowind.size() == colind.size() && rowind.size() == values.size()); for (std::size_t i = 0; i < values.size(); i++) { nonzeros.emplace_back(rowind[i], colind[i], values[i]); @@ -37,8 +37,8 @@ struct matrix_data { auto convert_to_coo() { auto nnz = nonzeros.size(); std::vector values(nnz); - std::vector rowind(nnz); - std::vector colind(nnz); + std::vector rowind(nnz); + std::vector colind(nnz); for (std::size_t i = 0; i < nnz; i++) { values[i] = nonzeros[i].value; diff --git a/miniapps/test/solvers/test_cg.cpp b/miniapps/test/solvers/test_cg.cpp index 780c514..0e159ca 100644 --- a/miniapps/test/solvers/test_cg.cpp +++ b/miniapps/test/solvers/test_cg.cpp @@ -15,7 +15,7 @@ class TestCG : public testing::Test { auto generate_spd_problem(std::size_t m, std::size_t n, std::size_t nnz_input, std::size_t seed = 0) { auto&& [values_orig, rowind_orig, colind_orig, shape, nnz_orig] = - spblas::generate_coo(m, n, nnz_input, seed); + spblas::generate_coo(m, n, nnz_input, seed); miniapps::matrix_data data(rowind_orig, colind_orig, values_orig, shape); data.sort_row_major(); data.make_symmetric();