From 23d26bc1ad79e9d1ef1f45ded7e7425b921027a2 Mon Sep 17 00:00:00 2001
From: Terry Cojean <terry.cojean@kit.edu>
Date: Wed, 26 Feb 2025 23:23:14 +0100
Subject: [PATCH 1/5] Merge the miniapps as an optional

This facilitates contribution, keeping the repositories in sync, and exapnds from simple examples.
Cleanup is necessary to make this more convenient for CI and more consistent.

Co-authored-by: Max Melnichenko <m.y.melnich@gmail.com>
---
 CMakeLists.txt                                |   6 +
 miniapps/CMake/blaspp.cmake                   |  14 ++
 miniapps/CMake/lapackpp.cmake                 |  14 ++
 miniapps/CMake/miniappsConfig.cmake.in        |  22 ++
 miniapps/CMake/miniappsConfigVersion.cmake.in |  11 +
 miniapps/CMake/rl_build_options.cmake         |  29 +++
 miniapps/CMake/rl_config.cmake                |  11 +
 miniapps/CMake/rl_version.cmake               |  27 +++
 miniapps/CMakeLists.txt                       |  30 +++
 miniapps/README.md                            |   0
 miniapps/include/CMakeLists.txt               |  52 +++++
 miniapps/include/bicgstab.hh                  | 160 +++++++++++++
 miniapps/include/cg.hh                        | 107 +++++++++
 miniapps/include/cqrrpt.hh                    | 221 ++++++++++++++++++
 miniapps/include/krylov.hh                    |  36 +++
 miniapps/include/matrix_utils.hh              | 149 ++++++++++++
 miniapps/include/miniapps.hh                  |  19 ++
 miniapps/include/rl_blaspp.hh                 |  12 +
 miniapps/include/rl_config.hh.in              |  11 +
 miniapps/include/rl_lapackpp.hh               |  12 +
 miniapps/include/util.hh                      |  42 ++++
 miniapps/test/CMakeLists.txt                  |   2 +
 miniapps/test/randnla/CMakeLists.txt          |   7 +
 miniapps/test/randnla/test_cqrrpt.cc          | 147 ++++++++++++
 miniapps/test/solvers/CMakeLists.txt          |   8 +
 miniapps/test/solvers/test_bicgstab.cpp       |  72 ++++++
 miniapps/test/solvers/test_cg.cpp             |  65 ++++++
 27 files changed, 1286 insertions(+)
 create mode 100644 miniapps/CMake/blaspp.cmake
 create mode 100644 miniapps/CMake/lapackpp.cmake
 create mode 100644 miniapps/CMake/miniappsConfig.cmake.in
 create mode 100644 miniapps/CMake/miniappsConfigVersion.cmake.in
 create mode 100644 miniapps/CMake/rl_build_options.cmake
 create mode 100644 miniapps/CMake/rl_config.cmake
 create mode 100644 miniapps/CMake/rl_version.cmake
 create mode 100644 miniapps/CMakeLists.txt
 create mode 100644 miniapps/README.md
 create mode 100644 miniapps/include/CMakeLists.txt
 create mode 100644 miniapps/include/bicgstab.hh
 create mode 100644 miniapps/include/cg.hh
 create mode 100644 miniapps/include/cqrrpt.hh
 create mode 100644 miniapps/include/krylov.hh
 create mode 100644 miniapps/include/matrix_utils.hh
 create mode 100644 miniapps/include/miniapps.hh
 create mode 100644 miniapps/include/rl_blaspp.hh
 create mode 100644 miniapps/include/rl_config.hh.in
 create mode 100644 miniapps/include/rl_lapackpp.hh
 create mode 100644 miniapps/include/util.hh
 create mode 100644 miniapps/test/CMakeLists.txt
 create mode 100644 miniapps/test/randnla/CMakeLists.txt
 create mode 100644 miniapps/test/randnla/test_cqrrpt.cc
 create mode 100644 miniapps/test/solvers/CMakeLists.txt
 create mode 100644 miniapps/test/solvers/test_bicgstab.cpp
 create mode 100644 miniapps/test/solvers/test_cg.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5734ab3..c439e1f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,6 +6,8 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
 set(CMAKE_CXX_FLAGS "-O3 -march=native")
 
+option(SPBLAS_BUILD_MINIAPPS "Also build the miniapps. Default: OFF." OFF)
+
 # Get includes, which declares the `spblas` library
 add_subdirectory(include)
 
@@ -90,3 +92,7 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
   add_subdirectory(examples)
   add_subdirectory(test)
 endif()
+
+if (SPBLAS_BUILD_MINIAPPS)
+  add_subdirectory(miniapps)
+endif()
diff --git a/miniapps/CMake/blaspp.cmake b/miniapps/CMake/blaspp.cmake
new file mode 100644
index 0000000..986763a
--- /dev/null
+++ b/miniapps/CMake/blaspp.cmake
@@ -0,0 +1,14 @@
+message(STATUS "\n\n-- miniapps checking for blaspp ... ")
+find_package(blaspp REQUIRED)
+message(STATUS "miniapps found blaspp ${blaspp_VERSION}\n")
+
+# interface library for use elsewhere in the project
+add_library(miniapps_blaspp INTERFACE)
+
+target_link_libraries(miniapps_blaspp INTERFACE blaspp)
+
+install(TARGETS miniapps_blaspp EXPORT miniapps_blaspp)
+
+install(EXPORT miniapps_blaspp
+    DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake"
+    EXPORT_LINK_INTERFACE_LIBRARIES)
\ No newline at end of file
diff --git a/miniapps/CMake/lapackpp.cmake b/miniapps/CMake/lapackpp.cmake
new file mode 100644
index 0000000..e7d9827
--- /dev/null
+++ b/miniapps/CMake/lapackpp.cmake
@@ -0,0 +1,14 @@
+message(STATUS "\n\n-- miniapps checking for lapackpp ... ")
+find_package(lapackpp REQUIRED)
+message(STATUS "miniapps found lapackpp ${lapackpp_VERSION}\n")
+
+# interface library for use elsewhere in the project
+add_library(miniapps_lapackpp INTERFACE)
+
+target_link_libraries(miniapps_lapackpp INTERFACE lapackpp)
+
+install(TARGETS miniapps_lapackpp EXPORT miniapps_lapackpp)
+
+install(EXPORT miniapps_lapackpp
+    DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake"
+    EXPORT_LINK_INTERFACE_LIBRARIES)
diff --git a/miniapps/CMake/miniappsConfig.cmake.in b/miniapps/CMake/miniappsConfig.cmake.in
new file mode 100644
index 0000000..6e89b67
--- /dev/null
+++ b/miniapps/CMake/miniappsConfig.cmake.in
@@ -0,0 +1,22 @@
+include(CMakeFindDependencyMacro)
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}")
+
+set(miniapps_VERSION "@miniapps_VERSION@")
+set(miniapps_VERSION_MAJOR "@miniapps_VERSION_MAJOR@")
+set(miniapps_VERSION_MINOR "@miniapps_VERSION_MINOR@")
+set(miniapps_VERSION_PATCH "@miniapps_VERSION_PATCH@")
+set(miniapps_VERSION_DEVEL "@miniapps_VERSION_DEVEL@")
+
+# blas++
+if (NOT blaspp_DIR)
+    set(blaspp_DIR @blaspp_DIR@)
+endif()
+find_dependency(blaspp)
+
+# lapack++
+if (NOT lapackpp_DIR)
+    set(lapackpp_DIR @lapackpp_DIR@)
+endif()
+find_dependency(lapackpp)
+
+include(miniapps)
diff --git a/miniapps/CMake/miniappsConfigVersion.cmake.in b/miniapps/CMake/miniappsConfigVersion.cmake.in
new file mode 100644
index 0000000..582661d
--- /dev/null
+++ b/miniapps/CMake/miniappsConfigVersion.cmake.in
@@ -0,0 +1,11 @@
+set(PACKAGE_VERSION "@miniapps_VERSION@")
+
+# Check whether the requested PACKAGE_FIND_VERSION is compatible
+if("${PACKAGE_VERSION}" VERSION_LESS "${PACKAGE_FIND_VERSION}")
+    set(PACKAGE_VERSION_COMPATIBLE FALSE)
+else()
+    set(PACKAGE_VERSION_COMPATIBLE TRUE)
+    if ("${PACKAGE_VERSION}" VERSION_EQUAL "${PACKAGE_FIND_VERSION}")
+        set(PACKAGE_VERSION_EXACT TRUE)
+    endif()
+endif()
diff --git a/miniapps/CMake/rl_build_options.cmake b/miniapps/CMake/rl_build_options.cmake
new file mode 100644
index 0000000..ff412d1
--- /dev/null
+++ b/miniapps/CMake/rl_build_options.cmake
@@ -0,0 +1,29 @@
+set(CMAKE_CXX_STANDARD 23)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+option(BUILD_SHARED_LIBS OFF "Configure to build shared or static libraries")
+
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE "Release"
+  CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel." FORCE)
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
+endif()
+
+set(SANITIZE_ADDRESS OFF CACHE BOOL "Add address sanitizer flags to the library")
+
+message(STATUS "Checking for OpenMP ... ")
+find_package(OpenMP COMPONENTS CXX)
+
+set(tmp FALSE)
+if (OpenMP_CXX_FOUND)
+    set(tmp TRUE)
+endif()
+
+set(miniapps_HAS_OpenMP ${tmp} CACHE BOOL "Set if we have a working OpenMP")
+
+include(GNUInstallDirs)
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}")
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}")
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_BINDIR}")
+
diff --git a/miniapps/CMake/rl_config.cmake b/miniapps/CMake/rl_config.cmake
new file mode 100644
index 0000000..8df22e8
--- /dev/null
+++ b/miniapps/CMake/rl_config.cmake
@@ -0,0 +1,11 @@
+
+configure_file(CMake/miniappsConfig.cmake.in
+    ${CMAKE_INSTALL_LIBDIR}/cmake/miniappsConfig.cmake @ONLY)
+
+configure_file(CMake/miniappsConfigVersion.cmake.in
+    ${CMAKE_INSTALL_LIBDIR}/cmake/miniappsConfigVersion.cmake @ONLY)
+
+install(FILES
+    ${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}/cmake/miniappsConfig.cmake
+    ${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}/cmake/miniappsConfigVersion.cmake
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake)
diff --git a/miniapps/CMake/rl_version.cmake b/miniapps/CMake/rl_version.cmake
new file mode 100644
index 0000000..4dc2f3c
--- /dev/null
+++ b/miniapps/CMake/rl_version.cmake
@@ -0,0 +1,27 @@
+set(tmp)
+find_package(Git QUIET)
+if(GIT_FOUND)
+    execute_process(COMMAND ${GIT_EXECUTABLE}
+        --git-dir=${CMAKE_SOURCE_DIR}/.git describe
+        --tags --match "[0-9]*.[0-9]*.[0-9]*"
+        OUTPUT_VARIABLE tmp OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET)
+endif()
+if(NOT tmp)
+    set(tmp "0.0.0")
+endif()
+
+set(miniapps_VERSION ${tmp} CACHE STRING "miniapps version" FORCE)
+
+string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*$)"
+  "\\1" miniapps_VERSION_MAJOR ${miniapps_VERSION})
+
+string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*$)"
+  "\\2" miniapps_VERSION_MINOR ${miniapps_VERSION})
+
+string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*$)"
+  "\\3" miniapps_VERSION_PATCH ${miniapps_VERSION})
+
+message(STATUS "miniapps_VERSION_MAJOR=${miniapps_VERSION_MAJOR}")
+message(STATUS "miniapps_VERSION_MINOR=${miniapps_VERSION_MINOR}")
+message(STATUS "miniapps_VERSION_PATCH=${miniapps_VERSION_PATCH}")
diff --git a/miniapps/CMakeLists.txt b/miniapps/CMakeLists.txt
new file mode 100644
index 0000000..b3cd1b4
--- /dev/null
+++ b/miniapps/CMakeLists.txt
@@ -0,0 +1,30 @@
+cmake_minimum_required(VERSION 3.20)
+
+project(miniapps)
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMake")
+
+set(CMAKE_CXX_STANDARD 23)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# configure the build
+enable_testing()
+
+include(rl_build_options)
+include(rl_version)
+
+# find dependencies
+find_package(lapackpp REQUIRED)
+
+# find dependencies
+find_package(blaspp REQUIRED)
+
+# compile sources
+add_subdirectory(include)
+
+if (ENABLE_TESTING)
+    add_subdirectory(test)
+endif()
+
+# export the configuration
+include(rl_config)
diff --git a/miniapps/README.md b/miniapps/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/miniapps/include/CMakeLists.txt b/miniapps/include/CMakeLists.txt
new file mode 100644
index 0000000..b6f2c82
--- /dev/null
+++ b/miniapps/include/CMakeLists.txt
@@ -0,0 +1,52 @@
+set(miniapps_libs
+    lapackpp
+    blaspp
+    spblas
+)
+if (miniapps_HAS_OpenMP)
+    list(APPEND miniapps_libs OpenMP::OpenMP_CXX)
+endif()
+
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/rl_config.hh.in rl_config.hh)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/rl_config.hh
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/miniapps)
+
+add_library(miniapps INTERFACE)
+target_include_directories(miniapps INTERFACE
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/..>
+  $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
+)
+
+set(miniapps_cxx_opts -Wall -Wextra)
+if (SANITIZE_ADDRESS)
+    list(APPEND miniapps_cxx_opts -fsanitize=address)
+    target_link_options(miniapps INTERFACE -fsanitize=address)
+endif()
+target_compile_options(miniapps INTERFACE ${miniapps_cxx_opts})
+
+target_link_libraries(miniapps INTERFACE ${miniapps_libs})
+
+install(
+    DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+    DESTINATION include FILES_MATCHING PATTERN "*.hh"
+)
+
+install(
+    DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
+    DESTINATION include FILES_MATCHING PATTERN "*.hh"
+)
+
+install(
+    TARGETS miniapps
+    EXPORT miniapps
+    INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/miniapps
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+)
+
+# install(
+#     EXPORT miniapps
+#     DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake
+#     EXPORT_LINK_INTERFACE_LIBRARIES
+# )
diff --git a/miniapps/include/bicgstab.hh b/miniapps/include/bicgstab.hh
new file mode 100644
index 0000000..09b11d5
--- /dev/null
+++ b/miniapps/include/bicgstab.hh
@@ -0,0 +1,160 @@
+#ifndef MINIAPPS_BICGSTAB_HH
+#define MINIAPPS_BICGSTAB_HH
+
+#include <algorithm>
+#include <iostream>
+#include <utility>
+#include <vector>
+
+#include "krylov.hh"
+#include "rl_blaspp.hh"
+#include "util.hh"
+#include <spblas/spblas.hpp>
+
+namespace miniapps {
+
+template <typename T> class BiCGSTAB : public Krylov<T> {
+public:
+  BiCGSTAB(T eps, int max_iters) : Krylov<T>{eps, max_iters} {}
+
+  /**
+   * BiCGSTAB or the Bi-Conjugate Gradient-Stabilized is a Krylov subspace
+   * solver.
+   *
+   * Being a generic solver, it is capable of solving general matrices,
+   * including non-s.p.d matrices. Though, the memory and the computational
+   * requirement of the BiCGSTAB solver are higher than of its s.p.d solver
+   * counterpart, it has the capability to solve generic systems. It was
+   * developed by stabilizing the BiCG method.
+   *
+   * We implement the BiCGSTAB variant from Section 2.3.8 of "Templates for t
+   * e Solution of Linear Systems: Building Blocks for Iterative Methods"
+   *
+   * @param[in] A  the matrix in CSR format
+   * @param[in] b  the right hand side vector
+   * @param[out] x  the solution vector
+   */
+  std::tuple<double, int> apply(const spblas::csr_view<T> a,
+                                const std::vector<T> b,
+                                std::vector<T> x) override;
+};
+
+// -----------------------------------------------------------------------------
+template <typename T>
+std::tuple<double, int> BiCGSTAB<T>::apply(const spblas::csr_view<T> a,
+                                           const std::vector<T> b,
+                                           std::vector<T> x) {
+  int iters = 0;
+  double error = 1.0;
+
+  // r = dense_b
+  // prev_rho = rho = omega = alpha = beta = gamma = 1.0
+  // rr = v = s = t = z = y = p = 0
+  std::vector<T> r(b);
+  std::vector<T> rr(b.size(), 0.0);
+  std::vector<T> v(b.size(), 0.0);
+  std::vector<T> s(b.size(), 0.0);
+  std::vector<T> t(b.size(), 0.0);
+  std::vector<T> z(b.size(), 0.0);
+  std::vector<T> y(b.size(), 0.0);
+  std::vector<T> p(b.size(), 0.0);
+  std::vector<T> tmp(b.size(), 0.0);
+
+  double prev_rho, rho, omega, alpha, beta, gamma;
+  prev_rho = rho = omega = alpha = beta = gamma = 1.0;
+
+  // r = b - Ax
+  spblas::multiply(a, x, tmp);
+  blas::axpy(r.size(), -1.0, tmp.data(), 1, r.data(), 1);
+  // rr = r
+  rr = r;
+  while (true) {
+    iters++;
+
+    // rho = dot(rr, r)
+    rho = blas::dot(r.size(), rr.data(), 1, r.data(), 1);
+
+    error = blas::nrm2(r.size(), r.data(), 1);
+    if (iters >= this->max_iters || error < this->eps) {
+      std::cout << "error = " << error << "\n";
+      std::cout << "iters = " << iters << "\n";
+      std::cout << "eps = " << this->eps << "\n";
+      std::cout << "max_iters = " << this->max_iters << "\n";
+      break;
+    }
+
+    if (prev_rho * omega == 0.0) {
+      p = r;
+    } else {
+      // beta = (rho / prev_rho) * (alpha / omega)
+      beta = (rho / prev_rho) * (alpha / omega);
+      // p = r + beta * (p - omega * v)
+      tmp = p;
+      blas::axpy(tmp.size(), -omega, v.data(), 1, tmp.data(), 1);
+      blas::scal(tmp.size(), beta, tmp.data(), 1);
+      blas::axpy(tmp.size(), 1.0, r.data(), 1, tmp.data(), 1);
+      p = tmp;
+    }
+
+    // y = preconditioner * p
+    y = p;
+    // v = A * y
+    spblas::multiply(a, y, v);
+    // beta = dot(rr, v)
+    beta = blas::dot(v.size(), rr.data(), 1, v.data(), 1);
+    if (beta == 0.0) {
+      s = r;
+    } else {
+      // alpha = rho / beta
+      alpha = rho / beta;
+      // s = r - alpha * v
+      tmp = v;
+      blas::scal(tmp.size(), -alpha, tmp.data(), 1);
+      blas::axpy(tmp.size(), 1.0, r.data(), 1, tmp.data(), 1);
+      s = tmp;
+    }
+
+    error = blas::nrm2(s.size(), s.data(), 1);
+    if (iters >= this->max_iters || error < this->eps) {
+      std::cout << "error = " << error << "\n";
+      std::cout << "iters = " << iters << "\n";
+      std::cout << "eps = " << this->eps << "\n";
+      std::cout << "max_iters = " << this->max_iters << "\n";
+      blas::axpy(x.size(), alpha, y.data(), 1, x.data(), 1);
+      break;
+    }
+
+    // z = preconditioner * s
+    z = s;
+    // t = A * z
+    spblas::multiply(a, z, t);
+    // gamma = dot(t, s)
+    gamma = blas::dot(s.size(), t.data(), 1, s.data(), 1);
+    // beta = dot(t, t)
+    beta = blas::dot(t.size(), t.data(), 1, t.data(), 1);
+    // omega = gamma / beta
+    if (beta == 0.0) {
+      omega = 0.0;
+    } else {
+      omega = gamma / beta;
+    }
+    // x = x + alpha * y + omega * z
+    tmp = z;
+    blas::scal(tmp.size(), omega, tmp.data(), 1);
+    blas::axpy(tmp.size(), alpha, y.data(), 1, tmp.data(), 1);
+    blas::axpy(x.size(), 1.0, tmp.data(), 1, x.data(), 1);
+
+    // r = s - omega * t
+    tmp = t;
+    blas::scal(tmp.size(), -omega, tmp.data(), 1);
+    blas::axpy(tmp.size(), 1.0, s.data(), 1, tmp.data(), 1);
+    r = tmp;
+
+    std::swap(prev_rho, rho);
+  }
+  return {error, iters};
+}
+
+} // namespace miniapps
+
+#endif // MINIAPPS_BICGSTAB_HH
diff --git a/miniapps/include/cg.hh b/miniapps/include/cg.hh
new file mode 100644
index 0000000..560ae56
--- /dev/null
+++ b/miniapps/include/cg.hh
@@ -0,0 +1,107 @@
+#ifndef MINIAPPS_CG_HH
+#define MINIAPPS_CG_HH
+
+#include <algorithm>
+#include <iostream>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "krylov.hh"
+#include "rl_blaspp.hh"
+#include "util.hh"
+#include <spblas/spblas.hpp>
+
+namespace miniapps {
+
+template <typename T> class CG : public Krylov<T> {
+public:
+  CG(T eps, int max_iters) : Krylov<T>{eps, max_iters} {}
+
+  /**
+   * CG or the conjugate gradient method is an iterative type Krylov subspace
+   * method which is suitable for symmetric positive definite methods.
+   *
+   * Though this method performs very well for symmetric positive definite
+   * matrices, it is in general not suitable for general matrices.
+   *
+   * We implement the CG variant from Section 2.3.1 of "Templates for the
+   * Solution of Linear Systems: Building Blocks for Iterative Methods"
+   *
+   * @param[in] A  the matrix in CSR format
+   * @param[in] b  the right hand side vector
+   * @param[out] x  the solution vector
+   */
+  std::tuple<double, int> apply(const spblas::csr_view<T> a,
+                                const std::vector<T> b,
+                                std::vector<T> x) override;
+};
+
+// -----------------------------------------------------------------------------
+template <typename T>
+std::tuple<double, int> CG<T>::apply(const spblas::csr_view<T> a,
+                                     const std::vector<T> b, std::vector<T> x) {
+  int iters = 0;
+  double error = 1.0;
+
+  // r = b
+  // rho = 0.0
+  // prev_rho = 1.0
+  // p = q = 0
+  std::vector<T> r(b);
+  std::vector<T> z(r);
+  std::vector<T> p(b.size(), 0);
+  std::vector<T> q(b.size(), 0);
+  std::vector<T> tmp(b.size(), 0);
+
+  double alpha, beta, rho;
+  alpha = beta = rho = 0.0;
+  double prev_rho = 1.0;
+
+  // r = b-Ax
+  spblas::multiply(a, x, tmp);
+  blas::axpy(r.size(), -1.0, tmp.data(), 1, r.data(), 1);
+
+  while (true) {
+    // z = preconditioner*r
+    z = r;
+    // rho = dot(r, z)
+    rho = blas::dot(r.size(), r.data(), 1, z.data(), 1);
+
+    iters++;
+    error = blas::nrm2(r.size(), r.data(), 1);
+    if (iters >= this->max_iters || error < this->eps) {
+      std::cout << "error = " << error << "\n";
+      std::cout << "iters = " << iters << "\n";
+      std::cout << "eps = " << this->eps << "\n";
+      std::cout << "max_iters = " << this->max_iters << "\n";
+      break;
+    }
+
+    // beta = rho / prev_rho;
+    // p = z + beta * p
+    beta = rho / prev_rho;
+    tmp = p;
+    blas::scal(tmp.size(), beta, tmp.data(), 1);
+    blas::axpy(tmp.size(), 1.0, z.data(), 1, tmp.data(), 1);
+    p = tmp;
+
+    // q = A * p
+    spblas::multiply(a, p, q);
+
+    // alpha = rho / dot(p, q)
+    alpha = rho / blas::dot(q.size(), p.data(), 1, q.data(), 1);
+
+    // x = x + alpha * p
+    // r = r - alpha * q
+    blas::axpy(x.size(), alpha, p.data(), 1, x.data(), 1);
+    blas::axpy(r.size(), -alpha, q.data(), 1, r.data(), 1);
+
+    std::swap(rho, prev_rho);
+  }
+  return {error, iters};
+}
+
+} // namespace miniapps
+
+#endif // MINIAPPS_CG_HH
diff --git a/miniapps/include/cqrrpt.hh b/miniapps/include/cqrrpt.hh
new file mode 100644
index 0000000..cc1c380
--- /dev/null
+++ b/miniapps/include/cqrrpt.hh
@@ -0,0 +1,221 @@
+#ifndef miniapps_cqrrpt_h
+#define miniapps_cqrrpt_h
+
+#include <cstdint>
+#include <vector>
+#include <chrono>
+#include <numeric>
+
+#include "util.hh"
+#include "rl_blaspp.hh"
+#include "rl_lapackpp.hh"
+#include <spblas/spblas.hpp>
+
+namespace miniapps {
+
+template <typename T>
+class CQRRPTalg {
+    public:
+
+        virtual ~CQRRPTalg() {}
+
+        virtual int call(
+            int64_t m,
+            int64_t n,
+            T* A,
+            int64_t lda,
+            T* R,
+            int64_t ldr,
+            int64_t* J,
+            T d_factor
+        ) = 0;
+};
+
+template <typename T>
+class CQRRPT : public CQRRPTalg<T> {
+    public:
+        CQRRPT(T ep, int64_t nz) {
+            eps = ep;
+            nnz = nz;
+        }
+
+        /// Computes a QR factorization with column pivots of the form:
+        ///     A[:, J] = QR,
+        /// where Q and R are of size m-by-k and k-by-n, with rank(A) = k.
+        /// Detailed description of this algorithm may be found in Section 5.1.2.
+        /// of "the RandLAPACK book". 
+        ///
+        /// @param[in] m
+        ///     The number of rows in the matrix A.
+        ///
+        /// @param[in] n
+        ///     The number of columns in the matrix A.
+        ///
+        /// @param[in] A
+        ///     The m-by-n matrix A, stored in a column-major format.
+        ///
+        /// @param[in] d
+        ///     Embedding dimension of a sketch, m >= d >= n.
+        ///
+        /// @param[in] R
+        ///     Represents the upper-triangular R factor of QR factorization.
+        ///     On entry, is empty and may not have any space allocated for it.
+        ///
+        /// @param[out] A
+        ///     Overwritten by an m-by-k orthogonal Q factor.
+        ///     Matrix is stored explicitly.
+        ///
+        /// @param[out] R
+        ///     Stores k-by-n matrix with upper-triangular R factor.
+        ///     Zero entries are not compressed.
+        ///
+        /// @param[out] J
+        ///     Stores k integer type pivot index extries.
+        ///
+        /// @return = 0: successful exit
+        ///
+        /// @return = 1: cholesky factorization failed
+        ///
+
+        int call(
+            int64_t m,
+            int64_t n,
+            T* A,
+            int64_t lda,
+            T* R,
+            int64_t ldr,
+            int64_t* J,
+            T d_factor
+        ) override;
+
+    public:
+        T eps;
+        int64_t nnz;
+        int64_t rank;
+};
+
+// -----------------------------------------------------------------------------
+template <typename T>
+int CQRRPT<T>::call(
+    int64_t m,
+    int64_t n,
+    T* A,
+    int64_t lda,
+    T* R,
+    int64_t ldr,
+    int64_t* J,
+    T d_factor
+){
+    int i;
+    int64_t k = n;
+    int64_t d = d_factor * n;
+    // A constant for initial rank estimation.
+    T eps_initial_rank_estimation = 2 * std::pow(std::numeric_limits<T>::epsilon(), 0.95);
+    // Variables for a posteriori rank estimation.
+    int64_t new_rank;
+    T running_max, running_min, curr_entry;
+
+    T* A_hat = ( T * ) calloc( d * n, sizeof( T ) );
+    T* tau   = ( T * ) calloc( n, sizeof( T ) );
+    // Buffer for column pivoting.
+    std::vector<int64_t> J_buf(n, 0);
+    
+    /* RandBLAS style
+    /// Generating a SASO
+    RandBLAS::SparseDist DS = {.n_rows = d, .n_cols = m, .vec_nnz = this->nnz};
+    RandBLAS::SparseSkOp<T, RNG> S(DS, state);
+    state = RandBLAS::fill_sparse(S);
+
+    /// Applying a SASO
+    RandBLAS::sketch_general(
+        Layout::ColMajor, Op::NoTrans, Op::NoTrans,
+        d, n, m, 1.0, S, 0, 0, A, lda, 0.0, A_hat, d
+    );
+    */
+
+    /// SparseBLAS style
+    /// Our sketching operators must be in COO format.
+    //auto&& [values, rowptr, colind, shape, _] = spblas::generate_coo<T>(d, m, nnz);
+    /// TODO: add COO view
+    //spblas::coo_view<T> s(values, rowptr, colind, shape, nnz);
+    
+    /// Perform dense sketching for the sake of tests passing.
+    /// This is not the intended behavior for this algorithm, as dense sketching 
+    /// tanks the performance of CQRRPT.
+    auto [buf, a_shape] = spblas::generate_gaussian<double>(d, m);
+    spblas::__mdspan::mdspan s(buf.data(), d, m);
+
+    spblas::__mdspan::mdspan a(A, m, n);
+    spblas::__mdspan::mdspan ahat(A_hat, d, n);
+
+    spblas::multiply(s, a, ahat);
+    A_hat = ahat.data_handle();
+
+    /// Performing QRCP on a sketch
+    lapack::geqp3(d, n, A_hat, d, J, tau);
+
+    /// Using naive rank estimation to ensure that R used for preconditioning is invertible.
+    /// The actual rank estimate k will be computed a posteriori. 
+    /// Using R[i,i] to approximate the i-th singular value of A_hat. 
+    /// Truncate at the largest i where R[i,i] / R[0,0] >= eps.
+    for(i = 0; i < n; ++i) {
+        if(std::abs(A_hat[i * d + i]) / std::abs(A_hat[0]) < eps_initial_rank_estimation) {
+            k = i;
+            break;
+        }
+    }
+    this->rank = k;
+
+    // Allocating space for a preconditioner buffer.
+    T* R_sp  = ( T * ) calloc( k * k, sizeof( T ) );
+    /// Extracting a k by k upper-triangular R.
+    lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R_sp, k);
+    /// Extracting a k by n R representation (k by k upper-triangular, rest - general)
+    lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R, ldr);
+    lapack::lacpy(MatrixType::General, k, n - k, &A_hat[d * k], d, &R[n * k], ldr);
+
+    // Swap k columns of A with pivots from J
+    blas::copy(n, J, 1, J_buf.data(), 1);
+    util::col_swap(m, n, k, A, lda, J_buf);
+
+    // A_pre * R_sp = AP
+    blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, k, 1.0, R_sp, k, A, lda);
+
+    // Do Cholesky QR
+    blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, k, m, 1.0, A, lda, 0.0, R_sp, k);
+    lapack::potrf(Uplo::Upper, k, R_sp, k);
+
+    // Re-estimate rank after we have the R-factor form Cholesky QR.
+    // The strategy here is the same as in naive rank estimation.
+    // This also automatically takes care of any potentical failures in Cholesky factorization.
+    // Note that the diagonal of R_sp may not be sorted, so we need to keep the running max/min
+    // We expect the loss in the orthogonality of Q to be approximately equal to u * cond(R_sp)^2, where u is the unit roundoff for the numerical type T.
+    new_rank = k;
+    running_max = R_sp[0];
+    running_min = R_sp[0];
+    
+    for(i = 0; i < k; ++i) {
+        curr_entry = std::abs(R_sp[i * k + i]);
+        running_max = std::max(running_max, curr_entry);
+        running_min = std::min(running_min, curr_entry);
+        if(running_max / running_min >= std::sqrt(this->eps / std::numeric_limits<T>::epsilon())) {
+            new_rank = i - 1;
+            break;
+        }
+    }
+
+    blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, new_rank, 1.0, R_sp, k, A, lda);
+    // Get the final R-factor.
+    blas::trmm(Layout::ColMajor, Side::Left, Uplo::Upper, Op::NoTrans, Diag::NonUnit, new_rank, n, 1.0, R_sp, k, R, ldr);
+
+    // Set the rank parameter to the value comuted a posteriori.
+    this->rank = k;
+
+    free(A_hat);
+    free(R_sp);
+    free(tau);
+
+    return 0;
+}
+} // end namespace miniapps
+#endif
diff --git a/miniapps/include/krylov.hh b/miniapps/include/krylov.hh
new file mode 100644
index 0000000..a7ca818
--- /dev/null
+++ b/miniapps/include/krylov.hh
@@ -0,0 +1,36 @@
+#ifndef MINIAPPS_KRYLOV_HH
+#define MINIAPPS_KRYLOV_HH
+
+#include <vector>
+
+#include <spblas/spblas.hpp>
+
+namespace miniapps {
+
+/**
+ * A very simple Krylov Solver interface for convenience.
+ *
+ * @tparam T  the value type
+ */
+template <typename T>
+class Krylov {
+ public:
+  Krylov(T eps, int max_iters) : eps{eps}, max_iters{max_iters} {}
+
+  virtual ~Krylov() {}
+
+  virtual std::tuple<double, int> apply(const spblas::csr_view<T> a,
+                                        const std::vector<T> b,
+                                        std::vector<T> x) = 0;
+
+  T get_eps() { return eps; }
+
+  int get_max_iters() { return max_iters; }
+
+  const T eps;
+  const int max_iters;
+};
+
+}
+
+#endif // MINIAPPS_KRYLOV_HH
diff --git a/miniapps/include/matrix_utils.hh b/miniapps/include/matrix_utils.hh
new file mode 100644
index 0000000..322459d
--- /dev/null
+++ b/miniapps/include/matrix_utils.hh
@@ -0,0 +1,149 @@
+#ifndef SPBLAS_MINIAPPS_MATRIX_UTILS_HH
+#define SPBLAS_MINIAPPS_MATRIX_UTILS_HH
+
+#include <spblas/spblas.hpp>
+
+#include <algorithm>
+#include <cstddef>
+#include <exception>
+#include <vector>
+
+namespace miniapps {
+
+template <typename T> struct matrix_data_entry {
+  std::size_t row;
+  std::size_t col;
+  T value;
+};
+
+template <typename T> struct matrix_data {
+  using nonzero_type = matrix_data_entry<T>;
+
+  std::size_t num_rows;
+  std::size_t num_cols;
+  std::vector<nonzero_type> nonzeros;
+
+  matrix_data(std::vector<std::size_t> rowind, std::vector<std::size_t> colind,
+              std::vector<T> values, spblas::index<std::size_t> shape)
+      : num_rows{shape[0]}, num_cols{shape[1]} {
+    assert(rowind.size() == colind.size() && rowind.size() == values.size());
+    for (std::size_t i = 0; i < values.size(); i++) {
+      nonzeros.emplace_back(rowind[i], colind[i], values[i]);
+    }
+  }
+
+  auto convert_to_coo() {
+    auto nnz = nonzeros.size();
+    std::vector<T> values(nnz);
+    std::vector<std::size_t> rowind(nnz);
+    std::vector<std::size_t> colind(nnz);
+
+    for (std::size_t i = 0; i < nnz; i++) {
+      values[i] = nonzeros[i].value;
+      rowind[i] = nonzeros[i].row;
+      colind[i] = nonzeros[i].col;
+    }
+    return std::tuple(values, rowind, colind, nnz);
+  }
+
+  void sort_row_major() {
+    std::sort(begin(nonzeros), end(nonzeros),
+              [](nonzero_type x, nonzero_type y) {
+                return std::tie(x.row, x.col) < std::tie(y.row, y.col);
+              });
+  }
+
+  void sum_duplicates() {
+    this->sort_row_major();
+    std::vector<nonzero_type> new_nonzeros;
+    if (!nonzeros.empty()) {
+      new_nonzeros.emplace_back(nonzeros.front().row, nonzeros.front().col,
+                                0.0);
+      for (auto entry : nonzeros) {
+        if (entry.row != new_nonzeros.back().row ||
+            entry.col != new_nonzeros.back().col) {
+          new_nonzeros.emplace_back(entry.row, entry.col, 0.0);
+        }
+        new_nonzeros.back().value += entry.value;
+      }
+      nonzeros = std::move(new_nonzeros);
+    }
+  }
+
+  void make_symmetric() {
+    const auto nnz = nonzeros.size();
+    // compute (A + op(A^T)) / 2
+    for (std::size_t i = 0; i < nnz; i++) {
+      nonzeros[i].value /= 2.0;
+      auto entry = nonzeros[i];
+      nonzeros.emplace_back(entry.col, entry.row, entry.value);
+    }
+    // combine duplicates
+    this->sum_duplicates();
+  }
+
+  void make_diag_dominant(T ratio = 1.0) {
+    std::vector<T> norms(num_rows);
+    std::vector<std::int64_t> diag_positions(num_rows, -1);
+    std::int64_t i{};
+    for (auto entry : nonzeros) {
+      if (entry.row == entry.col) {
+        diag_positions[entry.row] = i;
+      } else {
+        norms[entry.row] += std::abs(entry.value);
+      }
+      i++;
+    }
+    for (i = 0; i < num_rows; i++) {
+      if (norms[i] == 0.0) {
+        // make sure empty rows don't make the matrix singular
+        norms[i] = 1.0;
+      }
+      if (diag_positions[i] < 0) {
+        nonzeros.emplace_back(i, i, norms[i] * ratio);
+      } else {
+        auto &diag_value = nonzeros[diag_positions[i]].value;
+        const auto diag_magnitude = std::abs(diag_value);
+        const auto offdiag_magnitude = norms[i];
+        if (diag_magnitude < offdiag_magnitude * ratio) {
+          const auto scaled_value =
+              diag_value * (offdiag_magnitude * ratio / diag_magnitude);
+          if (std::isfinite(scaled_value)) {
+            diag_value = scaled_value;
+          } else {
+            diag_value = offdiag_magnitude * ratio;
+          }
+        }
+      }
+    }
+    this->sort_row_major();
+  }
+};
+
+template <typename I = std::size_t>
+auto convert_rowind_to_rowptr(std::vector<I> rowind, std::size_t nnz,
+                              spblas::index<I> shape) {
+  auto num_rows = shape[0];
+  std::vector<I> rowptr(num_rows + 1, 0);
+  assert(rowind.size() == nnz);
+
+  for (I i = 0; i < nnz; i++) {
+    rowptr[rowind[i]]++;
+  }
+  constexpr auto max = std::numeric_limits<I>::max();
+  std::size_t partial_sum{};
+  for (std::size_t i = 0; i < num_rows + 1; ++i) {
+    auto this_nnz = i < num_rows ? rowptr[i] : 0;
+    rowptr[i] = partial_sum;
+    if (max - partial_sum < this_nnz) {
+      throw std::exception();
+    }
+    partial_sum += this_nnz;
+  }
+
+  return rowptr;
+}
+
+} // namespace miniapps
+
+#endif // SPBLAS_MINIAPPS_MATRIX_UTILS_HH
diff --git a/miniapps/include/miniapps.hh b/miniapps/include/miniapps.hh
new file mode 100644
index 0000000..291c427
--- /dev/null
+++ b/miniapps/include/miniapps.hh
@@ -0,0 +1,19 @@
+#ifndef SPBLAS_MINIAPPS_HH
+#define SPBLAS_MINIAPPS_HH
+
+// misc
+#include "util.hh"
+
+// misc
+#include "matrix_utils.hh"
+
+// randnla
+#include "cqrrpt.hh"
+
+// cg
+#include "cg.hh"
+
+// bicgstab
+#include "bicgstab.hh"
+
+#endif // SPBLAS_MINIAPPS_HH
diff --git a/miniapps/include/rl_blaspp.hh b/miniapps/include/rl_blaspp.hh
new file mode 100644
index 0000000..1ac915e
--- /dev/null
+++ b/miniapps/include/rl_blaspp.hh
@@ -0,0 +1,12 @@
+#ifndef randlapack_blaspp_h
+#define randlapack_blaspp_h
+
+#include <blas.hh>
+
+using Layout = blas::Layout;
+using Op = blas::Op;
+using Side = blas::Side;
+using Diag = blas::Diag;
+using Uplo = blas::Uplo;
+
+#endif
diff --git a/miniapps/include/rl_config.hh.in b/miniapps/include/rl_config.hh.in
new file mode 100644
index 0000000..68e2991
--- /dev/null
+++ b/miniapps/include/rl_config.hh.in
@@ -0,0 +1,11 @@
+#ifndef miniapps_config_h
+#define miniapps_config_h
+
+#define miniapps_VERSION "@miniapps_VERSION@"
+#define miniapps_VERSION_MAJOR @miniapps_VERSION_MAJOR@
+#define miniapps_VERSION_MINOR @miniapps_VERSION_MINOR@
+#define miniapps_VERSION_PATCH @miniapps_VERSION_PATCH@
+
+#CMAKEDEFINE miniapps_HAS_OpenMP
+
+#endif
diff --git a/miniapps/include/rl_lapackpp.hh b/miniapps/include/rl_lapackpp.hh
new file mode 100644
index 0000000..ba62260
--- /dev/null
+++ b/miniapps/include/rl_lapackpp.hh
@@ -0,0 +1,12 @@
+#ifndef randlapack_lapackpp_h
+#define randlapack_lapackpp_h
+
+#define LAPACK_COMPLEX_CPP 1
+
+#include <lapack.hh>
+
+using Job = lapack::Job;
+using MatrixType = lapack::MatrixType;
+using Norm = lapack::Norm;
+
+#endif
diff --git a/miniapps/include/util.hh b/miniapps/include/util.hh
new file mode 100644
index 0000000..0462f13
--- /dev/null
+++ b/miniapps/include/util.hh
@@ -0,0 +1,42 @@
+#ifndef miniapps_util_h
+#define miniapps_util_h
+
+#include "rl_blaspp.hh"
+#include "rl_lapackpp.hh"
+#include <spblas/spblas.hpp>
+
+#include <iostream>
+#include <cmath>
+#include <algorithm>
+#include <vector>
+#include <cstdint>
+
+namespace miniapps::util {
+
+/// A version of the above function to be used on a vector of integers
+template <typename T>
+void col_swap(
+    int64_t m,
+    int64_t n,
+    int64_t k,
+    T* A,
+    int64_t lda,
+    std::vector<int64_t> idx
+) {
+    if(k > n) 
+        throw std::runtime_error("Invalid rank parameter.");
+
+    int64_t i, j; //, l;
+    for (i = 0, j = 0; i < k; ++i) {
+        j = idx[i] - 1;
+        blas::swap(m, &A[i * lda], 1, &A[j * lda], 1);
+
+        // swap idx array elements
+        // Find idx element with value i and assign it to j
+        auto it = std::find(idx.begin() + i, idx.begin() + k, i + 1);
+        idx[it - (idx.begin())] = j + 1;
+    }
+}
+
+} // end namespace util
+#endif
diff --git a/miniapps/test/CMakeLists.txt b/miniapps/test/CMakeLists.txt
new file mode 100644
index 0000000..733220a
--- /dev/null
+++ b/miniapps/test/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(randnla)
+add_subdirectory(solvers)
diff --git a/miniapps/test/randnla/CMakeLists.txt b/miniapps/test/randnla/CMakeLists.txt
new file mode 100644
index 0000000..8696f44
--- /dev/null
+++ b/miniapps/test/randnla/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_executable(miniapps_randnla_tests
+  test_cqrrpt.cc)
+
+include(GoogleTest)
+target_link_libraries(miniapps_randnla_tests miniapps GTest::gtest_main)
+
+gtest_discover_tests(miniapps_randnla_tests)
diff --git a/miniapps/test/randnla/test_cqrrpt.cc b/miniapps/test/randnla/test_cqrrpt.cc
new file mode 100644
index 0000000..cfed8f6
--- /dev/null
+++ b/miniapps/test/randnla/test_cqrrpt.cc
@@ -0,0 +1,147 @@
+#include "miniapps.hh"
+#include "rl_blaspp.hh"
+#include "rl_lapackpp.hh"
+#include <spblas/spblas.hpp>
+
+#include <fstream>
+#include <gtest/gtest.h>
+
+
+class TestCQRRPT : public ::testing::Test
+{
+    protected:
+
+    virtual void SetUp() {};
+
+    virtual void TearDown() {};
+
+    template <typename T>
+    struct CQRRPTTestData {
+        int64_t row;
+        int64_t col;
+        int64_t rank;
+        std::vector<T> A;
+        std::vector<T> R;
+        std::vector<int64_t> J;
+        std::vector<T> A_cpy1;
+        std::vector<T> A_cpy2;
+        std::vector<T> I_ref;
+
+        CQRRPTTestData(int64_t m, int64_t n, int64_t k) :
+        A(m * n, 0.0), 
+        R(n * n, 0.0),
+        J(n, 0),  
+        A_cpy1(m * n, 0.0),
+        A_cpy2(m * n, 0.0),
+        I_ref(k * k, 0.0) {
+            row = m;
+            col = n;
+            rank = k;
+        }
+    };
+
+    template <typename T>
+    static void norm_and_copy_computational_helper(T &norm_A, CQRRPTTestData<T> &all_data) {
+        auto m = all_data.row;
+        auto n = all_data.col;
+
+        lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy1.data(), m);
+        lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy2.data(), m);
+        norm_A = lapack::lange(Norm::Fro, m, n, all_data.A.data(), m);
+    }
+
+    template <typename T>
+    static void
+    error_check(T &norm_A, CQRRPTTestData<T> &all_data) {
+
+        auto m = all_data.row;
+        auto n = all_data.col;
+        auto k = all_data.rank;
+
+        all_data.I_ref.resize(k * k, 0.0);
+        for(int i = 0; i < k; ++i)
+            all_data.I_ref[(k + 1) * i] = 1.0;
+
+        T* A_dat         = all_data.A_cpy1.data();
+        T const* A_cpy_dat = all_data.A_cpy2.data();
+        T const* Q_dat   = all_data.A.data();
+        T const* R_dat   = all_data.R.data();
+        T* I_ref_dat     = all_data.I_ref.data();
+
+        // Check orthogonality of Q
+        // Q' * Q  - I = 0
+        blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, k, m, 1.0, Q_dat, m, -1.0, I_ref_dat, k);
+        T norm_QTQ = lapack::lansy(lapack::Norm::Fro, Uplo::Upper, k, I_ref_dat, k);
+
+        // A - QR
+        blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, n, k, 1.0, Q_dat, m, R_dat, n, -1.0, A_dat, m);
+        
+        // Implementing max col norm metric
+        T max_col_norm = 0.0;
+        T col_norm = 0.0;
+        int max_idx = 0;
+        for(int i = 0; i < n; ++i) {
+            col_norm = blas::nrm2(m, &A_dat[m * i], 1);
+            if(max_col_norm < col_norm) {
+                max_col_norm = col_norm;
+                max_idx = i;
+            }
+        }
+        T col_norm_A = blas::nrm2(n, &A_cpy_dat[m * max_idx], 1);
+        T norm_AQR = lapack::lange(Norm::Fro, m, n, A_dat, m);
+        
+        printf("REL NORM OF AP - QR:    %15e\n", norm_AQR / norm_A);
+        printf("MAX COL NORM METRIC:    %15e\n", max_col_norm / col_norm_A);
+        printf("FRO NORM OF (Q'Q - I):  %2e\n\n", norm_QTQ);
+
+        T atol = std::pow(std::numeric_limits<T>::epsilon(), 0.75);
+        ASSERT_NEAR(norm_AQR / norm_A,         0.0, atol);
+        ASSERT_NEAR(max_col_norm / col_norm_A, 0.0, atol);
+        ASSERT_NEAR(norm_QTQ,                  0.0, atol);
+    }
+
+    /// General test for CQRRPT:
+    /// Computes QR factorzation, and computes A[:, J] - QR.
+    template <typename T, typename alg_type>
+    static void test_CQRRPT_general(
+        T d_factor, 
+        T norm_A,
+        CQRRPTTestData<T> &all_data,
+        alg_type &CQRRPT) {
+
+        auto m = all_data.row;
+        auto n = all_data.col;
+
+        CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor);
+        all_data.rank = CQRRPT.rank;
+
+        printf("RANK AS RETURNED BY CQRRPT %ld\n", all_data.rank);
+
+        miniapps::util::col_swap(m, n, n, all_data.A_cpy1.data(), m, all_data.J);
+        miniapps::util::col_swap(m, n, n, all_data.A_cpy2.data(), m, all_data.J);
+
+        error_check(norm_A, all_data); 
+    }
+};
+
+TEST_F(TestCQRRPT, CQRRPT_full_rank_no_hqrrp) {
+    int64_t m = 50;
+    int64_t n = 20;
+    int64_t k = 20;
+    double d_factor = 1.25;
+    double norm_A = 0;
+    int64_t nnz = 2;
+    double tol = std::pow(std::numeric_limits<double>::epsilon(), 0.85);
+
+    CQRRPTTestData<double> all_data(m, n, k);
+    miniapps::CQRRPT<double> CQRRPT(tol, nnz);
+    // Generate dense matrix
+    auto [buf, a_shape] = spblas::generate_dense<double>(m, n);
+
+    lapack::lacpy(MatrixType::General, m, n, buf.data(), m, all_data.A.data(), m);
+
+    norm_and_copy_computational_helper<double>(norm_A, all_data);
+    test_CQRRPT_general<double, miniapps::CQRRPT<double>>(d_factor, norm_A, all_data, CQRRPT);
+}
+
+
diff --git a/miniapps/test/solvers/CMakeLists.txt b/miniapps/test/solvers/CMakeLists.txt
new file mode 100644
index 0000000..70a99d3
--- /dev/null
+++ b/miniapps/test/solvers/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_executable(miniapps_solvers_tests
+  test_cg.cpp
+  test_bicgstab.cpp)
+
+target_link_libraries(miniapps_solvers_tests miniapps GTest::gtest_main)
+
+include(GoogleTest)
+gtest_discover_tests(miniapps_solvers_tests)
diff --git a/miniapps/test/solvers/test_bicgstab.cpp b/miniapps/test/solvers/test_bicgstab.cpp
new file mode 100644
index 0000000..445dee3
--- /dev/null
+++ b/miniapps/test/solvers/test_bicgstab.cpp
@@ -0,0 +1,72 @@
+#include "miniapps.hh"
+#include <spblas/spblas.hpp>
+
+#include <algorithm>
+#include <exception>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+template <typename T> class TestBiCGSTAB : public testing::Test {
+public:
+  auto generate_problem(std::size_t m, std::size_t n, std::size_t nnz_input,
+                              std::size_t seed = 0) {
+    auto &&[values_orig, rowind_orig, colind_orig, shape, nnz_orig] =
+        spblas::generate_coo<T>(m, n, nnz_input, seed);
+    miniapps::matrix_data<T> data(rowind_orig, colind_orig, values_orig, shape);
+    auto &&[values, rowind, colind, nnz] = data.convert_to_coo();
+    auto rowptr = miniapps::convert_rowind_to_rowptr(colind, nnz, shape);
+    return std::tuple(values, rowptr, colind, shape, nnz);
+  }
+
+  auto generate_spd_problem(std::size_t m, std::size_t n, std::size_t nnz_input,
+                              std::size_t seed = 0) {
+    auto &&[values_orig, rowind_orig, colind_orig, shape, nnz_orig] =
+        spblas::generate_coo<T>(m, n, nnz_input, seed);
+    miniapps::matrix_data<T> data(rowind_orig, colind_orig, values_orig, shape);
+    data.sort_row_major();
+    data.make_symmetric();
+    data.make_diag_dominant();
+    auto &&[values, rowind, colind, nnz] = data.convert_to_coo();
+    auto rowptr = miniapps::convert_rowind_to_rowptr(colind, nnz, shape);
+    return std::tuple(values, rowptr, colind, shape, nnz);
+  }
+};
+
+using BiCGSTABTestTypes = ::testing::Types<double, float>;
+TYPED_TEST_SUITE(TestBiCGSTAB, BiCGSTABTestTypes);
+
+TYPED_TEST(TestBiCGSTAB, ConvergesForSmallSystem) {
+  using T = TypeParam;
+  constexpr double tol = std::is_same<T, double>::value ? 1e-14 : 1e-7;
+  constexpr int max_iters = 100;
+  std::tuple<double, int> res;
+  auto &&[values, rowptr, colind, shape, nnz] =
+      this->generate_problem(10, 10, 42, 75);
+  spblas::csr_view<T> a(values, rowptr, colind, shape, nnz);
+  std::vector<T> b(10, 1.0);
+  std::vector<T> x(10, 0.0);
+
+  miniapps::BiCGSTAB<T> BiCGSTAB(tol, max_iters);
+  auto [error, iters] = BiCGSTAB.apply(a, b, x);
+
+  ASSERT_LE(error, tol);
+  ASSERT_LE(iters, max_iters);
+}
+
+TYPED_TEST(TestBiCGSTAB, ConvergesForLargeSystem) {
+  using T = TypeParam;
+  constexpr double tol = std::is_same<T, double>::value ? 1e-14 : 1e-7;
+  constexpr int max_iters = 100;
+  std::vector<T> b(1000, 1.0);
+  std::vector<T> x(1000, 0.0);
+  auto &&[values, rowptr, colind, shape, nnz] =
+      this->generate_spd_problem(1000, 1000, 12345, 75);
+  spblas::csr_view<T> a(values, rowptr, colind, shape, nnz);
+
+  miniapps::BiCGSTAB<T> BiCGSTAB(tol, max_iters);
+  auto [error, iters] = BiCGSTAB.apply(a, b, x);
+
+  ASSERT_LE(error, tol);
+  ASSERT_LE(iters, max_iters);
+}
diff --git a/miniapps/test/solvers/test_cg.cpp b/miniapps/test/solvers/test_cg.cpp
new file mode 100644
index 0000000..09984c3
--- /dev/null
+++ b/miniapps/test/solvers/test_cg.cpp
@@ -0,0 +1,65 @@
+#include "miniapps.hh"
+#include <spblas/spblas.hpp>
+
+#include <algorithm>
+#include <exception>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+template <typename T> class TestCG : public testing::Test {
+public:
+  using ValueType = T;
+
+  auto generate_spd_problem(std::size_t m, std::size_t n, std::size_t nnz_input,
+                            std::size_t seed = 0) {
+    auto &&[values_orig, rowind_orig, colind_orig, shape, nnz_orig] =
+        spblas::generate_coo<T>(m, n, nnz_input, seed);
+    miniapps::matrix_data<T> data(rowind_orig, colind_orig, values_orig, shape);
+    data.sort_row_major();
+    data.make_symmetric();
+    data.make_diag_dominant();
+    auto &&[values, rowind, colind, nnz] = data.convert_to_coo();
+    auto rowptr = miniapps::convert_rowind_to_rowptr(colind, nnz, shape);
+    return std::tuple(values, rowptr, colind, shape, nnz);
+  }
+};
+
+using CGTestTypes = ::testing::Types<double, float>;
+TYPED_TEST_SUITE(TestCG, CGTestTypes);
+
+TYPED_TEST(TestCG, ConvergesForSmallSystem) {
+  using T = TypeParam;
+  constexpr double tol = std::is_same<T, double>::value ? 1e-14 : 1e-7;
+  constexpr int max_iters = 100;
+  std::vector<T> b(10, 1.0);
+  std::vector<T> x(10, 0.0);
+
+  auto &&[values, rowptr, colind, shape, nnz] =
+      this->generate_spd_problem(10, 10, 42, 75);
+  spblas::csr_view<T> a(values, rowptr, colind, shape, nnz);
+
+  miniapps::CG<T> CG(tol, max_iters);
+  auto [error, iters] = CG.apply(a, b, x);
+
+  ASSERT_LE(error, tol);
+  ASSERT_LE(iters, max_iters);
+}
+
+TYPED_TEST(TestCG, ConvergesForLargeSystem) {
+  using T = TypeParam;
+  constexpr double tol = std::is_same<T, double>::value ? 1e-14 : 1e-7;
+  constexpr int max_iters = 100;
+  std::vector<T> b(1000, 1.0);
+  std::vector<T> x(1000, 0.0);
+
+  auto &&[values, rowptr, colind, shape, nnz] =
+      this->generate_spd_problem(1000, 1000, 12345, 75);
+  spblas::csr_view<T> a(values, rowptr, colind, shape, nnz);
+
+  miniapps::CG<T> CG(tol, max_iters);
+  auto [error, iters] = CG.apply(a, b, x);
+
+  ASSERT_LE(error, tol);
+  ASSERT_LE(iters, max_iters);
+}

From 7aa5beea8c24c343ac9f9de8260ff0604785fde3 Mon Sep 17 00:00:00 2001
From: Terry Cojean <terry.cojean@kit.edu>
Date: Thu, 27 Feb 2025 22:42:35 +0100
Subject: [PATCH 2/5] miniapps: auto compile BLAS++ and LAPACK++

---
 CMakeLists.txt                  |  8 ++++---
 miniapps/CMake/blaspp.cmake     | 14 -------------
 miniapps/CMake/lapackpp.cmake   | 14 -------------
 miniapps/CMake/rl_version.cmake | 27 ------------------------
 miniapps/CMakeLists.txt         | 37 +++++++++++++++++++++++----------
 5 files changed, 31 insertions(+), 69 deletions(-)
 delete mode 100644 miniapps/CMake/blaspp.cmake
 delete mode 100644 miniapps/CMake/lapackpp.cmake
 delete mode 100644 miniapps/CMake/rl_version.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c439e1f..d4d1c0f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,6 +8,8 @@ set(CMAKE_CXX_FLAGS "-O3 -march=native")
 
 option(SPBLAS_BUILD_MINIAPPS "Also build the miniapps. Default: OFF." OFF)
 
+enable_testing()
+
 # Get includes, which declares the `spblas` library
 add_subdirectory(include)
 
@@ -91,8 +93,8 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
 
   add_subdirectory(examples)
   add_subdirectory(test)
-endif()
 
-if (SPBLAS_BUILD_MINIAPPS)
-  add_subdirectory(miniapps)
+  if (SPBLAS_BUILD_MINIAPPS)
+    add_subdirectory(miniapps)
+  endif()
 endif()
diff --git a/miniapps/CMake/blaspp.cmake b/miniapps/CMake/blaspp.cmake
deleted file mode 100644
index 986763a..0000000
--- a/miniapps/CMake/blaspp.cmake
+++ /dev/null
@@ -1,14 +0,0 @@
-message(STATUS "\n\n-- miniapps checking for blaspp ... ")
-find_package(blaspp REQUIRED)
-message(STATUS "miniapps found blaspp ${blaspp_VERSION}\n")
-
-# interface library for use elsewhere in the project
-add_library(miniapps_blaspp INTERFACE)
-
-target_link_libraries(miniapps_blaspp INTERFACE blaspp)
-
-install(TARGETS miniapps_blaspp EXPORT miniapps_blaspp)
-
-install(EXPORT miniapps_blaspp
-    DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake"
-    EXPORT_LINK_INTERFACE_LIBRARIES)
\ No newline at end of file
diff --git a/miniapps/CMake/lapackpp.cmake b/miniapps/CMake/lapackpp.cmake
deleted file mode 100644
index e7d9827..0000000
--- a/miniapps/CMake/lapackpp.cmake
+++ /dev/null
@@ -1,14 +0,0 @@
-message(STATUS "\n\n-- miniapps checking for lapackpp ... ")
-find_package(lapackpp REQUIRED)
-message(STATUS "miniapps found lapackpp ${lapackpp_VERSION}\n")
-
-# interface library for use elsewhere in the project
-add_library(miniapps_lapackpp INTERFACE)
-
-target_link_libraries(miniapps_lapackpp INTERFACE lapackpp)
-
-install(TARGETS miniapps_lapackpp EXPORT miniapps_lapackpp)
-
-install(EXPORT miniapps_lapackpp
-    DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake"
-    EXPORT_LINK_INTERFACE_LIBRARIES)
diff --git a/miniapps/CMake/rl_version.cmake b/miniapps/CMake/rl_version.cmake
deleted file mode 100644
index 4dc2f3c..0000000
--- a/miniapps/CMake/rl_version.cmake
+++ /dev/null
@@ -1,27 +0,0 @@
-set(tmp)
-find_package(Git QUIET)
-if(GIT_FOUND)
-    execute_process(COMMAND ${GIT_EXECUTABLE}
-        --git-dir=${CMAKE_SOURCE_DIR}/.git describe
-        --tags --match "[0-9]*.[0-9]*.[0-9]*"
-        OUTPUT_VARIABLE tmp OUTPUT_STRIP_TRAILING_WHITESPACE
-        ERROR_QUIET)
-endif()
-if(NOT tmp)
-    set(tmp "0.0.0")
-endif()
-
-set(miniapps_VERSION ${tmp} CACHE STRING "miniapps version" FORCE)
-
-string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*$)"
-  "\\1" miniapps_VERSION_MAJOR ${miniapps_VERSION})
-
-string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*$)"
-  "\\2" miniapps_VERSION_MINOR ${miniapps_VERSION})
-
-string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*$)"
-  "\\3" miniapps_VERSION_PATCH ${miniapps_VERSION})
-
-message(STATUS "miniapps_VERSION_MAJOR=${miniapps_VERSION_MAJOR}")
-message(STATUS "miniapps_VERSION_MINOR=${miniapps_VERSION_MINOR}")
-message(STATUS "miniapps_VERSION_PATCH=${miniapps_VERSION_PATCH}")
diff --git a/miniapps/CMakeLists.txt b/miniapps/CMakeLists.txt
index b3cd1b4..e931875 100644
--- a/miniapps/CMakeLists.txt
+++ b/miniapps/CMakeLists.txt
@@ -1,30 +1,45 @@
-cmake_minimum_required(VERSION 3.20)
+cmake_minimum_required(VERSION 3.24)
 
-project(miniapps)
+project(miniapps VERSION 0.1.0)
 
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMake")
 
 set(CMAKE_CXX_STANDARD 23)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
-# configure the build
+option(SPBLAS_MINIAPPS_ENABLE_CQRRPT "Enable the CQRRPT miniapp. Default ON." ON)
+option(SPBLAS_MINIAPPS_ENABLE_SOLVERS "Enable the solvers miniapp. Default ON." ON)
+
 enable_testing()
 
 include(rl_build_options)
-include(rl_version)
 
-# find dependencies
-find_package(lapackpp REQUIRED)
+# Find dependencies
+find_package(blaspp 2024.10.26 QUIET)
+if(NOT blaspp_FOUND)
+  FetchContent_Declare(
+    blaspp
+    GIT_REPOSITORY https://github.com/icl-utk-edu/blaspp
+    GIT_TAG v2024.10.26)
+    FetchContent_MakeAvailable(blaspp)
+endif()
 
-# find dependencies
-find_package(blaspp REQUIRED)
+set(blas "auto")
+set(lapack "auto")
+
+find_package(lapackpp 2024.10.26 QUIET)
+if(NOT lapackpp_FOUND)
+  FetchContent_Declare(
+    lapackpp
+    GIT_REPOSITORY https://github.com/icl-utk-edu/lapackpp
+    GIT_TAG v2024.10.26)
+  FetchContent_MakeAvailable(lapackpp)
+endif()
 
 # compile sources
 add_subdirectory(include)
 
-if (ENABLE_TESTING)
-    add_subdirectory(test)
-endif()
+add_subdirectory(test)
 
 # export the configuration
 include(rl_config)

From d495be029fb93dc61f77a8b3d6275946362ed2c2 Mon Sep 17 00:00:00 2001
From: Benjamin Brock <benjamin.brock@intel.com>
Date: Thu, 6 Mar 2025 09:57:35 -0800
Subject: [PATCH 3/5] Fix formatting for `clang-format`.

---
 miniapps/CMake/rl_build_options.cmake   |   1 -
 miniapps/include/bicgstab.hh            |   3 +-
 miniapps/include/cg.hh                  |   3 +-
 miniapps/include/cqrrpt.hh              | 367 ++++++++++++------------
 miniapps/include/krylov.hh              |  12 +-
 miniapps/include/matrix_utils.hh        |   8 +-
 miniapps/include/util.hh                |  40 ++-
 miniapps/test/randnla/test_cqrrpt.cc    | 249 ++++++++--------
 miniapps/test/solvers/test_bicgstab.cpp |  19 +-
 miniapps/test/solvers/test_cg.cpp       |  11 +-
 10 files changed, 349 insertions(+), 364 deletions(-)

diff --git a/miniapps/CMake/rl_build_options.cmake b/miniapps/CMake/rl_build_options.cmake
index ff412d1..ed85878 100644
--- a/miniapps/CMake/rl_build_options.cmake
+++ b/miniapps/CMake/rl_build_options.cmake
@@ -26,4 +26,3 @@ include(GNUInstallDirs)
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}")
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}")
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_BINDIR}")
-
diff --git a/miniapps/include/bicgstab.hh b/miniapps/include/bicgstab.hh
index 09b11d5..7d6e751 100644
--- a/miniapps/include/bicgstab.hh
+++ b/miniapps/include/bicgstab.hh
@@ -13,7 +13,8 @@
 
 namespace miniapps {
 
-template <typename T> class BiCGSTAB : public Krylov<T> {
+template <typename T>
+class BiCGSTAB : public Krylov<T> {
 public:
   BiCGSTAB(T eps, int max_iters) : Krylov<T>{eps, max_iters} {}
 
diff --git a/miniapps/include/cg.hh b/miniapps/include/cg.hh
index 560ae56..7dbbea1 100644
--- a/miniapps/include/cg.hh
+++ b/miniapps/include/cg.hh
@@ -14,7 +14,8 @@
 
 namespace miniapps {
 
-template <typename T> class CG : public Krylov<T> {
+template <typename T>
+class CG : public Krylov<T> {
 public:
   CG(T eps, int max_iters) : Krylov<T>{eps, max_iters} {}
 
diff --git a/miniapps/include/cqrrpt.hh b/miniapps/include/cqrrpt.hh
index cc1c380..26e3f32 100644
--- a/miniapps/include/cqrrpt.hh
+++ b/miniapps/include/cqrrpt.hh
@@ -1,221 +1,208 @@
 #ifndef miniapps_cqrrpt_h
 #define miniapps_cqrrpt_h
 
-#include <cstdint>
-#include <vector>
 #include <chrono>
+#include <cstdint>
 #include <numeric>
+#include <vector>
 
-#include "util.hh"
 #include "rl_blaspp.hh"
 #include "rl_lapackpp.hh"
+#include "util.hh"
 #include <spblas/spblas.hpp>
 
 namespace miniapps {
 
 template <typename T>
 class CQRRPTalg {
-    public:
-
-        virtual ~CQRRPTalg() {}
-
-        virtual int call(
-            int64_t m,
-            int64_t n,
-            T* A,
-            int64_t lda,
-            T* R,
-            int64_t ldr,
-            int64_t* J,
-            T d_factor
-        ) = 0;
+public:
+  virtual ~CQRRPTalg() {}
+
+  virtual int call(int64_t m, int64_t n, T* A, int64_t lda, T* R, int64_t ldr,
+                   int64_t* J, T d_factor) = 0;
 };
 
 template <typename T>
 class CQRRPT : public CQRRPTalg<T> {
-    public:
-        CQRRPT(T ep, int64_t nz) {
-            eps = ep;
-            nnz = nz;
-        }
-
-        /// Computes a QR factorization with column pivots of the form:
-        ///     A[:, J] = QR,
-        /// where Q and R are of size m-by-k and k-by-n, with rank(A) = k.
-        /// Detailed description of this algorithm may be found in Section 5.1.2.
-        /// of "the RandLAPACK book". 
-        ///
-        /// @param[in] m
-        ///     The number of rows in the matrix A.
-        ///
-        /// @param[in] n
-        ///     The number of columns in the matrix A.
-        ///
-        /// @param[in] A
-        ///     The m-by-n matrix A, stored in a column-major format.
-        ///
-        /// @param[in] d
-        ///     Embedding dimension of a sketch, m >= d >= n.
-        ///
-        /// @param[in] R
-        ///     Represents the upper-triangular R factor of QR factorization.
-        ///     On entry, is empty and may not have any space allocated for it.
-        ///
-        /// @param[out] A
-        ///     Overwritten by an m-by-k orthogonal Q factor.
-        ///     Matrix is stored explicitly.
-        ///
-        /// @param[out] R
-        ///     Stores k-by-n matrix with upper-triangular R factor.
-        ///     Zero entries are not compressed.
-        ///
-        /// @param[out] J
-        ///     Stores k integer type pivot index extries.
-        ///
-        /// @return = 0: successful exit
-        ///
-        /// @return = 1: cholesky factorization failed
-        ///
-
-        int call(
-            int64_t m,
-            int64_t n,
-            T* A,
-            int64_t lda,
-            T* R,
-            int64_t ldr,
-            int64_t* J,
-            T d_factor
-        ) override;
-
-    public:
-        T eps;
-        int64_t nnz;
-        int64_t rank;
+public:
+  CQRRPT(T ep, int64_t nz) {
+    eps = ep;
+    nnz = nz;
+  }
+
+  /// Computes a QR factorization with column pivots of the form:
+  ///     A[:, J] = QR,
+  /// where Q and R are of size m-by-k and k-by-n, with rank(A) = k.
+  /// Detailed description of this algorithm may be found in Section 5.1.2.
+  /// of "the RandLAPACK book".
+  ///
+  /// @param[in] m
+  ///     The number of rows in the matrix A.
+  ///
+  /// @param[in] n
+  ///     The number of columns in the matrix A.
+  ///
+  /// @param[in] A
+  ///     The m-by-n matrix A, stored in a column-major format.
+  ///
+  /// @param[in] d
+  ///     Embedding dimension of a sketch, m >= d >= n.
+  ///
+  /// @param[in] R
+  ///     Represents the upper-triangular R factor of QR factorization.
+  ///     On entry, is empty and may not have any space allocated for it.
+  ///
+  /// @param[out] A
+  ///     Overwritten by an m-by-k orthogonal Q factor.
+  ///     Matrix is stored explicitly.
+  ///
+  /// @param[out] R
+  ///     Stores k-by-n matrix with upper-triangular R factor.
+  ///     Zero entries are not compressed.
+  ///
+  /// @param[out] J
+  ///     Stores k integer type pivot index extries.
+  ///
+  /// @return = 0: successful exit
+  ///
+  /// @return = 1: cholesky factorization failed
+  ///
+
+  int call(int64_t m, int64_t n, T* A, int64_t lda, T* R, int64_t ldr,
+           int64_t* J, T d_factor) override;
+
+public:
+  T eps;
+  int64_t nnz;
+  int64_t rank;
 };
 
 // -----------------------------------------------------------------------------
 template <typename T>
-int CQRRPT<T>::call(
-    int64_t m,
-    int64_t n,
-    T* A,
-    int64_t lda,
-    T* R,
-    int64_t ldr,
-    int64_t* J,
-    T d_factor
-){
-    int i;
-    int64_t k = n;
-    int64_t d = d_factor * n;
-    // A constant for initial rank estimation.
-    T eps_initial_rank_estimation = 2 * std::pow(std::numeric_limits<T>::epsilon(), 0.95);
-    // Variables for a posteriori rank estimation.
-    int64_t new_rank;
-    T running_max, running_min, curr_entry;
-
-    T* A_hat = ( T * ) calloc( d * n, sizeof( T ) );
-    T* tau   = ( T * ) calloc( n, sizeof( T ) );
-    // Buffer for column pivoting.
-    std::vector<int64_t> J_buf(n, 0);
-    
-    /* RandBLAS style
-    /// Generating a SASO
-    RandBLAS::SparseDist DS = {.n_rows = d, .n_cols = m, .vec_nnz = this->nnz};
-    RandBLAS::SparseSkOp<T, RNG> S(DS, state);
-    state = RandBLAS::fill_sparse(S);
-
-    /// Applying a SASO
-    RandBLAS::sketch_general(
-        Layout::ColMajor, Op::NoTrans, Op::NoTrans,
-        d, n, m, 1.0, S, 0, 0, A, lda, 0.0, A_hat, d
-    );
-    */
-
-    /// SparseBLAS style
-    /// Our sketching operators must be in COO format.
-    //auto&& [values, rowptr, colind, shape, _] = spblas::generate_coo<T>(d, m, nnz);
-    /// TODO: add COO view
-    //spblas::coo_view<T> s(values, rowptr, colind, shape, nnz);
-    
-    /// Perform dense sketching for the sake of tests passing.
-    /// This is not the intended behavior for this algorithm, as dense sketching 
-    /// tanks the performance of CQRRPT.
-    auto [buf, a_shape] = spblas::generate_gaussian<double>(d, m);
-    spblas::__mdspan::mdspan s(buf.data(), d, m);
-
-    spblas::__mdspan::mdspan a(A, m, n);
-    spblas::__mdspan::mdspan ahat(A_hat, d, n);
-
-    spblas::multiply(s, a, ahat);
-    A_hat = ahat.data_handle();
-
-    /// Performing QRCP on a sketch
-    lapack::geqp3(d, n, A_hat, d, J, tau);
-
-    /// Using naive rank estimation to ensure that R used for preconditioning is invertible.
-    /// The actual rank estimate k will be computed a posteriori. 
-    /// Using R[i,i] to approximate the i-th singular value of A_hat. 
-    /// Truncate at the largest i where R[i,i] / R[0,0] >= eps.
-    for(i = 0; i < n; ++i) {
-        if(std::abs(A_hat[i * d + i]) / std::abs(A_hat[0]) < eps_initial_rank_estimation) {
-            k = i;
-            break;
-        }
+int CQRRPT<T>::call(int64_t m, int64_t n, T* A, int64_t lda, T* R, int64_t ldr,
+                    int64_t* J, T d_factor) {
+  int i;
+  int64_t k = n;
+  int64_t d = d_factor * n;
+  // A constant for initial rank estimation.
+  T eps_initial_rank_estimation =
+      2 * std::pow(std::numeric_limits<T>::epsilon(), 0.95);
+  // Variables for a posteriori rank estimation.
+  int64_t new_rank;
+  T running_max, running_min, curr_entry;
+
+  T* A_hat = (T*) calloc(d * n, sizeof(T));
+  T* tau = (T*) calloc(n, sizeof(T));
+  // Buffer for column pivoting.
+  std::vector<int64_t> J_buf(n, 0);
+
+  /* RandBLAS style
+  /// Generating a SASO
+  RandBLAS::SparseDist DS = {.n_rows = d, .n_cols = m, .vec_nnz = this->nnz};
+  RandBLAS::SparseSkOp<T, RNG> S(DS, state);
+  state = RandBLAS::fill_sparse(S);
+
+  /// Applying a SASO
+  RandBLAS::sketch_general(
+      Layout::ColMajor, Op::NoTrans, Op::NoTrans,
+      d, n, m, 1.0, S, 0, 0, A, lda, 0.0, A_hat, d
+  );
+  */
+
+  /// SparseBLAS style
+  /// Our sketching operators must be in COO format.
+  // auto&& [values, rowptr, colind, shape, _] = spblas::generate_coo<T>(d, m,
+  // nnz);
+  /// TODO: add COO view
+  // spblas::coo_view<T> s(values, rowptr, colind, shape, nnz);
+
+  /// Perform dense sketching for the sake of tests passing.
+  /// This is not the intended behavior for this algorithm, as dense sketching
+  /// tanks the performance of CQRRPT.
+  auto [buf, a_shape] = spblas::generate_gaussian<double>(d, m);
+  spblas::__mdspan::mdspan s(buf.data(), d, m);
+
+  spblas::__mdspan::mdspan a(A, m, n);
+  spblas::__mdspan::mdspan ahat(A_hat, d, n);
+
+  spblas::multiply(s, a, ahat);
+  A_hat = ahat.data_handle();
+
+  /// Performing QRCP on a sketch
+  lapack::geqp3(d, n, A_hat, d, J, tau);
+
+  /// Using naive rank estimation to ensure that R used for preconditioning is
+  /// invertible. The actual rank estimate k will be computed a posteriori.
+  /// Using R[i,i] to approximate the i-th singular value of A_hat.
+  /// Truncate at the largest i where R[i,i] / R[0,0] >= eps.
+  for (i = 0; i < n; ++i) {
+    if (std::abs(A_hat[i * d + i]) / std::abs(A_hat[0]) <
+        eps_initial_rank_estimation) {
+      k = i;
+      break;
     }
-    this->rank = k;
-
-    // Allocating space for a preconditioner buffer.
-    T* R_sp  = ( T * ) calloc( k * k, sizeof( T ) );
-    /// Extracting a k by k upper-triangular R.
-    lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R_sp, k);
-    /// Extracting a k by n R representation (k by k upper-triangular, rest - general)
-    lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R, ldr);
-    lapack::lacpy(MatrixType::General, k, n - k, &A_hat[d * k], d, &R[n * k], ldr);
-
-    // Swap k columns of A with pivots from J
-    blas::copy(n, J, 1, J_buf.data(), 1);
-    util::col_swap(m, n, k, A, lda, J_buf);
-
-    // A_pre * R_sp = AP
-    blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, k, 1.0, R_sp, k, A, lda);
-
-    // Do Cholesky QR
-    blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, k, m, 1.0, A, lda, 0.0, R_sp, k);
-    lapack::potrf(Uplo::Upper, k, R_sp, k);
-
-    // Re-estimate rank after we have the R-factor form Cholesky QR.
-    // The strategy here is the same as in naive rank estimation.
-    // This also automatically takes care of any potentical failures in Cholesky factorization.
-    // Note that the diagonal of R_sp may not be sorted, so we need to keep the running max/min
-    // We expect the loss in the orthogonality of Q to be approximately equal to u * cond(R_sp)^2, where u is the unit roundoff for the numerical type T.
-    new_rank = k;
-    running_max = R_sp[0];
-    running_min = R_sp[0];
-    
-    for(i = 0; i < k; ++i) {
-        curr_entry = std::abs(R_sp[i * k + i]);
-        running_max = std::max(running_max, curr_entry);
-        running_min = std::min(running_min, curr_entry);
-        if(running_max / running_min >= std::sqrt(this->eps / std::numeric_limits<T>::epsilon())) {
-            new_rank = i - 1;
-            break;
-        }
+  }
+  this->rank = k;
+
+  // Allocating space for a preconditioner buffer.
+  T* R_sp = (T*) calloc(k * k, sizeof(T));
+  /// Extracting a k by k upper-triangular R.
+  lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R_sp, k);
+  /// Extracting a k by n R representation (k by k upper-triangular, rest -
+  /// general)
+  lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R, ldr);
+  lapack::lacpy(MatrixType::General, k, n - k, &A_hat[d * k], d, &R[n * k],
+                ldr);
+
+  // Swap k columns of A with pivots from J
+  blas::copy(n, J, 1, J_buf.data(), 1);
+  util::col_swap(m, n, k, A, lda, J_buf);
+
+  // A_pre * R_sp = AP
+  blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans,
+             Diag::NonUnit, m, k, 1.0, R_sp, k, A, lda);
+
+  // Do Cholesky QR
+  blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, k, m, 1.0, A, lda, 0.0,
+             R_sp, k);
+  lapack::potrf(Uplo::Upper, k, R_sp, k);
+
+  // Re-estimate rank after we have the R-factor form Cholesky QR.
+  // The strategy here is the same as in naive rank estimation.
+  // This also automatically takes care of any potentical failures in Cholesky
+  // factorization. Note that the diagonal of R_sp may not be sorted, so we need
+  // to keep the running max/min We expect the loss in the orthogonality of Q to
+  // be approximately equal to u * cond(R_sp)^2, where u is the unit roundoff
+  // for the numerical type T.
+  new_rank = k;
+  running_max = R_sp[0];
+  running_min = R_sp[0];
+
+  for (i = 0; i < k; ++i) {
+    curr_entry = std::abs(R_sp[i * k + i]);
+    running_max = std::max(running_max, curr_entry);
+    running_min = std::min(running_min, curr_entry);
+    if (running_max / running_min >=
+        std::sqrt(this->eps / std::numeric_limits<T>::epsilon())) {
+      new_rank = i - 1;
+      break;
     }
+  }
 
-    blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, new_rank, 1.0, R_sp, k, A, lda);
-    // Get the final R-factor.
-    blas::trmm(Layout::ColMajor, Side::Left, Uplo::Upper, Op::NoTrans, Diag::NonUnit, new_rank, n, 1.0, R_sp, k, R, ldr);
+  blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans,
+             Diag::NonUnit, m, new_rank, 1.0, R_sp, k, A, lda);
+  // Get the final R-factor.
+  blas::trmm(Layout::ColMajor, Side::Left, Uplo::Upper, Op::NoTrans,
+             Diag::NonUnit, new_rank, n, 1.0, R_sp, k, R, ldr);
 
-    // Set the rank parameter to the value comuted a posteriori.
-    this->rank = k;
+  // Set the rank parameter to the value comuted a posteriori.
+  this->rank = k;
 
-    free(A_hat);
-    free(R_sp);
-    free(tau);
+  free(A_hat);
+  free(R_sp);
+  free(tau);
 
-    return 0;
+  return 0;
 }
 } // end namespace miniapps
 #endif
diff --git a/miniapps/include/krylov.hh b/miniapps/include/krylov.hh
index a7ca818..e24f3a5 100644
--- a/miniapps/include/krylov.hh
+++ b/miniapps/include/krylov.hh
@@ -14,7 +14,7 @@ namespace miniapps {
  */
 template <typename T>
 class Krylov {
- public:
+public:
   Krylov(T eps, int max_iters) : eps{eps}, max_iters{max_iters} {}
 
   virtual ~Krylov() {}
@@ -23,14 +23,18 @@ class Krylov {
                                         const std::vector<T> b,
                                         std::vector<T> x) = 0;
 
-  T get_eps() { return eps; }
+  T get_eps() {
+    return eps;
+  }
 
-  int get_max_iters() { return max_iters; }
+  int get_max_iters() {
+    return max_iters;
+  }
 
   const T eps;
   const int max_iters;
 };
 
-}
+} // namespace miniapps
 
 #endif // MINIAPPS_KRYLOV_HH
diff --git a/miniapps/include/matrix_utils.hh b/miniapps/include/matrix_utils.hh
index 322459d..ebb249c 100644
--- a/miniapps/include/matrix_utils.hh
+++ b/miniapps/include/matrix_utils.hh
@@ -10,13 +10,15 @@
 
 namespace miniapps {
 
-template <typename T> struct matrix_data_entry {
+template <typename T>
+struct matrix_data_entry {
   std::size_t row;
   std::size_t col;
   T value;
 };
 
-template <typename T> struct matrix_data {
+template <typename T>
+struct matrix_data {
   using nonzero_type = matrix_data_entry<T>;
 
   std::size_t num_rows;
@@ -102,7 +104,7 @@ template <typename T> struct matrix_data {
       if (diag_positions[i] < 0) {
         nonzeros.emplace_back(i, i, norms[i] * ratio);
       } else {
-        auto &diag_value = nonzeros[diag_positions[i]].value;
+        auto& diag_value = nonzeros[diag_positions[i]].value;
         const auto diag_magnitude = std::abs(diag_value);
         const auto offdiag_magnitude = norms[i];
         if (diag_magnitude < offdiag_magnitude * ratio) {
diff --git a/miniapps/include/util.hh b/miniapps/include/util.hh
index 0462f13..0066136 100644
--- a/miniapps/include/util.hh
+++ b/miniapps/include/util.hh
@@ -5,38 +5,32 @@
 #include "rl_lapackpp.hh"
 #include <spblas/spblas.hpp>
 
-#include <iostream>
-#include <cmath>
 #include <algorithm>
-#include <vector>
+#include <cmath>
 #include <cstdint>
+#include <iostream>
+#include <vector>
 
 namespace miniapps::util {
 
 /// A version of the above function to be used on a vector of integers
 template <typename T>
-void col_swap(
-    int64_t m,
-    int64_t n,
-    int64_t k,
-    T* A,
-    int64_t lda,
-    std::vector<int64_t> idx
-) {
-    if(k > n) 
-        throw std::runtime_error("Invalid rank parameter.");
+void col_swap(int64_t m, int64_t n, int64_t k, T* A, int64_t lda,
+              std::vector<int64_t> idx) {
+  if (k > n)
+    throw std::runtime_error("Invalid rank parameter.");
 
-    int64_t i, j; //, l;
-    for (i = 0, j = 0; i < k; ++i) {
-        j = idx[i] - 1;
-        blas::swap(m, &A[i * lda], 1, &A[j * lda], 1);
+  int64_t i, j; //, l;
+  for (i = 0, j = 0; i < k; ++i) {
+    j = idx[i] - 1;
+    blas::swap(m, &A[i * lda], 1, &A[j * lda], 1);
 
-        // swap idx array elements
-        // Find idx element with value i and assign it to j
-        auto it = std::find(idx.begin() + i, idx.begin() + k, i + 1);
-        idx[it - (idx.begin())] = j + 1;
-    }
+    // swap idx array elements
+    // Find idx element with value i and assign it to j
+    auto it = std::find(idx.begin() + i, idx.begin() + k, i + 1);
+    idx[it - (idx.begin())] = j + 1;
+  }
 }
 
-} // end namespace util
+} // namespace miniapps::util
 #endif
diff --git a/miniapps/test/randnla/test_cqrrpt.cc b/miniapps/test/randnla/test_cqrrpt.cc
index cfed8f6..676a256 100644
--- a/miniapps/test/randnla/test_cqrrpt.cc
+++ b/miniapps/test/randnla/test_cqrrpt.cc
@@ -6,142 +6,137 @@
 #include <fstream>
 #include <gtest/gtest.h>
 
-
-class TestCQRRPT : public ::testing::Test
-{
-    protected:
-
-    virtual void SetUp() {};
-
-    virtual void TearDown() {};
-
-    template <typename T>
-    struct CQRRPTTestData {
-        int64_t row;
-        int64_t col;
-        int64_t rank;
-        std::vector<T> A;
-        std::vector<T> R;
-        std::vector<int64_t> J;
-        std::vector<T> A_cpy1;
-        std::vector<T> A_cpy2;
-        std::vector<T> I_ref;
-
-        CQRRPTTestData(int64_t m, int64_t n, int64_t k) :
-        A(m * n, 0.0), 
-        R(n * n, 0.0),
-        J(n, 0),  
-        A_cpy1(m * n, 0.0),
-        A_cpy2(m * n, 0.0),
-        I_ref(k * k, 0.0) {
-            row = m;
-            col = n;
-            rank = k;
-        }
-    };
-
-    template <typename T>
-    static void norm_and_copy_computational_helper(T &norm_A, CQRRPTTestData<T> &all_data) {
-        auto m = all_data.row;
-        auto n = all_data.col;
-
-        lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy1.data(), m);
-        lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy2.data(), m);
-        norm_A = lapack::lange(Norm::Fro, m, n, all_data.A.data(), m);
+class TestCQRRPT : public ::testing::Test {
+protected:
+  virtual void SetUp(){};
+
+  virtual void TearDown(){};
+
+  template <typename T>
+  struct CQRRPTTestData {
+    int64_t row;
+    int64_t col;
+    int64_t rank;
+    std::vector<T> A;
+    std::vector<T> R;
+    std::vector<int64_t> J;
+    std::vector<T> A_cpy1;
+    std::vector<T> A_cpy2;
+    std::vector<T> I_ref;
+
+    CQRRPTTestData(int64_t m, int64_t n, int64_t k)
+        : A(m * n, 0.0), R(n * n, 0.0), J(n, 0), A_cpy1(m * n, 0.0),
+          A_cpy2(m * n, 0.0), I_ref(k * k, 0.0) {
+      row = m;
+      col = n;
+      rank = k;
     }
-
-    template <typename T>
-    static void
-    error_check(T &norm_A, CQRRPTTestData<T> &all_data) {
-
-        auto m = all_data.row;
-        auto n = all_data.col;
-        auto k = all_data.rank;
-
-        all_data.I_ref.resize(k * k, 0.0);
-        for(int i = 0; i < k; ++i)
-            all_data.I_ref[(k + 1) * i] = 1.0;
-
-        T* A_dat         = all_data.A_cpy1.data();
-        T const* A_cpy_dat = all_data.A_cpy2.data();
-        T const* Q_dat   = all_data.A.data();
-        T const* R_dat   = all_data.R.data();
-        T* I_ref_dat     = all_data.I_ref.data();
-
-        // Check orthogonality of Q
-        // Q' * Q  - I = 0
-        blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, k, m, 1.0, Q_dat, m, -1.0, I_ref_dat, k);
-        T norm_QTQ = lapack::lansy(lapack::Norm::Fro, Uplo::Upper, k, I_ref_dat, k);
-
-        // A - QR
-        blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, n, k, 1.0, Q_dat, m, R_dat, n, -1.0, A_dat, m);
-        
-        // Implementing max col norm metric
-        T max_col_norm = 0.0;
-        T col_norm = 0.0;
-        int max_idx = 0;
-        for(int i = 0; i < n; ++i) {
-            col_norm = blas::nrm2(m, &A_dat[m * i], 1);
-            if(max_col_norm < col_norm) {
-                max_col_norm = col_norm;
-                max_idx = i;
-            }
-        }
-        T col_norm_A = blas::nrm2(n, &A_cpy_dat[m * max_idx], 1);
-        T norm_AQR = lapack::lange(Norm::Fro, m, n, A_dat, m);
-        
-        printf("REL NORM OF AP - QR:    %15e\n", norm_AQR / norm_A);
-        printf("MAX COL NORM METRIC:    %15e\n", max_col_norm / col_norm_A);
-        printf("FRO NORM OF (Q'Q - I):  %2e\n\n", norm_QTQ);
-
-        T atol = std::pow(std::numeric_limits<T>::epsilon(), 0.75);
-        ASSERT_NEAR(norm_AQR / norm_A,         0.0, atol);
-        ASSERT_NEAR(max_col_norm / col_norm_A, 0.0, atol);
-        ASSERT_NEAR(norm_QTQ,                  0.0, atol);
+  };
+
+  template <typename T>
+  static void norm_and_copy_computational_helper(T& norm_A,
+                                                 CQRRPTTestData<T>& all_data) {
+    auto m = all_data.row;
+    auto n = all_data.col;
+
+    lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m,
+                  all_data.A_cpy1.data(), m);
+    lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m,
+                  all_data.A_cpy2.data(), m);
+    norm_A = lapack::lange(Norm::Fro, m, n, all_data.A.data(), m);
+  }
+
+  template <typename T>
+  static void error_check(T& norm_A, CQRRPTTestData<T>& all_data) {
+
+    auto m = all_data.row;
+    auto n = all_data.col;
+    auto k = all_data.rank;
+
+    all_data.I_ref.resize(k * k, 0.0);
+    for (int i = 0; i < k; ++i)
+      all_data.I_ref[(k + 1) * i] = 1.0;
+
+    T* A_dat = all_data.A_cpy1.data();
+    T const* A_cpy_dat = all_data.A_cpy2.data();
+    T const* Q_dat = all_data.A.data();
+    T const* R_dat = all_data.R.data();
+    T* I_ref_dat = all_data.I_ref.data();
+
+    // Check orthogonality of Q
+    // Q' * Q  - I = 0
+    blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, k, m, 1.0, Q_dat, m,
+               -1.0, I_ref_dat, k);
+    T norm_QTQ = lapack::lansy(lapack::Norm::Fro, Uplo::Upper, k, I_ref_dat, k);
+
+    // A - QR
+    blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, n, k, 1.0, Q_dat,
+               m, R_dat, n, -1.0, A_dat, m);
+
+    // Implementing max col norm metric
+    T max_col_norm = 0.0;
+    T col_norm = 0.0;
+    int max_idx = 0;
+    for (int i = 0; i < n; ++i) {
+      col_norm = blas::nrm2(m, &A_dat[m * i], 1);
+      if (max_col_norm < col_norm) {
+        max_col_norm = col_norm;
+        max_idx = i;
+      }
     }
+    T col_norm_A = blas::nrm2(n, &A_cpy_dat[m * max_idx], 1);
+    T norm_AQR = lapack::lange(Norm::Fro, m, n, A_dat, m);
 
-    /// General test for CQRRPT:
-    /// Computes QR factorzation, and computes A[:, J] - QR.
-    template <typename T, typename alg_type>
-    static void test_CQRRPT_general(
-        T d_factor, 
-        T norm_A,
-        CQRRPTTestData<T> &all_data,
-        alg_type &CQRRPT) {
+    printf("REL NORM OF AP - QR:    %15e\n", norm_AQR / norm_A);
+    printf("MAX COL NORM METRIC:    %15e\n", max_col_norm / col_norm_A);
+    printf("FRO NORM OF (Q'Q - I):  %2e\n\n", norm_QTQ);
 
-        auto m = all_data.row;
-        auto n = all_data.col;
+    T atol = std::pow(std::numeric_limits<T>::epsilon(), 0.75);
+    ASSERT_NEAR(norm_AQR / norm_A, 0.0, atol);
+    ASSERT_NEAR(max_col_norm / col_norm_A, 0.0, atol);
+    ASSERT_NEAR(norm_QTQ, 0.0, atol);
+  }
 
-        CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor);
-        all_data.rank = CQRRPT.rank;
+  /// General test for CQRRPT:
+  /// Computes QR factorzation, and computes A[:, J] - QR.
+  template <typename T, typename alg_type>
+  static void test_CQRRPT_general(T d_factor, T norm_A,
+                                  CQRRPTTestData<T>& all_data,
+                                  alg_type& CQRRPT) {
 
-        printf("RANK AS RETURNED BY CQRRPT %ld\n", all_data.rank);
+    auto m = all_data.row;
+    auto n = all_data.col;
 
-        miniapps::util::col_swap(m, n, n, all_data.A_cpy1.data(), m, all_data.J);
-        miniapps::util::col_swap(m, n, n, all_data.A_cpy2.data(), m, all_data.J);
+    CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n,
+                all_data.J.data(), d_factor);
+    all_data.rank = CQRRPT.rank;
 
-        error_check(norm_A, all_data); 
-    }
+    printf("RANK AS RETURNED BY CQRRPT %ld\n", all_data.rank);
+
+    miniapps::util::col_swap(m, n, n, all_data.A_cpy1.data(), m, all_data.J);
+    miniapps::util::col_swap(m, n, n, all_data.A_cpy2.data(), m, all_data.J);
+
+    error_check(norm_A, all_data);
+  }
 };
 
 TEST_F(TestCQRRPT, CQRRPT_full_rank_no_hqrrp) {
-    int64_t m = 50;
-    int64_t n = 20;
-    int64_t k = 20;
-    double d_factor = 1.25;
-    double norm_A = 0;
-    int64_t nnz = 2;
-    double tol = std::pow(std::numeric_limits<double>::epsilon(), 0.85);
-
-    CQRRPTTestData<double> all_data(m, n, k);
-    miniapps::CQRRPT<double> CQRRPT(tol, nnz);
-    // Generate dense matrix
-    auto [buf, a_shape] = spblas::generate_dense<double>(m, n);
-
-    lapack::lacpy(MatrixType::General, m, n, buf.data(), m, all_data.A.data(), m);
-
-    norm_and_copy_computational_helper<double>(norm_A, all_data);
-    test_CQRRPT_general<double, miniapps::CQRRPT<double>>(d_factor, norm_A, all_data, CQRRPT);
+  int64_t m = 50;
+  int64_t n = 20;
+  int64_t k = 20;
+  double d_factor = 1.25;
+  double norm_A = 0;
+  int64_t nnz = 2;
+  double tol = std::pow(std::numeric_limits<double>::epsilon(), 0.85);
+
+  CQRRPTTestData<double> all_data(m, n, k);
+  miniapps::CQRRPT<double> CQRRPT(tol, nnz);
+  // Generate dense matrix
+  auto [buf, a_shape] = spblas::generate_dense<double>(m, n);
+
+  lapack::lacpy(MatrixType::General, m, n, buf.data(), m, all_data.A.data(), m);
+
+  norm_and_copy_computational_helper<double>(norm_A, all_data);
+  test_CQRRPT_general<double, miniapps::CQRRPT<double>>(d_factor, norm_A,
+                                                        all_data, CQRRPT);
 }
-
-
diff --git a/miniapps/test/solvers/test_bicgstab.cpp b/miniapps/test/solvers/test_bicgstab.cpp
index 445dee3..999702c 100644
--- a/miniapps/test/solvers/test_bicgstab.cpp
+++ b/miniapps/test/solvers/test_bicgstab.cpp
@@ -7,27 +7,28 @@
 
 #include <gtest/gtest.h>
 
-template <typename T> class TestBiCGSTAB : public testing::Test {
+template <typename T>
+class TestBiCGSTAB : public testing::Test {
 public:
   auto generate_problem(std::size_t m, std::size_t n, std::size_t nnz_input,
-                              std::size_t seed = 0) {
-    auto &&[values_orig, rowind_orig, colind_orig, shape, nnz_orig] =
+                        std::size_t seed = 0) {
+    auto&& [values_orig, rowind_orig, colind_orig, shape, nnz_orig] =
         spblas::generate_coo<T>(m, n, nnz_input, seed);
     miniapps::matrix_data<T> data(rowind_orig, colind_orig, values_orig, shape);
-    auto &&[values, rowind, colind, nnz] = data.convert_to_coo();
+    auto&& [values, rowind, colind, nnz] = data.convert_to_coo();
     auto rowptr = miniapps::convert_rowind_to_rowptr(colind, nnz, shape);
     return std::tuple(values, rowptr, colind, shape, nnz);
   }
 
   auto generate_spd_problem(std::size_t m, std::size_t n, std::size_t nnz_input,
-                              std::size_t seed = 0) {
-    auto &&[values_orig, rowind_orig, colind_orig, shape, nnz_orig] =
+                            std::size_t seed = 0) {
+    auto&& [values_orig, rowind_orig, colind_orig, shape, nnz_orig] =
         spblas::generate_coo<T>(m, n, nnz_input, seed);
     miniapps::matrix_data<T> data(rowind_orig, colind_orig, values_orig, shape);
     data.sort_row_major();
     data.make_symmetric();
     data.make_diag_dominant();
-    auto &&[values, rowind, colind, nnz] = data.convert_to_coo();
+    auto&& [values, rowind, colind, nnz] = data.convert_to_coo();
     auto rowptr = miniapps::convert_rowind_to_rowptr(colind, nnz, shape);
     return std::tuple(values, rowptr, colind, shape, nnz);
   }
@@ -41,7 +42,7 @@ TYPED_TEST(TestBiCGSTAB, ConvergesForSmallSystem) {
   constexpr double tol = std::is_same<T, double>::value ? 1e-14 : 1e-7;
   constexpr int max_iters = 100;
   std::tuple<double, int> res;
-  auto &&[values, rowptr, colind, shape, nnz] =
+  auto&& [values, rowptr, colind, shape, nnz] =
       this->generate_problem(10, 10, 42, 75);
   spblas::csr_view<T> a(values, rowptr, colind, shape, nnz);
   std::vector<T> b(10, 1.0);
@@ -60,7 +61,7 @@ TYPED_TEST(TestBiCGSTAB, ConvergesForLargeSystem) {
   constexpr int max_iters = 100;
   std::vector<T> b(1000, 1.0);
   std::vector<T> x(1000, 0.0);
-  auto &&[values, rowptr, colind, shape, nnz] =
+  auto&& [values, rowptr, colind, shape, nnz] =
       this->generate_spd_problem(1000, 1000, 12345, 75);
   spblas::csr_view<T> a(values, rowptr, colind, shape, nnz);
 
diff --git a/miniapps/test/solvers/test_cg.cpp b/miniapps/test/solvers/test_cg.cpp
index 09984c3..780c514 100644
--- a/miniapps/test/solvers/test_cg.cpp
+++ b/miniapps/test/solvers/test_cg.cpp
@@ -7,19 +7,20 @@
 
 #include <gtest/gtest.h>
 
-template <typename T> class TestCG : public testing::Test {
+template <typename T>
+class TestCG : public testing::Test {
 public:
   using ValueType = T;
 
   auto generate_spd_problem(std::size_t m, std::size_t n, std::size_t nnz_input,
                             std::size_t seed = 0) {
-    auto &&[values_orig, rowind_orig, colind_orig, shape, nnz_orig] =
+    auto&& [values_orig, rowind_orig, colind_orig, shape, nnz_orig] =
         spblas::generate_coo<T>(m, n, nnz_input, seed);
     miniapps::matrix_data<T> data(rowind_orig, colind_orig, values_orig, shape);
     data.sort_row_major();
     data.make_symmetric();
     data.make_diag_dominant();
-    auto &&[values, rowind, colind, nnz] = data.convert_to_coo();
+    auto&& [values, rowind, colind, nnz] = data.convert_to_coo();
     auto rowptr = miniapps::convert_rowind_to_rowptr(colind, nnz, shape);
     return std::tuple(values, rowptr, colind, shape, nnz);
   }
@@ -35,7 +36,7 @@ TYPED_TEST(TestCG, ConvergesForSmallSystem) {
   std::vector<T> b(10, 1.0);
   std::vector<T> x(10, 0.0);
 
-  auto &&[values, rowptr, colind, shape, nnz] =
+  auto&& [values, rowptr, colind, shape, nnz] =
       this->generate_spd_problem(10, 10, 42, 75);
   spblas::csr_view<T> a(values, rowptr, colind, shape, nnz);
 
@@ -53,7 +54,7 @@ TYPED_TEST(TestCG, ConvergesForLargeSystem) {
   std::vector<T> b(1000, 1.0);
   std::vector<T> x(1000, 0.0);
 
-  auto &&[values, rowptr, colind, shape, nnz] =
+  auto&& [values, rowptr, colind, shape, nnz] =
       this->generate_spd_problem(1000, 1000, 12345, 75);
   spblas::csr_view<T> a(values, rowptr, colind, shape, nnz);
 

From 46c7ab9c28da42dfbdc19ae0b79aaf1a0352d8ad Mon Sep 17 00:00:00 2001
From: Benjamin Brock <benjamin.brock@intel.com>
Date: Thu, 6 Mar 2025 10:59:42 -0800
Subject: [PATCH 4/5] `SPBLAS_BUILD_MINIAPPS` -> `ENABLE_MINIAPPS`, remove
 `enable_testing()`, since we're using GTest, not CTest.

---
 CMakeLists.txt | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d4d1c0f..0644cae 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,9 +6,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
 set(CMAKE_CXX_FLAGS "-O3 -march=native")
 
-option(SPBLAS_BUILD_MINIAPPS "Also build the miniapps. Default: OFF." OFF)
-
-enable_testing()
+option(ENABLE_MINIAPPS "Also build the miniapps. Default: OFF." OFF)
 
 # Get includes, which declares the `spblas` library
 add_subdirectory(include)
@@ -94,7 +92,7 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
   add_subdirectory(examples)
   add_subdirectory(test)
 
-  if (SPBLAS_BUILD_MINIAPPS)
+  if (ENABLE_MINIAPPS)
     add_subdirectory(miniapps)
   endif()
 endif()

From 023f18e50f85834bd216aaced3ab59556bced806 Mon Sep 17 00:00:00 2001
From: Benjamin Brock <brock@cs.berkeley.edu>
Date: Thu, 6 Mar 2025 13:03:04 -0800
Subject: [PATCH 5/5] Updates to work with MKL backend.

---
 .../spblas/vendor/onemkl_sycl/algorithms.hpp  |  1 +
 include/spblas/vendor/onemkl_sycl/gemm.hpp    | 51 +++++++++++++++++++
 miniapps/include/matrix_utils.hh              | 16 +++---
 miniapps/test/solvers/test_cg.cpp             |  2 +-
 4 files changed, 61 insertions(+), 9 deletions(-)
 create mode 100644 include/spblas/vendor/onemkl_sycl/gemm.hpp

diff --git a/include/spblas/vendor/onemkl_sycl/algorithms.hpp b/include/spblas/vendor/onemkl_sycl/algorithms.hpp
index 9f582a8..cafcbcf 100644
--- a/include/spblas/vendor/onemkl_sycl/algorithms.hpp
+++ b/include/spblas/vendor/onemkl_sycl/algorithms.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include "gemm.hpp"
 #include "spgemm_impl.hpp"
 #include "spmm_impl.hpp"
 #include "spmv_impl.hpp"
diff --git a/include/spblas/vendor/onemkl_sycl/gemm.hpp b/include/spblas/vendor/onemkl_sycl/gemm.hpp
new file mode 100644
index 0000000..e949059
--- /dev/null
+++ b/include/spblas/vendor/onemkl_sycl/gemm.hpp
@@ -0,0 +1,51 @@
+#pragma once
+
+#include <oneapi/mkl.hpp>
+
+#include <spblas/detail/ranges.hpp>
+#include <spblas/detail/view_inspectors.hpp>
+
+namespace spblas {
+
+template <matrix A, matrix B, matrix C>
+  requires __detail::has_mdspan_matrix_base<A> &&
+           __detail::has_mdspan_matrix_base<B> &&
+           __detail::is_matrix_instantiation_of_mdspan_v<C>
+void multiply(A&& a, B&& b, C&& c) {
+  auto a_base = __detail::get_ultimate_base(a);
+  auto b_base = __detail::get_ultimate_base(a);
+
+  auto alpha_optional = __detail::get_scaling_factor(a, b);
+  tensor_scalar_t<A> alpha = alpha_optional.value_or(1);
+
+  sycl::queue q(sycl::cpu_selector_v);
+
+  /*
+  namespace oneapi::mkl::blas::row_major {
+    sycl::event gemm(sycl::queue &queue,
+                     onemkl::transpose transa,
+                     onemkl::transpose transb,
+                     std::int64_t m,
+                     std::int64_t n,
+                     std::int64_t k,
+                     Ts alpha,
+                     const Ta *a,
+                     std::int64_t lda,
+                     const Tb *b,
+                     std::int64_t ldb,
+                     Ts beta,
+                     Tc *c,
+                     std::int64_t ldc,
+                     const std::vector<sycl::event> &dependencies = {})
+                     }
+*/
+
+  oneapi::mkl::blas::row_major::gemm(
+      q, oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans,
+      __backend::shape(a)[0], __backend::shape(c)[1], __backend::shape(a)[1],
+      alpha, a_base.data_handle(), __backend::shape(a)[1], b_base.data_handle(),
+      __backend::shape(b)[1], 0, c.data_handle(), __backend::shape(c)[1])
+      .wait();
+}
+
+} // namespace spblas
diff --git a/miniapps/include/matrix_utils.hh b/miniapps/include/matrix_utils.hh
index ebb249c..50adaa2 100644
--- a/miniapps/include/matrix_utils.hh
+++ b/miniapps/include/matrix_utils.hh
@@ -17,17 +17,17 @@ struct matrix_data_entry {
   T value;
 };
 
-template <typename T>
+template <typename T, typename I = spblas::index_t>
 struct matrix_data {
   using nonzero_type = matrix_data_entry<T>;
 
-  std::size_t num_rows;
-  std::size_t num_cols;
+  I num_rows;
+  I num_cols;
   std::vector<nonzero_type> nonzeros;
 
-  matrix_data(std::vector<std::size_t> rowind, std::vector<std::size_t> colind,
-              std::vector<T> values, spblas::index<std::size_t> shape)
-      : num_rows{shape[0]}, num_cols{shape[1]} {
+  matrix_data(std::vector<I> rowind, std::vector<I> colind,
+              std::vector<T> values, spblas::index<I> shape)
+      : num_rows(shape[0]), num_cols(shape[1]) {
     assert(rowind.size() == colind.size() && rowind.size() == values.size());
     for (std::size_t i = 0; i < values.size(); i++) {
       nonzeros.emplace_back(rowind[i], colind[i], values[i]);
@@ -37,8 +37,8 @@ struct matrix_data {
   auto convert_to_coo() {
     auto nnz = nonzeros.size();
     std::vector<T> values(nnz);
-    std::vector<std::size_t> rowind(nnz);
-    std::vector<std::size_t> colind(nnz);
+    std::vector<I> rowind(nnz);
+    std::vector<I> colind(nnz);
 
     for (std::size_t i = 0; i < nnz; i++) {
       values[i] = nonzeros[i].value;
diff --git a/miniapps/test/solvers/test_cg.cpp b/miniapps/test/solvers/test_cg.cpp
index 780c514..0e159ca 100644
--- a/miniapps/test/solvers/test_cg.cpp
+++ b/miniapps/test/solvers/test_cg.cpp
@@ -15,7 +15,7 @@ class TestCG : public testing::Test {
   auto generate_spd_problem(std::size_t m, std::size_t n, std::size_t nnz_input,
                             std::size_t seed = 0) {
     auto&& [values_orig, rowind_orig, colind_orig, shape, nnz_orig] =
-        spblas::generate_coo<T>(m, n, nnz_input, seed);
+        spblas::generate_coo<T, spblas::index_t>(m, n, nnz_input, seed);
     miniapps::matrix_data<T> data(rowind_orig, colind_orig, values_orig, shape);
     data.sort_row_major();
     data.make_symmetric();