From 26884c2d3fe7c29f7c4a2902a5d7d5a36fc8bd95 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 3 Apr 2025 00:59:06 +0200 Subject: [PATCH 1/6] add mkl_allocator --- .../vendor/onemkl_sycl/mkl_allocator.hpp | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 include/spblas/vendor/onemkl_sycl/mkl_allocator.hpp diff --git a/include/spblas/vendor/onemkl_sycl/mkl_allocator.hpp b/include/spblas/vendor/onemkl_sycl/mkl_allocator.hpp new file mode 100644 index 0000000..116401e --- /dev/null +++ b/include/spblas/vendor/onemkl_sycl/mkl_allocator.hpp @@ -0,0 +1,69 @@ +#pragma once + +#include + +namespace spblas { +namespace mkl { + +template +class mkl_allocator { +public: + using value_type = T; + using pointer = T*; + using const_pointer = const T*; + using reference = T&; + using const_reference = const T&; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + + mkl_allocator() noexcept { + auto* queue = new sycl::queue{sycl::default_selector_v}; + queue_manager_ = + std::move(std::shared_ptr{queue, [](sycl::queue* q) { + q->wait_and_throw(); + delete q; + }}); + } + + mkl_allocator(sycl::queue* q) noexcept + : queue_manager_(q, [](sycl::queue* q) {}) {} + + template + mkl_allocator(const mkl_allocator& other) noexcept + : queue_manager_(other.queue_) {} + + mkl_allocator(const mkl_allocator&) = default; + mkl_allocator& operator=(const mkl_allocator&) = default; + ~mkl_allocator() = default; + + using is_always_equal = std::false_type; + + pointer allocate(std::size_t size) { + return sycl::malloc_device(size, *(this->queue())); + } + + void deallocate(pointer ptr, std::size_t n = 0) { + if (ptr != nullptr) { + sycl::free(ptr, *(this->queue())); + } + } + + bool operator==(const mkl_allocator&) const = default; + bool operator!=(const mkl_allocator&) const = default; + + template + struct rebind { + using other = mkl_allocator; + }; + + sycl::queue* queue() const noexcept { + return queue_manager_.get(); + } + +private: + // using shared_ptr to support copy constructor + std::shared_ptr queue_manager_; +}; + +} // namespace mkl +} // namespace spblas From 90cd04963479d2911381e939f3dd19f9e2d94ddd Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 3 Apr 2025 00:59:56 +0200 Subject: [PATCH 2/6] use allocator in oneMKL spmv. It still keeps no state input --- .github/workflows/ci.yml | 2 +- .../spblas/vendor/onemkl_sycl/spmv_impl.hpp | 36 ++++++++++++++++--- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7f1a6dc..20cc46d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -69,7 +69,7 @@ jobs: - name: Test run: | source /opt/intel/oneapi/setvars.sh - ./build/test/gtest/spblas-tests + ONEMKL_DEVICE_SELECTOR=*:cpu ./build/test/gtest/spblas-tests macos: runs-on: 'macos-latest' diff --git a/include/spblas/vendor/onemkl_sycl/spmv_impl.hpp b/include/spblas/vendor/onemkl_sycl/spmv_impl.hpp index bd438e3..e4cf08f 100644 --- a/include/spblas/vendor/onemkl_sycl/spmv_impl.hpp +++ b/include/spblas/vendor/onemkl_sycl/spmv_impl.hpp @@ -2,6 +2,7 @@ #include +#include "mkl_allocator.hpp" #include #include #include @@ -24,11 +25,27 @@ namespace spblas { +class spmv_state_t { +public: + spmv_state_t() : spmv_state_t(mkl::mkl_allocator{}) {} + + spmv_state_t(sycl::queue* q) : spmv_state_t(mkl::mkl_allocator{q}) {} + + spmv_state_t(mkl::mkl_allocator alloc) : alloc_(alloc) {} + + sycl::queue* queue() { + return alloc_.queue(); + } + +private: + mkl::mkl_allocator alloc_; +}; + template requires((__detail::has_csr_base || __detail::has_csc_base) && __detail::has_contiguous_range_base && __ranges::contiguous_range) -void multiply(A&& a, X&& x, Y&& y) { +void multiply(spmv_state_t& state, A&& a, X&& x, Y&& y) { log_trace(""); auto a_base = __detail::get_ultimate_base(a); auto x_base = __detail::get_ultimate_base(x); @@ -36,16 +53,25 @@ void multiply(A&& a, X&& x, Y&& y) { auto alpha_optional = __detail::get_scaling_factor(a, x); tensor_scalar_t alpha = alpha_optional.value_or(1); - sycl::queue q(sycl::cpu_selector_v); + auto q_ptr = state.queue(); - auto a_handle = __mkl::create_matrix_handle(q, a_base); + auto a_handle = __mkl::create_matrix_handle(*q_ptr, a_base); auto a_transpose = __mkl::get_transpose(a); - oneapi::mkl::sparse::gemv(q, a_transpose, alpha, a_handle, + oneapi::mkl::sparse::gemv(*q_ptr, a_transpose, alpha, a_handle, __ranges::data(x_base), 0.0, __ranges::data(y)) .wait(); - oneapi::mkl::sparse::release_matrix_handle(q, &a_handle).wait(); + oneapi::mkl::sparse::release_matrix_handle(*q_ptr, &a_handle).wait(); +} + +template + requires((__detail::has_csr_base || __detail::has_csc_base) && + __detail::has_contiguous_range_base && + __ranges::contiguous_range) +void multiply(A&& a, X&& x, Y&& y) { + spmv_state_t state; + multiply(state, a, x, y); } } // namespace spblas From ce922af882c556172146c30191d2f311aa8791de Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 3 Apr 2025 02:32:18 +0200 Subject: [PATCH 3/6] add the incompleted thrust impl for oneMKL and enable gpu spmv test --- test/gtest/CMakeLists.txt | 8 ++++ test/gtest/onemkl/device_vector.hpp | 59 +++++++++++++++++++++++++++++ test/gtest/rocsparse/spmv_test.cpp | 5 +++ 3 files changed, 72 insertions(+) create mode 100644 test/gtest/onemkl/device_vector.hpp diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt index cf57db1..cfb49f5 100644 --- a/test/gtest/CMakeLists.txt +++ b/test/gtest/CMakeLists.txt @@ -21,3 +21,11 @@ target_link_libraries(spblas-tests spblas fmt GTest::gtest_main) include(GoogleTest) gtest_discover_tests(spblas-tests) + +# unify it together after cusparse +if(ENABLE_ONEMKL_SYCL) + add_executable(spblas-gpu-tests rocsparse/spmv_test.cpp) + target_include_directories(spblas-gpu-tests PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) + target_link_libraries(spblas-gpu-tests spblas fmt GTest::gtest_main) + gtest_discover_tests(spblas-gpu-tests) +endif() diff --git a/test/gtest/onemkl/device_vector.hpp b/test/gtest/onemkl/device_vector.hpp new file mode 100644 index 0000000..c3032ef --- /dev/null +++ b/test/gtest/onemkl/device_vector.hpp @@ -0,0 +1,59 @@ +#pragma once +#include +#include +#include +#include +#include + +namespace thrust { + +template + requires(std::contiguous_iterator && + std::contiguous_iterator) +OutputIt copy(InputIt first, InputIt last, OutputIt d_first) { + sycl::queue queue(sycl::default_selector_v); + using input_value_type = typename std::iterator_traits::value_type; + using output_value_type = typename std::iterator_traits::value_type; + input_value_type* first_ptr = std::to_address(first); + output_value_type* d_first_ptr = std::to_address(d_first); + auto num = std::distance(first, last); + queue.memcpy(d_first_ptr, first_ptr, num * sizeof(input_value_type)) + .wait_and_throw(); + return d_first + num; +} + +// incompleted impl for thrust vector in oneMKL just for test usage +template +class device_vector { +public: + device_vector(std::vector host_vector) + : alloc_{}, size_(host_vector.size()), ptr_(nullptr) { + ptr_ = alloc_.allocate(size_); + thrust::copy(host_vector.begin(), host_vector.end(), ptr_); + } + + ~device_vector() { + alloc_.deallocate(ptr_, size_); + ptr_ = nullptr; + } + + ValueType* begin() { + return ptr_; + } + + ValueType* end() { + return ptr_ + size_; + } + + // just to give data().get() + std::shared_ptr data() { + return std::shared_ptr(ptr_, [](ValueType* ptr) {}); + } + +private: + spblas::mkl::mkl_allocator alloc_; + std::size_t size_; + ValueType* ptr_; +}; + +} // namespace thrust diff --git a/test/gtest/rocsparse/spmv_test.cpp b/test/gtest/rocsparse/spmv_test.cpp index cf28f89..7ab19b8 100644 --- a/test/gtest/rocsparse/spmv_test.cpp +++ b/test/gtest/rocsparse/spmv_test.cpp @@ -3,7 +3,12 @@ #include #include + +#ifdef SPBLAS_ENABLE_ONEMKL_SYCL +#include "onemkl/device_vector.hpp" +#else #include +#endif using value_t = float; using index_t = spblas::index_t; From 17d52b42198f5c95a03a2541b87c440214cac6e9 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 17 Apr 2025 16:22:00 +0200 Subject: [PATCH 4/6] add gpu intel ci --- .github/workflows/ci.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 20cc46d..76574c3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -111,3 +111,23 @@ jobs: shell: bash -l {0} run: | ./build/test/gtest/spblas-tests + + intel-llvm-gpu: + runs-on: 'gpu_intel' + steps: + - uses: actions/checkout@v4 + - name: CMake + shell: bash -l {0} + run: | + module load intel-oneapi-compilers intel-oneapi-dpl intel-oneapi-mkl cmake + cmake -B build -DCMAKE_CXX_COMPILER=icpx -DENABLE_ONEMKL_SYCL=ON + - name: Build + shell: bash -l {0} + run: | + module load intel-oneapi-compilers intel-oneapi-dpl intel-oneapi-mkl + make -C build -j `nproc` + - name: Test + shell: bash -l {0} + run: | + module load intel-oneapi-compilers intel-oneapi-dpl intel-oneapi-mkl + ONEMKL_DEVICE_SELECTOR=level_zero:gpu ./build/test/gtest/spblas-gpu-tests From a69e41bc2add0a4b1e7358fcc5712788682c1bba Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 24 Apr 2025 19:20:55 +0200 Subject: [PATCH 5/6] add -fsycl --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index abdcc9d..dae38fa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,7 +22,7 @@ include(FetchContent) if (ENABLE_ONEMKL_SYCL) find_package(MKL REQUIRED) target_link_libraries(spblas INTERFACE MKL::MKL_SYCL) # SYCL APIs - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_ONEMKL_SYCL") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -DSPBLAS_ENABLE_ONEMKL_SYCL") endif() if (ENABLE_ARMPL) From 4bb2d5c82183ac9f780338b91c0ebafeac508838 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 24 Apr 2025 23:37:47 +0200 Subject: [PATCH 6/6] add comment on the queue shallow copy and use proper path for sycl.hpp --- include/spblas/vendor/onemkl_sycl/mkl_allocator.hpp | 3 ++- test/gtest/onemkl/device_vector.hpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/spblas/vendor/onemkl_sycl/mkl_allocator.hpp b/include/spblas/vendor/onemkl_sycl/mkl_allocator.hpp index 116401e..8b43414 100644 --- a/include/spblas/vendor/onemkl_sycl/mkl_allocator.hpp +++ b/include/spblas/vendor/onemkl_sycl/mkl_allocator.hpp @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace spblas { namespace mkl { @@ -25,6 +25,7 @@ class mkl_allocator { }}); } + // taking a shallow copy of queue from elsewhere, so we don't own destruction mkl_allocator(sycl::queue* q) noexcept : queue_manager_(q, [](sycl::queue* q) {}) {} diff --git a/test/gtest/onemkl/device_vector.hpp b/test/gtest/onemkl/device_vector.hpp index c3032ef..dc52eb4 100644 --- a/test/gtest/onemkl/device_vector.hpp +++ b/test/gtest/onemkl/device_vector.hpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include namespace thrust {