diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 9fd1045..bdd29f5 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -13,81 +13,95 @@ jobs:
   device-build-test:
     name: device-build-test
     runs-on: ${{ matrix.setup.runner }}
-    container: ${{ matrix.setup.container }}
+    container:
+      image: ${{ matrix.setup.container }}
+      options: ${{ matrix.setup.container-options || ' ' }}
     strategy:
       fail-fast: false
       matrix:
         arch:
-          - snb # <-- needed for the self-hosted CI node for now :/
+          - hsw
         build_type:
           - Release
           - Debug
         setup:
-          - arch: sm_60
+          - arch: sm_86
             backend: cuda
             cc: gcc-13
             cxx: g++-13
             fc: gfortran-13
-            container: seissol/gha-gpu-nv:davschneller-gpu-image
-            runner: ubuntu-24.04
+            container: seissol/gha-gpu-nv:davschneller-ci-merge
+            runner: self-hosted
+            container-options: --runtime=nvidia --gpus=all
             pythonbreak: true
-          - arch: sm_60
+            test: true
+          - arch: sm_86
             backend: acpp
             cc: gcc-13
             cxx: g++-13
             fc: gfortran-13
-            container: seissol/gha-gpu-nv:davschneller-gpu-image
-            runner: ubuntu-24.04
+            container: seissol/gha-gpu-nv:davschneller-ci-merge
+            runner: self-hosted
+            container-options: --runtime=nvidia --gpus=all
             pythonbreak: true
-          - arch: sm_60
+            test: true
+          - arch: sm_86
             backend: cuda
             cc: clang-18
             cxx: clang++-18
             fc: gfortran-13 # TODO?
-            container: seissol/gha-gpu-nv:davschneller-gpu-image
-            runner: ubuntu-24.04
+            container: seissol/gha-gpu-nv:davschneller-ci-merge
+            runner: self-hosted
+            container-options: --runtime=nvidia --gpus=all
             pythonbreak: true
-          # TODO: needs a working GPU runner
-          #- arch: sm_60
-          #  backend: cuda
-          #  cc: nvc
-          #  cxx: nvc++
-          #  fc: nvfortran
-          #  container: seissol/gha-gpu-nvhpc:davschneller-gpu-image
-          #  runner: sccs-ci-nv-sm60
-          #  pythonbreak: true
+            test: true
+          - arch: sm_86
+            backend: cuda
+            cc: nvc
+            cxx: nvc++
+            fc: nvfortran
+            container: seissol/gha-gpu-nvhpc:davschneller-ci-merge
+            runner: self-hosted
+            container-options: --runtime=nvidia --gpus=all
+            pythonbreak: true
+            test: true
           - arch: gfx906
             backend: hip
             cc: gcc-13
             cxx: g++-13
             fc: gfortran-13
-            container: seissol/gha-gpu-amd:davschneller-gpu-image
+            container: seissol/gha-gpu-amd:davschneller-ci-merge
             runner: ubuntu-24.04
             pythonbreak: true
+            test: false
           - arch: gfx906
             backend: acpp
             cc: gcc-13
             cxx: g++-13
             fc: gfortran-13
-            container: seissol/gha-gpu-amd:davschneller-gpu-image
+            container: seissol/gha-gpu-amd:davschneller-ci-merge
             runner: ubuntu-24.04
             pythonbreak: true
+            test: false
           - arch: gfx906
             backend: hip
             cc: clang-18
             cxx: clang++-18
             fc: gfortran-13 # TODO?
-            container: seissol/gha-gpu-amd:davschneller-gpu-image
+            container: seissol/gha-gpu-amd:davschneller-ci-merge
             runner: ubuntu-24.04
             pythonbreak: true
+            test: false
           - arch: skl
             backend: oneapi
             cc: icx
             cxx: icpx
             fc: ifx
             container: seissol/gha-gpu-intel:davschneller-gpu-image
-            runner: ubuntu-24.04
+            container-options: --device /dev/dri
+            runner: self-hosted
             pythonbreak: false
+            test: true
     steps:
       - name: install-gtest
         run: |
@@ -98,7 +112,7 @@ jobs:
           cd ../..

       - name: checkout-device
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
         with:
           submodules: recursive
@@ -119,5 +133,18 @@ jobs:
           export CXX=${{matrix.setup.cxx}}
           export FC=${{matrix.setup.fc}}

-          cmake .. -GNinja -DDEVICE_BACKEND=${{matrix.setup.backend}} -DSM=${{matrix.setup.arch}}
+          cmake .. -GNinja \
+            -DDEVICE_BACKEND=${{matrix.setup.backend}} \
+            -DSM=${{matrix.setup.arch}} \
+            -DCMAKE_BUILD_TYPE=${{matrix.build_type}}
+          ninja
+
+      - id: test
+        name: test-device
+        if: ${{matrix.setup.test}}
+        run: |
+          cd tests
+          cd build
+
+          ./tests
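The matrix entries now carry a per-entry `test:` flag: the CUDA and oneAPI setups run on self-hosted runners with the GPU passed through to the container (`--runtime=nvidia --gpus=all`, `--device /dev/dri`), while the gfx906 entries stay build-only (`test: false`) because the hosted runners have no AMD GPU; the `|| ' '` expression supplies a blank `options:` string for entries that define no `container-options`. A minimal sketch of the same gating done at runtime instead, via the CUDA runtime API (illustration only, not part of the patch):

```cpp
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  int count = 0;
  // cudaGetDeviceCount reports 0 (or an error) when no usable GPU is visible,
  // e.g. when the container runs without --runtime=nvidia --gpus=all.
  const cudaError_t err = cudaGetDeviceCount(&count);
  if (err != cudaSuccess || count == 0) {
    std::printf("no GPU visible, skipping device tests\n");
    return 0; // plays the role of test: false
  }
  std::printf("%d GPU(s) visible, running device tests\n", count);
  return 0;
}
```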
diff --git a/algorithms/cudahip/ArrayManip.cpp b/algorithms/cudahip/ArrayManip.cpp
index ea3024f..91820ab 100644
--- a/algorithms/cudahip/ArrayManip.cpp
+++ b/algorithms/cudahip/ArrayManip.cpp
@@ -80,7 +80,7 @@ template void
 //--------------------------------------------------------------------------------------------------
 __global__ void kernel_touchMemory(void* ptr, size_t size, bool clean) {
-  int id = threadIdx.x + blockIdx.x * blockDim.x;
+  const int id = threadIdx.x + blockIdx.x * blockDim.x;
   if (clean) {
     imemset(ptr, size, id, blockDim.x * gridDim.x);
   } else {
diff --git a/algorithms/cudahip/BatchManip.cpp b/algorithms/cudahip/BatchManip.cpp
index 55fba3a..9ce8b04 100644
--- a/algorithms/cudahip/BatchManip.cpp
+++ b/algorithms/cudahip/BatchManip.cpp
@@ -131,6 +131,13 @@ template void Algorithms::setToValue(
     int** out, int value, size_t elementSize, size_t numElements, void* streamPtr);
 template void Algorithms::setToValue(
     unsigned** out, unsigned value, size_t elementSize, size_t numElements, void* streamPtr);
+template void Algorithms::setToValue(
+    long** out, long value, size_t elementSize, size_t numElements, void* streamPtr);
+template void Algorithms::setToValue(unsigned long** out,
+                                     unsigned long value,
+                                     size_t elementSize,
+                                     size_t numElements,
+                                     void* streamPtr);
 template void Algorithms::setToValue(
     char** out, char value, size_t elementSize, size_t numElements, void* streamPtr);
diff --git a/algorithms/cudahip/Reduction.cpp b/algorithms/cudahip/Reduction.cpp
index f8342c4..45e8873 100644
--- a/algorithms/cudahip/Reduction.cpp
+++ b/algorithms/cudahip/Reduction.cpp
@@ -56,20 +56,24 @@ __launch_bounds__(1024) void __global__ kernel_reduce(
   const auto threadInWarp = threadIdx.x % warpSize;
   const auto warpsNeeded = (size + warpSize - 1) / warpSize;

+  auto value = operation.defaultValue;
   auto acc = operation.defaultValue;
 #pragma unroll 4
   for (std::size_t i = currentWarp; i < warpsNeeded; i += warpCount) {
     const auto id = threadInWarp + i * warpSize;
-    auto value = (id < size) ? static_cast<AccT>(ntload(&vector[id])) : operation.defaultValue;
+    const auto valueNew =
+        (id < size) ? static_cast<AccT>(ntload(&vector[id])) : operation.defaultValue;

-    for (int offset = 1; offset < warpSize; offset *= 2) {
-      value = operation(value, shuffledown(value, offset));
-    }
+    value = operation(value, valueNew);
+  }

-    acc = operation(acc, value);
+  for (int offset = 1; offset < warpSize; offset *= 2) {
+    value = operation(value, shuffledown(value, offset));
   }

+  acc = operation(acc, value);
+
   if (threadInWarp == 0) {
     shmem[currentWarp] = acc;
   }
@@ -78,19 +82,24 @@ __launch_bounds__(1024) void __global__ kernel_reduce(

   if (currentWarp == 0) {
     const auto lastWarpsNeeded = (warpCount + warpSize - 1) / warpSize;
+
+    auto value = operation.defaultValue;
     auto lastAcc = operation.defaultValue;
+
 #pragma unroll 2
     for (int i = 0; i < lastWarpsNeeded; ++i) {
       const auto id = threadInWarp + i * warpSize;
-      auto value = (id < warpCount) ? shmem[id] : operation.defaultValue;
+      const auto valueNew = (id < warpCount) ? shmem[id] : operation.defaultValue;

-      for (int offset = 1; offset < warpSize; offset *= 2) {
-        value = operation(value, shuffledown(value, offset));
-      }
+      value = operation(value, valueNew);
+    }

-      lastAcc = operation(lastAcc, value);
+    for (int offset = 1; offset < warpSize; offset *= 2) {
+      value = operation(value, shuffledown(value, offset));
     }

+    lastAcc = operation(lastAcc, value);
+
     if (threadIdx.x == 0) {
       if (overrideResult) {
         ntstore(result, lastAcc);
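The reduction rework changes the loop shape: previously every strided load was followed by a full warp shuffle tree; now each thread first folds all of its loads into a private accumulator, and a single shuffle tree runs once at the end, saving log2(warpSize) shuffles per loop iteration. A host-side sketch of the same two-phase structure (names hypothetical; the real kernel uses `shuffledown` over warp lanes):

```cpp
// Phase 1: each lane serially folds its strided elements into a private value.
// Phase 2: one log2(warpSize)-step tree combine, mimicking shuffledown.
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  constexpr int warpSize = 32;
  std::vector<int> data(1000, 1);

  std::vector<long> lane(warpSize, 0); // stands in for per-thread registers
  for (std::size_t i = 0; i < data.size(); ++i) {
    lane[i % warpSize] += data[i]; // phase 1: fold, no shuffles here anymore
  }

  // Phase 2: tree combine; lane[t] absorbs lane[t + offset] each round.
  for (int offset = 1; offset < warpSize; offset *= 2) {
    for (int t = 0; t + offset < warpSize; ++t) {
      lane[t] += lane[t + offset];
    }
  }
  std::printf("sum = %ld (expected %zu)\n", lane[0], data.size());
}
```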
diff --git a/algorithms/sycl/BatchManip.cpp b/algorithms/sycl/BatchManip.cpp
index ddc97cf..0aab2b3 100644
--- a/algorithms/sycl/BatchManip.cpp
+++ b/algorithms/sycl/BatchManip.cpp
@@ -118,6 +118,13 @@ template void Algorithms::setToValue(
     int** out, int value, size_t elementSize, size_t numElements, void* streamPtr);
 template void Algorithms::setToValue(
     unsigned** out, unsigned value, size_t elementSize, size_t numElements, void* streamPtr);
+template void Algorithms::setToValue(
+    long** out, long value, size_t elementSize, size_t numElements, void* streamPtr);
+template void Algorithms::setToValue(unsigned long** out,
+                                     unsigned long value,
+                                     size_t elementSize,
+                                     size_t numElements,
+                                     void* streamPtr);
 template void Algorithms::setToValue(
     char** out, char value, size_t elementSize, size_t numElements, void* streamPtr);
diff --git a/algorithms/sycl/Reduction.cpp b/algorithms/sycl/Reduction.cpp
index 21116de..e0d5d52 100644
--- a/algorithms/sycl/Reduction.cpp
+++ b/algorithms/sycl/Reduction.cpp
@@ -93,8 +93,7 @@ void launchReduction(AccT* result,
       (size + (workGroupSize * itemsPerWorkItem) - 1) / (workGroupSize * itemsPerWorkItem);

   cgh.parallel_for(
-      sycl::nd_range<1>{numWorkGroups * itemsPerWorkItem, workGroupSize},
-      [=](sycl::nd_item<1> idx) {
+      sycl::nd_range<1>{numWorkGroups * workGroupSize, workGroupSize}, [=](sycl::nd_item<1> idx) {
         const auto localId = idx.get_local_id(0);
         const auto groupId = idx.get_group(0);
diff --git a/interfaces/sycl/Internals.h b/interfaces/sycl/Internals.h
index 82230b0..ce14865 100644
--- a/interfaces/sycl/Internals.h
+++ b/interfaces/sycl/Internals.h
@@ -12,7 +12,7 @@
 #include

 namespace device::internals {
-constexpr static int DefaultBlockDim = 1024;
+constexpr static int DefaultBlockDim = 256;

 template <typename T> void waitCheck(T&& result) {
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index ab5d521..7369f47 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -22,7 +22,7 @@
 add_subdirectory(.. root)

 find_package(GTest REQUIRED)
-add_executable(tests main.cpp reductions.cpp array_manip.cpp memory.cpp batch_manip.cpp)
+add_executable(tests main.cpp reductions.cpp memory.cpp array_manip.cpp batch_manip.cpp)

 target_link_libraries(tests PRIVATE device ${GTEST_BOTH_LIBRARIES})
 target_include_directories(tests PRIVATE ${GTEST_INCLUDE_DIR})
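The SYCL fix corrects the global size of the `nd_range`: it must be `numWorkGroups * workGroupSize` (a multiple of the local size), not `numWorkGroups * itemsPerWorkItem`; each work item already covers `itemsPerWorkItem` elements inside the kernel. The sizing arithmetic as a standalone check (values illustrative):

```cpp
#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t size = 100000;       // elements to reduce
  const std::size_t workGroupSize = 256; // matches the new DefaultBlockDim
  const std::size_t itemsPerWorkItem = 4;

  // Each group covers workGroupSize * itemsPerWorkItem elements.
  const std::size_t perGroup = workGroupSize * itemsPerWorkItem;
  const std::size_t numWorkGroups = (size + perGroup - 1) / perGroup;

  // Fixed: global size is groups * local size, a valid nd_range pairing.
  const std::size_t globalSize = numWorkGroups * workGroupSize;
  std::printf("groups=%zu global=%zu (multiple of local: %s)\n",
              numWorkGroups, globalSize,
              globalSize % workGroupSize == 0 ? "yes" : "no");
}
```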
diff --git a/tests/array_manip.cpp b/tests/array_manip.cpp
index ba05977..71df856 100644
--- a/tests/array_manip.cpp
+++ b/tests/array_manip.cpp
@@ -73,15 +73,17 @@ TEST_F(ArrayManip, touchNoClean32) {
   device->api->freeGlobMem(arr);
 }

+// avoid double here: some archs (including some in the CI) do not fully support it
+
 TEST_F(ArrayManip, touchClean64) {
   const int N = 100;
-  double* arr = (double*)device->api->allocGlobMem(N * sizeof(double));
+  long* arr = (long*)device->api->allocGlobMem(N * sizeof(long));

   device->algorithms.touchMemory(arr, N, true, device->api->getDefaultStream());

-  std::vector<double> hostVector(N, 1);
+  std::vector<long> hostVector(N, 1);
   device->api->copyFromAsync(
-      &hostVector[0], arr, N * sizeof(double), device->api->getDefaultStream());
+      &hostVector[0], arr, N * sizeof(long), device->api->getDefaultStream());

   device->api->syncDefaultStreamWithHost();
@@ -95,14 +97,13 @@ TEST_F(ArrayManip, touchClean64) {

 TEST_F(ArrayManip, touchNoClean64) {
   const int N = 100;
-  double* arr = (double*)device->api->allocGlobMem(N * sizeof(double));
-  std::vector<double> hostVector(N, 0);
+  long* arr = (long*)device->api->allocGlobMem(N * sizeof(long));
+  std::vector<long> hostVector(N, 0);

-  device->api->copyToAsync(
-      arr, &hostVector[0], N * sizeof(double), device->api->getDefaultStream());
+  device->api->copyToAsync(arr, &hostVector[0], N * sizeof(long), device->api->getDefaultStream());
   device->algorithms.touchMemory(arr, N, false, device->api->getDefaultStream());
   device->api->copyFromAsync(
-      &hostVector[0], arr, N * sizeof(double), device->api->getDefaultStream());
+      &hostVector[0], arr, N * sizeof(long), device->api->getDefaultStream());

   device->api->syncDefaultStreamWithHost();
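The switch from `double` to `long` keeps the tests at a 64-bit element type while avoiding FP64 arithmetic, which not all CI-visible devices support. A quick check of the size assumption (holds on LP64 platforms such as Linux x86-64/aarch64; on LLP64 systems `long` is 4 bytes and `long long` would be the stand-in):

```cpp
#include <cstdio>

int main() {
  // On LP64 platforms long matches double at 8 bytes, so the 64-bit
  // touch/copy paths are still exercised without requiring FP64 on the GPU.
  std::printf("sizeof(long)=%zu sizeof(double)=%zu\n",
              sizeof(long), sizeof(double));
}
```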
diff --git a/tests/batch_manip.cpp b/tests/batch_manip.cpp
index a84a719..54ad1b1 100644
--- a/tests/batch_manip.cpp
+++ b/tests/batch_manip.cpp
@@ -32,7 +32,7 @@ class BatchManip : public BaseTestSuite {
     std::forward<F>(inner)(batch, data);

     device->api->freeGlobMem(data);
-    device->api->freeGlobMem(batch);
+    device->api->freeUnifiedMem(batch);
   }
 };
@@ -166,14 +166,14 @@ TEST_F(BatchManip, uniformToScatter32) {
 TEST_F(BatchManip, fill64) {
   const int N = 100;
   const int M = 120;
-  testWrapper<double>(N, M, false, [&](double** batch, double* data) {
-    double scalar = 502;
+  testWrapper<long>(N, M, false, [&](long** batch, long* data) {
+    long scalar = 502;
     device->algorithms.setToValue(batch, scalar, M, N, device->api->getDefaultStream());

-    std::vector<double> hostVector(N * M, 0);
+    std::vector<long> hostVector(N * M, 0);
     device->api->copyFromAsync(
-        &hostVector[0], data, N * M * sizeof(double), device->api->getDefaultStream());
+        &hostVector[0], data, N * M * sizeof(long), device->api->getDefaultStream());

     device->api->syncDefaultStreamWithHost();
@@ -186,12 +186,12 @@ TEST_F(BatchManip, fill64) {
 TEST_F(BatchManip, touchClean64) {
   const int N = 100;
   const int M = 120;
-  testWrapper<double>(N, M, false, [&](double** batch, double* data) {
+  testWrapper<long>(N, M, false, [&](long** batch, long* data) {
     device->algorithms.touchBatchedMemory(batch, M, N, true, device->api->getDefaultStream());

-    std::vector<double> hostVector(N * M, 1);
+    std::vector<long> hostVector(N * M, 1);
     device->api->copyFromAsync(
-        &hostVector[0], data, M * N * sizeof(double), device->api->getDefaultStream());
+        &hostVector[0], data, M * N * sizeof(long), device->api->getDefaultStream());

     device->api->syncDefaultStreamWithHost();
@@ -200,14 +200,14 @@
     }
   });

-  testWrapper<double>(N, M, true, [&](double** batch, double* data) {
-    std::vector<double> hostVector(N * M, 1);
+  testWrapper<long>(N, M, true, [&](long** batch, long* data) {
+    std::vector<long> hostVector(N * M, 1);
     device->api->copyToAsync(
-        data, &hostVector[0], N * M * sizeof(double), device->api->getDefaultStream());
+        data, &hostVector[0], N * M * sizeof(long), device->api->getDefaultStream());
     device->algorithms.touchBatchedMemory(batch, M, N, true, device->api->getDefaultStream());
     device->api->copyFromAsync(
-        &hostVector[0], data, M * N * sizeof(double), device->api->getDefaultStream());
+        &hostVector[0], data, M * N * sizeof(long), device->api->getDefaultStream());

     device->api->syncDefaultStreamWithHost();
@@ -226,14 +226,14 @@ TEST_F(BatchManip, touchClean64) {
 TEST_F(BatchManip, touchNoClean64) {
   const int N = 100;
   const int M = 120;
-  testWrapper<double>(N, M, false, [&](double** batch, double* data) {
-    std::vector<double> hostVector(N * M, 1);
+  testWrapper<long>(N, M, false, [&](long** batch, long* data) {
+    std::vector<long> hostVector(N * M, 1);
     device->api->copyToAsync(
-        data, &hostVector[0], N * M * sizeof(double), device->api->getDefaultStream());
+        data, &hostVector[0], N * M * sizeof(long), device->api->getDefaultStream());
     device->algorithms.touchBatchedMemory(batch, M, N, false, device->api->getDefaultStream());
     device->api->copyFromAsync(
-        &hostVector[0], data, N * M * sizeof(double), device->api->getDefaultStream());
+        &hostVector[0], data, N * M * sizeof(long), device->api->getDefaultStream());

     device->api->syncDefaultStreamWithHost();
@@ -247,16 +247,16 @@ TEST_F(BatchManip, scatterToUniform64) {
   const int N = 100;
   const int M = 120;

-  double* data2 = (double*)device->api->allocGlobMem(N * M * sizeof(double));
-  testWrapper<double>(N, M, false, [&](double** batch, double* data) {
-    std::vector<double> hostVector(N * M, 1);
+  long* data2 = (long*)device->api->allocGlobMem(N * M * sizeof(long));
+  testWrapper<long>(N, M, false, [&](long** batch, long* data) {
+    std::vector<long> hostVector(N * M, 1);
     device->api->copyToAsync(
-        data, &hostVector[0], N * M * sizeof(double), device->api->getDefaultStream());
+        data, &hostVector[0], N * M * sizeof(long), device->api->getDefaultStream());
     device->algorithms.copyScatterToUniform(
-        const_cast<const double**>(batch), data2, M, M, N, device->api->getDefaultStream());
+        const_cast<const long**>(batch), data2, M, M, N, device->api->getDefaultStream());
     device->api->copyFromAsync(
-        &hostVector[0], data2, N * M * sizeof(double), device->api->getDefaultStream());
+        &hostVector[0], data2, N * M * sizeof(long), device->api->getDefaultStream());

     device->api->syncDefaultStreamWithHost();
@@ -271,15 +271,15 @@ TEST_F(BatchManip, uniformToScatter64) {
   const int N = 100;
   const int M = 120;

-  double* data2 = (double*)device->api->allocGlobMem(N * M * sizeof(double));
-  testWrapper<double>(N, M, false, [&](double** batch, double* data) {
-    std::vector<double> hostVector(N * M, 1);
+  long* data2 = (long*)device->api->allocGlobMem(N * M * sizeof(long));
+  testWrapper<long>(N, M, false, [&](long** batch, long* data) {
+    std::vector<long> hostVector(N * M, 1);
     device->api->copyToAsync(
-        data2, &hostVector[0], N * M * sizeof(double), device->api->getDefaultStream());
+        data2, &hostVector[0], N * M * sizeof(long), device->api->getDefaultStream());
     device->algorithms.copyUniformToScatter(data2, batch, M, M, N, device->api->getDefaultStream());
     device->api->copyFromAsync(
-        &hostVector[0], data, N * M * sizeof(double), device->api->getDefaultStream());
+        &hostVector[0], data, N * M * sizeof(long), device->api->getDefaultStream());

     device->api->syncDefaultStreamWithHost();
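The one-line fix in the test wrapper pairs `batch` with the unified-memory deallocator (it is presumably allocated via the unified allocator so the host can fill the pointer array); freeing it through `freeGlobMem` mixed allocator families. A small RAII sketch (hypothetical stand-ins, not the repo API) that binds the deleter at allocation time so the pairing cannot drift:

```cpp
#include <cstdio>
#include <cstdlib>
#include <memory>

// Stand-ins for the two allocator families; the real calls live behind
// device->api. The point: each pointer must return through its own family.
void* allocGlobMem(std::size_t n) { return std::malloc(n); }
void freeGlobMem(void* p) { std::free(p); }
void* allocUnifiedMem(std::size_t n) { return std::malloc(n); }
void freeUnifiedMem(void* p) { std::free(p); }

// unique_ptr carries the matching deleter from the moment of allocation.
using DevicePtr = std::unique_ptr<void, void (*)(void*)>;

int main() {
  DevicePtr batch{allocUnifiedMem(100 * sizeof(long*)), freeUnifiedMem};
  DevicePtr data{allocGlobMem(100 * 120 * sizeof(long)), freeGlobMem};
  std::printf("each pointer is freed by the allocator family it came from\n");
} // deleters run automatically, mismatch is impossible by construction
```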
diff --git a/tests/main.cpp b/tests/main.cpp
index 3686780..cfba087 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -14,5 +14,10 @@ int main(int argc, char** argv) {
   device.api->setDevice(0);
   device.api->initialize();

-  return RUN_ALL_TESTS();
+  const auto result = RUN_ALL_TESTS();
+
+  device.api->syncDevice();
+  device.api->finalize();
+
+  return result;
 }
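`main()` now captures the `RUN_ALL_TESTS()` result, drains the device, and finalizes it before returning; returning directly skipped device teardown entirely. The same ordering expressed as a scope guard, so teardown also runs on early-exit paths (hypothetical `Api` type; the patch sequences the calls manually):

```cpp
#include <cstdio>

struct Api {
  void syncDevice() { std::printf("syncDevice\n"); }
  void finalize() { std::printf("finalize\n"); }
};

struct Teardown {
  Api& api;
  ~Teardown() {
    api.syncDevice(); // drain outstanding device work first
    api.finalize();   // then release the device/runtime state
  }
};

int runAllTests() { return 0; } // stand-in for gtest's RUN_ALL_TESTS()

int main() {
  Api api;
  Teardown guard{api};
  const int result = runAllTests();
  return result; // guard's destructor fires after result is fixed, on every path
}
```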