79 changes: 53 additions & 26 deletions .github/workflows/test.yml
@@ -13,81 +13,95 @@ jobs:
device-build-test:
name: device-build-test
runs-on: ${{ matrix.setup.runner }}
container: ${{ matrix.setup.container }}
container:
image: ${{ matrix.setup.container }}
options: ${{ matrix.setup.container-options || ' ' }}
strategy:
fail-fast: false
matrix:
arch:
- snb # <-- needed for the self-hosted CI node for now :/
- hsw
build_type:
- Release
- Debug
setup:
- arch: sm_60
- arch: sm_86
backend: cuda
cc: gcc-13
cxx: g++-13
fc: gfortran-13
container: seissol/gha-gpu-nv:davschneller-gpu-image
runner: ubuntu-24.04
container: seissol/gha-gpu-nv:davschneller-ci-merge
runner: self-hosted
container-options: --runtime=nvidia --gpus=all
pythonbreak: true
- arch: sm_60
test: true
- arch: sm_86
backend: acpp
cc: gcc-13
cxx: g++-13
fc: gfortran-13
container: seissol/gha-gpu-nv:davschneller-gpu-image
runner: ubuntu-24.04
container: seissol/gha-gpu-nv:davschneller-ci-merge
runner: self-hosted
container-options: --runtime=nvidia --gpus=all
pythonbreak: true
- arch: sm_60
test: true
- arch: sm_86
backend: cuda
cc: clang-18
cxx: clang++-18
fc: gfortran-13 # TODO?
container: seissol/gha-gpu-nv:davschneller-gpu-image
runner: ubuntu-24.04
container: seissol/gha-gpu-nv:davschneller-ci-merge
runner: self-hosted
container-options: --runtime=nvidia --gpus=all
pythonbreak: true
# TODO: needs a working GPU runner
#- arch: sm_60
# backend: cuda
# cc: nvc
# cxx: nvc++
# fc: nvfortran
# container: seissol/gha-gpu-nvhpc:davschneller-gpu-image
# runner: sccs-ci-nv-sm60
# pythonbreak: true
test: true
- arch: sm_86
backend: cuda
cc: nvc
cxx: nvc++
fc: nvfortran
container: seissol/gha-gpu-nvhpc:davschneller-ci-merge
runner: self-hosted
container-options: --runtime=nvidia --gpus=all
pythonbreak: true
test: true
- arch: gfx906
backend: hip
cc: gcc-13
cxx: g++-13
fc: gfortran-13
container: seissol/gha-gpu-amd:davschneller-gpu-image
container: seissol/gha-gpu-amd:davschneller-ci-merge
runner: ubuntu-24.04
pythonbreak: true
test: false
- arch: gfx906
backend: acpp
cc: gcc-13
cxx: g++-13
fc: gfortran-13
container: seissol/gha-gpu-amd:davschneller-gpu-image
container: seissol/gha-gpu-amd:davschneller-ci-merge
runner: ubuntu-24.04
pythonbreak: true
test: false
- arch: gfx906
backend: hip
cc: clang-18
cxx: clang++-18
fc: gfortran-13 # TODO?
container: seissol/gha-gpu-amd:davschneller-gpu-image
container: seissol/gha-gpu-amd:davschneller-ci-merge
runner: ubuntu-24.04
pythonbreak: true
test: false
- arch: skl
backend: oneapi
cc: icx
cxx: icpx
fc: ifx
container: seissol/gha-gpu-intel:davschneller-gpu-image
runner: ubuntu-24.04
container-options: --device /dev/dri
runner: self-hosted
pythonbreak: false
test: true
steps:
- name: install-gtest
run: |
@@ -98,7 +112,7 @@ jobs:
cd ../..

- name: checkout-device
uses: actions/checkout@v4
uses: actions/checkout@v6
with:
submodules: recursive

@@ -119,5 +133,18 @@ jobs:
export CXX=${{matrix.setup.cxx}}
export FC=${{matrix.setup.fc}}

cmake .. -GNinja -DDEVICE_BACKEND=${{matrix.setup.backend}} -DSM=${{matrix.setup.arch}}
cmake .. -GNinja \
-DDEVICE_BACKEND=${{matrix.setup.backend}} \
-DSM=${{matrix.setup.arch}} \
-DCMAKE_BUILD_TYPE=${{matrix.build_type}}

ninja

- id: test
name: test-device
if: ${{matrix.setup.test}}
run: |
cd tests
cd build

./tests
2 changes: 1 addition & 1 deletion algorithms/cudahip/ArrayManip.cpp
@@ -80,7 +80,7 @@ template void

//--------------------------------------------------------------------------------------------------
__global__ void kernel_touchMemory(void* ptr, size_t size, bool clean) {
int id = threadIdx.x + blockIdx.x * blockDim.x;
const int id = threadIdx.x + blockIdx.x * blockDim.x;
if (clean) {
imemset(ptr, size, id, blockDim.x * gridDim.x);
} else {
7 changes: 7 additions & 0 deletions algorithms/cudahip/BatchManip.cpp
@@ -131,6 +131,13 @@ template void Algorithms::setToValue(
int** out, int value, size_t elementSize, size_t numElements, void* streamPtr);
template void Algorithms::setToValue(
unsigned** out, unsigned value, size_t elementSize, size_t numElements, void* streamPtr);
template void Algorithms::setToValue(
long** out, long value, size_t elementSize, size_t numElements, void* streamPtr);
template void Algorithms::setToValue(unsigned long** out,
unsigned long value,
size_t elementSize,
size_t numElements,
void* streamPtr);
template void Algorithms::setToValue(
char** out, char value, size_t elementSize, size_t numElements, void* streamPtr);

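A brief note on the new instantiations above: they make the batched fill available for 64-bit integer element types (long and unsigned long) in addition to the existing int, unsigned, and char variants; the same instantiations are added for the SYCL backend further down. Below is a hedged usage sketch, not taken from this PR: it assumes setToValue fills each of numElements device buffers with value across elementSize entries (as the parameter names suggest), and that the device handle exposes the same algorithms/api accessors used in tests/array_manip.cpp below.

#include <cstddef>

// Hedged usage sketch: fills every buffer in a batch of `long` device pointers
// with 1, using the newly instantiated long overload of setToValue.
// `DeviceT` stands in for the library's device handle type (assumed, not named here).
template <typename DeviceT>
void fillBatchWithOnes(DeviceT* device, long** devBatch,
                       std::size_t elementSize, std::size_t numElements) {
  device->algorithms.setToValue(devBatch, 1L, elementSize, numElements,
                                device->api->getDefaultStream());
  device->api->syncDefaultStreamWithHost();
}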
29 changes: 19 additions & 10 deletions algorithms/cudahip/Reduction.cpp
@@ -56,20 +56,24 @@ __launch_bounds__(1024) void __global__ kernel_reduce(
const auto threadInWarp = threadIdx.x % warpSize;
const auto warpsNeeded = (size + warpSize - 1) / warpSize;

auto value = operation.defaultValue;
auto acc = operation.defaultValue;

#pragma unroll 4
for (std::size_t i = currentWarp; i < warpsNeeded; i += warpCount) {
const auto id = threadInWarp + i * warpSize;
auto value = (id < size) ? static_cast<AccT>(ntload(&vector[id])) : operation.defaultValue;
const auto valueNew =
(id < size) ? static_cast<AccT>(ntload(&vector[id])) : operation.defaultValue;

for (int offset = 1; offset < warpSize; offset *= 2) {
value = operation(value, shuffledown(value, offset));
}
value = operation(value, valueNew);
}

acc = operation(acc, value);
for (int offset = 1; offset < warpSize; offset *= 2) {
value = operation(value, shuffledown(value, offset));
}

acc = operation(acc, value);

if (threadInWarp == 0) {
shmem[currentWarp] = acc;
}
@@ -78,19 +82,24 @@

if (currentWarp == 0) {
const auto lastWarpsNeeded = (warpCount + warpSize - 1) / warpSize;

auto value = operation.defaultValue;
auto lastAcc = operation.defaultValue;

#pragma unroll 2
for (int i = 0; i < lastWarpsNeeded; ++i) {
const auto id = threadInWarp + i * warpSize;
auto value = (id < warpCount) ? shmem[id] : operation.defaultValue;
const auto valueNew = (id < warpCount) ? shmem[id] : operation.defaultValue;

for (int offset = 1; offset < warpSize; offset *= 2) {
value = operation(value, shuffledown(value, offset));
}
value = operation(value, valueNew);
}

lastAcc = operation(lastAcc, value);
for (int offset = 1; offset < warpSize; offset *= 2) {
value = operation(value, shuffledown(value, offset));
}

lastAcc = operation(lastAcc, value);

if (threadIdx.x == 0) {
if (overrideResult) {
ntstore(result, lastAcc);
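The restructure above moves the warp shuffle out of the strided load loop: each thread first folds its elements into a private accumulator, and the shuffle reduction then runs once per warp after the loop (and once more over the per-warp partials in shared memory), instead of once per iteration. A minimal standalone CUDA sketch of that pattern follows; it assumes a plain float sum instead of the generic operation, warpSize == 32, a single-block launch with a block size that is a multiple of 32, and uses __shfl_down_sync directly rather than the library's shuffledown/ntload helpers.

#include <cstddef>

// Sketch of the "accumulate first, shuffle once" reduction used by the kernel above.
__global__ void reduceSumSketch(const float* vec, std::size_t size, float* result) {
  __shared__ float shmem[32];                 // one partial sum per warp
  const int lane = threadIdx.x % 32;
  const int warp = threadIdx.x / 32;
  const int warpCount = blockDim.x / 32;

  // Phase 1: fold strided elements into a private accumulator (no shuffles yet).
  float value = 0.0f;
  const std::size_t warpsNeeded = (size + 31) / 32;
  for (std::size_t i = warp; i < warpsNeeded; i += warpCount) {
    const std::size_t id = static_cast<std::size_t>(lane) + i * 32;
    value += (id < size) ? vec[id] : 0.0f;
  }

  // Phase 2: a single shuffle reduction per warp; lane 0 ends up with the warp sum.
  for (int offset = 1; offset < 32; offset *= 2) {
    value += __shfl_down_sync(0xffffffffu, value, offset);
  }
  if (lane == 0) { shmem[warp] = value; }
  __syncthreads();

  // Phase 3: warp 0 reduces the per-warp partials the same way.
  if (warp == 0) {
    float acc = (lane < warpCount) ? shmem[lane] : 0.0f;
    for (int offset = 1; offset < 32; offset *= 2) {
      acc += __shfl_down_sync(0xffffffffu, acc, offset);
    }
    if (threadIdx.x == 0) { *result = acc; }
  }
}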
7 changes: 7 additions & 0 deletions algorithms/sycl/BatchManip.cpp
@@ -118,6 +118,13 @@ template void Algorithms::setToValue(
int** out, int value, size_t elementSize, size_t numElements, void* streamPtr);
template void Algorithms::setToValue(
unsigned** out, unsigned value, size_t elementSize, size_t numElements, void* streamPtr);
template void Algorithms::setToValue(
long** out, long value, size_t elementSize, size_t numElements, void* streamPtr);
template void Algorithms::setToValue(unsigned long** out,
unsigned long value,
size_t elementSize,
size_t numElements,
void* streamPtr);
template void Algorithms::setToValue(
char** out, char value, size_t elementSize, size_t numElements, void* streamPtr);

3 changes: 1 addition & 2 deletions algorithms/sycl/Reduction.cpp
@@ -93,8 +93,7 @@ void launchReduction(AccT* result,
(size + (workGroupSize * itemsPerWorkItem) - 1) / (workGroupSize * itemsPerWorkItem);

cgh.parallel_for(
sycl::nd_range<1>{numWorkGroups * itemsPerWorkItem, workGroupSize},
[=](sycl::nd_item<1> idx) {
sycl::nd_range<1>{numWorkGroups * workGroupSize, workGroupSize}, [=](sycl::nd_item<1> idx) {
const auto localId = idx.get_local_id(0);
const auto groupId = idx.get_group(0);

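Context for the one-line fix above: the first argument of sycl::nd_range is the total number of work-items, which must equal the work-group count times the work-group size; itemsPerWorkItem only determines how many elements each work-item loops over. Tying the global range to itemsPerWorkItem therefore launched the wrong number of groups (or an invalid range when it was not a multiple of workGroupSize). Below is a small sketch of the corrected launch shape with placeholder constants and an empty kernel body; it is not the library's code.

#include <sycl/sycl.hpp>
#include <cstddef>

void launchSketch(sycl::queue& q, std::size_t size) {
  constexpr std::size_t workGroupSize = 256;
  constexpr std::size_t itemsPerWorkItem = 4;
  const std::size_t numWorkGroups =
      (size + workGroupSize * itemsPerWorkItem - 1) / (workGroupSize * itemsPerWorkItem);

  q.submit([&](sycl::handler& cgh) {
    cgh.parallel_for(
        // global range = work-groups x work-group size; itemsPerWorkItem only
        // affects how many elements each work-item processes in the body
        sycl::nd_range<1>{numWorkGroups * workGroupSize, workGroupSize},
        [=](sycl::nd_item<1> item) {
          const std::size_t first = item.get_global_id(0) * itemsPerWorkItem;
          (void)first; // each work-item would handle [first, first + itemsPerWorkItem)
        });
  });
  q.wait();
}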
2 changes: 1 addition & 1 deletion interfaces/sycl/Internals.h
@@ -12,7 +12,7 @@
#include <sycl/sycl.hpp>

namespace device::internals {
constexpr static int DefaultBlockDim = 1024;
constexpr static int DefaultBlockDim = 256;

template <typename T>
void waitCheck(T&& result) {
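A note on the smaller default: SYCL devices report a maximum work-group size, and a 1024-item group can exceed that limit on some devices, so 256 is the more portable choice. The sketch below, not part of this change, shows how a caller could clamp a requested size against the device limit using the standard max_work_group_size query.

#include <sycl/sycl.hpp>
#include <algorithm>
#include <cstddef>

// Clamp a requested work-group size to what the device actually supports.
std::size_t clampedBlockDim(const sycl::queue& q, std::size_t requested) {
  const std::size_t deviceMax =
      q.get_device().get_info<sycl::info::device::max_work_group_size>();
  return std::min(requested, deviceMax);
}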
2 changes: 1 addition & 1 deletion tests/CMakeLists.txt
@@ -22,7 +22,7 @@ add_subdirectory(.. root)

find_package(GTest REQUIRED)

add_executable(tests main.cpp reductions.cpp array_manip.cpp memory.cpp batch_manip.cpp)
add_executable(tests main.cpp reductions.cpp memory.cpp array_manip.cpp batch_manip.cpp)
target_link_libraries(tests PRIVATE device ${GTEST_BOTH_LIBRARIES})
target_include_directories(tests PRIVATE ${GTEST_INCLUDE_DIR})

17 changes: 9 additions & 8 deletions tests/array_manip.cpp
@@ -73,15 +73,17 @@ TEST_F(ArrayManip, touchNoClean32) {
device->api->freeGlobMem(arr);
}

// avoid double here, since some archs (including some used in the CI) don't seem to fully support it

TEST_F(ArrayManip, touchClean64) {

const int N = 100;
double* arr = (double*)device->api->allocGlobMem(N * sizeof(double));
long* arr = (long*)device->api->allocGlobMem(N * sizeof(long));
device->algorithms.touchMemory(arr, N, true, device->api->getDefaultStream());
std::vector<double> hostVector(N, 1);
std::vector<long> hostVector(N, 1);

device->api->copyFromAsync(
&hostVector[0], arr, N * sizeof(double), device->api->getDefaultStream());
&hostVector[0], arr, N * sizeof(long), device->api->getDefaultStream());

device->api->syncDefaultStreamWithHost();

@@ -95,14 +97,13 @@ TEST_F(ArrayManip, touchClean64) {
TEST_F(ArrayManip, touchNoClean64) {

const int N = 100;
double* arr = (double*)device->api->allocGlobMem(N * sizeof(double));
std::vector<double> hostVector(N, 0);
long* arr = (long*)device->api->allocGlobMem(N * sizeof(long));
std::vector<long> hostVector(N, 0);

device->api->copyToAsync(
arr, &hostVector[0], N * sizeof(double), device->api->getDefaultStream());
device->api->copyToAsync(arr, &hostVector[0], N * sizeof(long), device->api->getDefaultStream());
device->algorithms.touchMemory(arr, N, false, device->api->getDefaultStream());
device->api->copyFromAsync(
&hostVector[0], arr, N * sizeof(double), device->api->getDefaultStream());
&hostVector[0], arr, N * sizeof(long), device->api->getDefaultStream());

device->api->syncDefaultStreamWithHost();

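Since the 64-bit touch tests now hard-code long, one way to keep the coverage type-generic (and re-enable a double instantiation on archs that support it) is a gtest typed test. The sketch below is not part of this PR; it assumes the ArrayManip fixture can serve as a base class and reuses the same device API calls as the tests above, with the zero-after-clean expectation mirroring touchClean64.

#include <gtest/gtest.h>
#include <vector>

// Hypothetical typed fixture built on top of the existing ArrayManip fixture.
template <typename T>
class ArrayManipTyped : public ArrayManip {};

using TouchTypes = ::testing::Types<long /*, double on capable archs */>;
TYPED_TEST_SUITE(ArrayManipTyped, TouchTypes);

TYPED_TEST(ArrayManipTyped, touchClean64Typed) {
  const int N = 100;
  auto* arr = (TypeParam*)this->device->api->allocGlobMem(N * sizeof(TypeParam));
  this->device->algorithms.touchMemory(arr, N, true,
                                       this->device->api->getDefaultStream());

  std::vector<TypeParam> hostVector(N, TypeParam{1});
  this->device->api->copyFromAsync(&hostVector[0], arr, N * sizeof(TypeParam),
                                   this->device->api->getDefaultStream());
  this->device->api->syncDefaultStreamWithHost();

  // a clean touch is expected to zero the buffer (mirrors touchClean64 above)
  for (const auto& entry : hostVector) {
    EXPECT_EQ(entry, TypeParam{0});
  }
  this->device->api->freeGlobMem(arr);
}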