diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 9fd1045..bdd29f5 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -13,81 +13,95 @@ jobs:
   device-build-test:
     name: device-build-test
     runs-on: ${{ matrix.setup.runner }}
-    container: ${{ matrix.setup.container }}
+    container:
+      image: ${{ matrix.setup.container }}
+      options: ${{ matrix.setup.container-options || ' ' }}
     strategy:
       fail-fast: false
       matrix:
         arch:
-          - snb # <-- needed for the self-hosted CI node for now :/
+          - hsw
         build_type:
           - Release
           - Debug
         setup:
-          - arch: sm_60
+          - arch: sm_86
             backend: cuda
             cc: gcc-13
             cxx: g++-13
             fc: gfortran-13
-            container: seissol/gha-gpu-nv:davschneller-gpu-image
-            runner: ubuntu-24.04
+            container: seissol/gha-gpu-nv:davschneller-ci-merge
+            runner: self-hosted
+            container-options: --runtime=nvidia --gpus=all
             pythonbreak: true
-          - arch: sm_60
+            test: true
+          - arch: sm_86
             backend: acpp
             cc: gcc-13
             cxx: g++-13
             fc: gfortran-13
-            container: seissol/gha-gpu-nv:davschneller-gpu-image
-            runner: ubuntu-24.04
+            container: seissol/gha-gpu-nv:davschneller-ci-merge
+            runner: self-hosted
+            container-options: --runtime=nvidia --gpus=all
             pythonbreak: true
-          - arch: sm_60
+            test: true
+          - arch: sm_86
             backend: cuda
             cc: clang-18
             cxx: clang++-18
             fc: gfortran-13 # TODO?
-            container: seissol/gha-gpu-nv:davschneller-gpu-image
-            runner: ubuntu-24.04
+            container: seissol/gha-gpu-nv:davschneller-ci-merge
+            runner: self-hosted
+            container-options: --runtime=nvidia --gpus=all
             pythonbreak: true
-          # TODO: needs a working GPU runner
-          #- arch: sm_60
-          #  backend: cuda
-          #  cc: nvc
-          #  cxx: nvc++
-          #  fc: nvfortran
-          #  container: seissol/gha-gpu-nvhpc:davschneller-gpu-image
-          #  runner: sccs-ci-nv-sm60
-          #  pythonbreak: true
+            test: true
+          - arch: sm_86
+            backend: cuda
+            cc: nvc
+            cxx: nvc++
+            fc: nvfortran
+            container: seissol/gha-gpu-nvhpc:davschneller-ci-merge
+            runner: self-hosted
+            container-options: --runtime=nvidia --gpus=all
+            pythonbreak: true
+            test: true
           - arch: gfx906
             backend: hip
             cc: gcc-13
             cxx: g++-13
             fc: gfortran-13
-            container: seissol/gha-gpu-amd:davschneller-gpu-image
+            container: seissol/gha-gpu-amd:davschneller-ci-merge
             runner: ubuntu-24.04
             pythonbreak: true
+            test: false
           - arch: gfx906
             backend: acpp
             cc: gcc-13
             cxx: g++-13
             fc: gfortran-13
-            container: seissol/gha-gpu-amd:davschneller-gpu-image
+            container: seissol/gha-gpu-amd:davschneller-ci-merge
             runner: ubuntu-24.04
             pythonbreak: true
+            test: false
           - arch: gfx906
             backend: hip
             cc: clang-18
             cxx: clang++-18
             fc: gfortran-13 # TODO?
-            container: seissol/gha-gpu-amd:davschneller-gpu-image
+            container: seissol/gha-gpu-amd:davschneller-ci-merge
             runner: ubuntu-24.04
             pythonbreak: true
+            test: false
           - arch: skl
             backend: oneapi
             cc: icx
             cxx: icpx
             fc: ifx
             container: seissol/gha-gpu-intel:davschneller-gpu-image
-            runner: ubuntu-24.04
+            container-options: --device /dev/dri
+            runner: self-hosted
             pythonbreak: false
+            test: true
     steps:
       - name: install-gtest
         run: |
@@ -98,7 +112,7 @@ jobs:
           cd ../..

       - name: checkout-device
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
         with:
           submodules: recursive
@@ -119,5 +133,18 @@ jobs:
           export CXX=${{matrix.setup.cxx}}
           export FC=${{matrix.setup.fc}}

-          cmake .. -GNinja -DDEVICE_BACKEND=${{matrix.setup.backend}} -DSM=${{matrix.setup.arch}}
+          cmake .. -GNinja \
+            -DDEVICE_BACKEND=${{matrix.setup.backend}} \
+            -DSM=${{matrix.setup.arch}} \
+            -DCMAKE_BUILD_TYPE=${{matrix.build_type}}
+          ninja
+
+      - id: test
+        name: test-device
+        if: ${{matrix.setup.test}}
+        run: |
+          cd tests
+          cd build
+
+          ./tests
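The matrix entries now carry a per-entry `test:` flag: the CUDA and oneAPI setups run on self-hosted runners with the GPU passed through to the container (`--runtime=nvidia --gpus=all`, `--device /dev/dri`), while the gfx906 entries stay build-only (`test: false`) because the hosted runners have no AMD GPU; the `|| ' '` expression supplies a blank `options:` string for entries that define no `container-options`. A minimal sketch of the same gating done at runtime instead, via the CUDA runtime API (illustration only, not part of the patch):

```cpp
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  int count = 0;
  // cudaGetDeviceCount reports 0 (or an error) when no usable GPU is visible,
  // e.g. when the container runs without --runtime=nvidia --gpus=all.
  const cudaError_t err = cudaGetDeviceCount(&count);
  if (err != cudaSuccess || count == 0) {
    std::printf("no GPU visible, skipping device tests\n");
    return 0; // plays the role of test: false
  }
  std::printf("%d GPU(s) visible, running device tests\n", count);
  return 0;
}
```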
diff --git a/algorithms/cudahip/ArrayManip.cpp b/algorithms/cudahip/ArrayManip.cpp
index ea3024f..91820ab 100644
--- a/algorithms/cudahip/ArrayManip.cpp
+++ b/algorithms/cudahip/ArrayManip.cpp
@@ -80,7 +80,7 @@ template void
 //--------------------------------------------------------------------------------------------------
 __global__ void kernel_touchMemory(void* ptr, size_t size, bool clean) {
-  int id = threadIdx.x + blockIdx.x * blockDim.x;
+  const int id = threadIdx.x + blockIdx.x * blockDim.x;
   if (clean) {
     imemset(ptr, size, id, blockDim.x * gridDim.x);
   } else {
diff --git a/algorithms/cudahip/BatchManip.cpp b/algorithms/cudahip/BatchManip.cpp
index 55fba3a..9ce8b04 100644
--- a/algorithms/cudahip/BatchManip.cpp
+++ b/algorithms/cudahip/BatchManip.cpp
@@ -131,6 +131,13 @@ template void Algorithms::setToValue(
     int** out, int value, size_t elementSize, size_t numElements, void* streamPtr);
 template void Algorithms::setToValue(
     unsigned** out, unsigned value, size_t elementSize, size_t numElements, void* streamPtr);
+template void Algorithms::setToValue(
+    long** out, long value, size_t elementSize, size_t numElements, void* streamPtr);
+template void Algorithms::setToValue(unsigned long** out,
+                                     unsigned long value,
+                                     size_t elementSize,
+                                     size_t numElements,
+                                     void* streamPtr);
 template void Algorithms::setToValue(
     char** out, char value, size_t elementSize, size_t numElements, void* streamPtr);
diff --git a/algorithms/cudahip/Reduction.cpp b/algorithms/cudahip/Reduction.cpp
index f8342c4..45e8873 100644
--- a/algorithms/cudahip/Reduction.cpp
+++ b/algorithms/cudahip/Reduction.cpp
@@ -56,20 +56,24 @@ __launch_bounds__(1024) void __global__ kernel_reduce(
   const auto threadInWarp = threadIdx.x % warpSize;
   const auto warpsNeeded = (size + warpSize - 1) / warpSize;

+  auto value = operation.defaultValue;
   auto acc = operation.defaultValue;
 #pragma unroll 4
   for (std::size_t i = currentWarp; i < warpsNeeded; i += warpCount) {
     const auto id = threadInWarp + i * warpSize;
-    auto value = (id < size) ? static_cast<AccT>(ntload(&vector[id])) : operation.defaultValue;
+    const auto valueNew =
+        (id < size) ? static_cast<AccT>(ntload(&vector[id])) : operation.defaultValue;

-    for (int offset = 1; offset < warpSize; offset *= 2) {
-      value = operation(value, shuffledown(value, offset));
-    }
+    value = operation(value, valueNew);
+  }

-    acc = operation(acc, value);
+  for (int offset = 1; offset < warpSize; offset *= 2) {
+    value = operation(value, shuffledown(value, offset));
   }

+  acc = operation(acc, value);
+
   if (threadInWarp == 0) {
     shmem[currentWarp] = acc;
   }
@@ -78,19 +82,24 @@ __launch_bounds__(1024) void __global__ kernel_reduce(

   if (currentWarp == 0) {
     const auto lastWarpsNeeded = (warpCount + warpSize - 1) / warpSize;
+
+    auto value = operation.defaultValue;
     auto lastAcc = operation.defaultValue;
+
 #pragma unroll 2
     for (int i = 0; i < lastWarpsNeeded; ++i) {
       const auto id = threadInWarp + i * warpSize;
-      auto value = (id < warpCount) ? shmem[id] : operation.defaultValue;
+      const auto valueNew = (id < warpCount) ? shmem[id] : operation.defaultValue;

-      for (int offset = 1; offset < warpSize; offset *= 2) {
-        value = operation(value, shuffledown(value, offset));
-      }
+      value = operation(value, valueNew);
+    }

-      lastAcc = operation(lastAcc, value);
+    for (int offset = 1; offset < warpSize; offset *= 2) {
+      value = operation(value, shuffledown(value, offset));
     }

+    lastAcc = operation(lastAcc, value);
+
     if (threadIdx.x == 0) {
       if (overrideResult) {
         ntstore(result, lastAcc);
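The reduction rework changes the loop shape: previously every strided load was followed by a full warp shuffle tree; now each thread first folds all of its loads into a private accumulator, and a single shuffle tree runs once at the end, saving log2(warpSize) shuffles per loop iteration. A host-side sketch of the same two-phase structure (names hypothetical; the real kernel uses `shuffledown` over warp lanes):

```cpp
// Phase 1: each lane serially folds its strided elements into a private value.
// Phase 2: one log2(warpSize)-step tree combine, mimicking shuffledown.
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  constexpr int warpSize = 32;
  std::vector<int> data(1000, 1);

  std::vector<long> lane(warpSize, 0); // stands in for per-thread registers
  for (std::size_t i = 0; i < data.size(); ++i) {
    lane[i % warpSize] += data[i]; // phase 1: fold, no shuffles here anymore
  }

  // Phase 2: tree combine; lane[t] absorbs lane[t + offset] each round.
  for (int offset = 1; offset < warpSize; offset *= 2) {
    for (int t = 0; t + offset < warpSize; ++t) {
      lane[t] += lane[t + offset];
    }
  }
  std::printf("sum = %ld (expected %zu)\n", lane[0], data.size());
}
```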
diff --git a/algorithms/sycl/BatchManip.cpp b/algorithms/sycl/BatchManip.cpp
index ddc97cf..0aab2b3 100644
--- a/algorithms/sycl/BatchManip.cpp
+++ b/algorithms/sycl/BatchManip.cpp
@@ -118,6 +118,13 @@ template void Algorithms::setToValue(
     int** out, int value, size_t elementSize, size_t numElements, void* streamPtr);
 template void Algorithms::setToValue(
     unsigned** out, unsigned value, size_t elementSize, size_t numElements, void* streamPtr);
+template void Algorithms::setToValue(
+    long** out, long value, size_t elementSize, size_t numElements, void* streamPtr);
+template void Algorithms::setToValue(unsigned long** out,
+                                     unsigned long value,
+                                     size_t elementSize,
+                                     size_t numElements,
+                                     void* streamPtr);
 template void Algorithms::setToValue(
     char** out, char value, size_t elementSize, size_t numElements, void* streamPtr);
diff --git a/algorithms/sycl/Reduction.cpp b/algorithms/sycl/Reduction.cpp
index 21116de..e0d5d52 100644
--- a/algorithms/sycl/Reduction.cpp
+++ b/algorithms/sycl/Reduction.cpp
@@ -93,8 +93,7 @@ void launchReduction(AccT* result,
       (size + (workGroupSize * itemsPerWorkItem) - 1) / (workGroupSize * itemsPerWorkItem);

   cgh.parallel_for(
-      sycl::nd_range<1>{numWorkGroups * itemsPerWorkItem, workGroupSize},
-      [=](sycl::nd_item<1> idx) {
+      sycl::nd_range<1>{numWorkGroups * workGroupSize, workGroupSize}, [=](sycl::nd_item<1> idx) {
         const auto localId = idx.get_local_id(0);
         const auto groupId = idx.get_group(0);
diff --git a/interfaces/sycl/Internals.h b/interfaces/sycl/Internals.h
index 82230b0..ce14865 100644
--- a/interfaces/sycl/Internals.h
+++ b/interfaces/sycl/Internals.h
@@ -12,7 +12,7 @@
 #include

 namespace device::internals {
-constexpr static int DefaultBlockDim = 1024;
+constexpr static int DefaultBlockDim = 256;

 template <typename T> void waitCheck(T&& result) {
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index ab5d521..7369f47 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -22,7 +22,7 @@
 add_subdirectory(.. root)

 find_package(GTest REQUIRED)
-add_executable(tests main.cpp reductions.cpp array_manip.cpp memory.cpp batch_manip.cpp)
+add_executable(tests main.cpp reductions.cpp memory.cpp array_manip.cpp batch_manip.cpp)

 target_link_libraries(tests PRIVATE device ${GTEST_BOTH_LIBRARIES})
 target_include_directories(tests PRIVATE ${GTEST_INCLUDE_DIR})
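The SYCL fix corrects the global size of the `nd_range`: it must be `numWorkGroups * workGroupSize` (a multiple of the local size), not `numWorkGroups * itemsPerWorkItem`; each work item already covers `itemsPerWorkItem` elements inside the kernel. The sizing arithmetic as a standalone check (values illustrative):

```cpp
#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t size = 100000;       // elements to reduce
  const std::size_t workGroupSize = 256; // matches the new DefaultBlockDim
  const std::size_t itemsPerWorkItem = 4;

  // Each group covers workGroupSize * itemsPerWorkItem elements.
  const std::size_t perGroup = workGroupSize * itemsPerWorkItem;
  const std::size_t numWorkGroups = (size + perGroup - 1) / perGroup;

  // Fixed: global size is groups * local size, a valid nd_range pairing.
  const std::size_t globalSize = numWorkGroups * workGroupSize;
  std::printf("groups=%zu global=%zu (multiple of local: %s)\n",
              numWorkGroups, globalSize,
              globalSize % workGroupSize == 0 ? "yes" : "no");
}
```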
diff --git a/tests/array_manip.cpp b/tests/array_manip.cpp
index ba05977..71df856 100644
--- a/tests/array_manip.cpp
+++ b/tests/array_manip.cpp
@@ -73,15 +73,17 @@ TEST_F(ArrayManip, touchNoClean32) {
   device->api->freeGlobMem(arr);
 }

+// avoid double here: some archs (including some in the CI) do not fully support it
+
 TEST_F(ArrayManip, touchClean64) {
   const int N = 100;
-  double* arr = (double*)device->api->allocGlobMem(N * sizeof(double));
+  long* arr = (long*)device->api->allocGlobMem(N * sizeof(long));

   device->algorithms.touchMemory(arr, N, true, device->api->getDefaultStream());

-  std::vector<double> hostVector(N, 1);
+  std::vector<long> hostVector(N, 1);
   device->api->copyFromAsync(
-      &hostVector[0], arr, N * sizeof(double), device->api->getDefaultStream());
+      &hostVector[0], arr, N * sizeof(long), device->api->getDefaultStream());

   device->api->syncDefaultStreamWithHost();
@@ -95,14 +97,13 @@ TEST_F(ArrayManip, touchClean64) {

 TEST_F(ArrayManip, touchNoClean64) {
   const int N = 100;
-  double* arr = (double*)device->api->allocGlobMem(N * sizeof(double));
-  std::vector<double> hostVector(N, 0);
+  long* arr = (long*)device->api->allocGlobMem(N * sizeof(long));
+  std::vector<long> hostVector(N, 0);

-  device->api->copyToAsync(
-      arr, &hostVector[0], N * sizeof(double), device->api->getDefaultStream());
+  device->api->copyToAsync(arr, &hostVector[0], N * sizeof(long), device->api->getDefaultStream());
   device->algorithms.touchMemory(arr, N, false, device->api->getDefaultStream());
   device->api->copyFromAsync(
-      &hostVector[0], arr, N * sizeof(double), device->api->getDefaultStream());
+      &hostVector[0], arr, N * sizeof(long), device->api->getDefaultStream());

   device->api->syncDefaultStreamWithHost();
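The switch from `double` to `long` keeps the tests at a 64-bit element type while avoiding FP64 arithmetic, which not all CI-visible devices support. A quick check of the size assumption (holds on LP64 platforms such as Linux x86-64/aarch64; on LLP64 systems `long` is 4 bytes and `long long` would be the stand-in):

```cpp
#include <cstdio>

int main() {
  // On LP64 platforms long matches double at 8 bytes, so the 64-bit
  // touch/copy paths are still exercised without requiring FP64 on the GPU.
  std::printf("sizeof(long)=%zu sizeof(double)=%zu\n",
              sizeof(long), sizeof(double));
}
```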
diff --git a/tests/batch_manip.cpp b/tests/batch_manip.cpp
index a84a719..54ad1b1 100644
--- a/tests/batch_manip.cpp
+++ b/tests/batch_manip.cpp
@@ -32,7 +32,7 @@ class BatchManip : public BaseTestSuite {
     std::forward<F>(inner)(batch, data);

     device->api->freeGlobMem(data);
-    device->api->freeGlobMem(batch);
+    device->api->freeUnifiedMem(batch);
   }
 };
@@ -166,14 +166,14 @@ TEST_F(BatchManip, uniformToScatter32) {
 TEST_F(BatchManip, fill64) {
   const int N = 100;
   const int M = 120;
-  testWrapper<double>(N, M, false, [&](double** batch, double* data) {
-    double scalar = 502;
+  testWrapper<long>(N, M, false, [&](long** batch, long* data) {
+    long scalar = 502;
     device->algorithms.setToValue(batch, scalar, M, N, device->api->getDefaultStream());

-    std::vector<double> hostVector(N * M, 0);
+    std::vector<long> hostVector(N * M, 0);
     device->api->copyFromAsync(
-        &hostVector[0], data, N * M * sizeof(double), device->api->getDefaultStream());
+        &hostVector[0], data, N * M * sizeof(long), device->api->getDefaultStream());

     device->api->syncDefaultStreamWithHost();
@@ -186,12 +186,12 @@ TEST_F(BatchManip, fill64) {
 TEST_F(BatchManip, touchClean64) {
   const int N = 100;
   const int M = 120;
-  testWrapper<double>(N, M, false, [&](double** batch, double* data) {
+  testWrapper<long>(N, M, false, [&](long** batch, long* data) {
     device->algorithms.touchBatchedMemory(batch, M, N, true, device->api->getDefaultStream());

-    std::vector<double> hostVector(N * M, 1);
+    std::vector<long> hostVector(N * M, 1);
     device->api->copyFromAsync(
-        &hostVector[0], data, M * N * sizeof(double), device->api->getDefaultStream());
+        &hostVector[0], data, M * N * sizeof(long), device->api->getDefaultStream());

     device->api->syncDefaultStreamWithHost();
@@ -200,14 +200,14 @@
     }
   });

-  testWrapper<double>(N, M, true, [&](double** batch, double* data) {
-    std::vector<double> hostVector(N * M, 1);
+  testWrapper<long>(N, M, true, [&](long** batch, long* data) {
+    std::vector<long> hostVector(N * M, 1);
     device->api->copyToAsync(
-        data, &hostVector[0], N * M * sizeof(double), device->api->getDefaultStream());
+        data, &hostVector[0], N * M * sizeof(long), device->api->getDefaultStream());
     device->algorithms.touchBatchedMemory(batch, M, N, true, device->api->getDefaultStream());
     device->api->copyFromAsync(
-        &hostVector[0], data, M * N * sizeof(double), device->api->getDefaultStream());
+        &hostVector[0], data, M * N * sizeof(long), device->api->getDefaultStream());

     device->api->syncDefaultStreamWithHost();
@@ -226,14 +226,14 @@ TEST_F(BatchManip, touchClean64) {
 TEST_F(BatchManip, touchNoClean64) {
   const int N = 100;
   const int M = 120;
-  testWrapper<double>(N, M, false, [&](double** batch, double* data) {
-    std::vector<double> hostVector(N * M, 1);
+  testWrapper<long>(N, M, false, [&](long** batch, long* data) {
+    std::vector<long> hostVector(N * M, 1);
     device->api->copyToAsync(
-        data, &hostVector[0], N * M * sizeof(double), device->api->getDefaultStream());
+        data, &hostVector[0], N * M * sizeof(long), device->api->getDefaultStream());
     device->algorithms.touchBatchedMemory(batch, M, N, false, device->api->getDefaultStream());
     device->api->copyFromAsync(
-        &hostVector[0], data, N * M * sizeof(double), device->api->getDefaultStream());
+        &hostVector[0], data, N * M * sizeof(long), device->api->getDefaultStream());

     device->api->syncDefaultStreamWithHost();
@@ -247,16 +247,16 @@ TEST_F(BatchManip, scatterToUniform64) {
   const int N = 100;
   const int M = 120;

-  double* data2 = (double*)device->api->allocGlobMem(N * M * sizeof(double));
-  testWrapper<double>(N, M, false, [&](double** batch, double* data) {
-    std::vector<double> hostVector(N * M, 1);
+  long* data2 = (long*)device->api->allocGlobMem(N * M * sizeof(long));
+  testWrapper<long>(N, M, false, [&](long** batch, long* data) {
+    std::vector<long> hostVector(N * M, 1);
     device->api->copyToAsync(
-        data, &hostVector[0], N * M * sizeof(double), device->api->getDefaultStream());
+        data, &hostVector[0], N * M * sizeof(long), device->api->getDefaultStream());
     device->algorithms.copyScatterToUniform(
-        const_cast<const double**>(batch), data2, M, M, N, device->api->getDefaultStream());
+        const_cast<const long**>(batch), data2, M, M, N, device->api->getDefaultStream());
     device->api->copyFromAsync(
-        &hostVector[0], data2, N * M * sizeof(double), device->api->getDefaultStream());
+        &hostVector[0], data2, N * M * sizeof(long), device->api->getDefaultStream());

     device->api->syncDefaultStreamWithHost();
@@ -271,15 +271,15 @@ TEST_F(BatchManip, uniformToScatter64) {
   const int N = 100;
   const int M = 120;

-  double* data2 = (double*)device->api->allocGlobMem(N * M * sizeof(double));
-  testWrapper<double>(N, M, false, [&](double** batch, double* data) {
-    std::vector<double> hostVector(N * M, 1);
+  long* data2 = (long*)device->api->allocGlobMem(N * M * sizeof(long));
+  testWrapper<long>(N, M, false, [&](long** batch, long* data) {
+    std::vector<long> hostVector(N * M, 1);
     device->api->copyToAsync(
-        data2, &hostVector[0], N * M * sizeof(double), device->api->getDefaultStream());
+        data2, &hostVector[0], N * M * sizeof(long), device->api->getDefaultStream());
     device->algorithms.copyUniformToScatter(data2, batch, M, M, N, device->api->getDefaultStream());
     device->api->copyFromAsync(
-        &hostVector[0], data, N * M * sizeof(double), device->api->getDefaultStream());
+        &hostVector[0], data, N * M * sizeof(long), device->api->getDefaultStream());

     device->api->syncDefaultStreamWithHost();
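The one-line fix in the test wrapper pairs `batch` with the unified-memory deallocator (it is presumably allocated via the unified allocator so the host can fill the pointer array); freeing it through `freeGlobMem` mixed allocator families. A small RAII sketch (hypothetical stand-ins, not the repo API) that binds the deleter at allocation time so the pairing cannot drift:

```cpp
#include <cstdio>
#include <cstdlib>
#include <memory>

// Stand-ins for the two allocator families; the real calls live behind
// device->api. The point: each pointer must return through its own family.
void* allocGlobMem(std::size_t n) { return std::malloc(n); }
void freeGlobMem(void* p) { std::free(p); }
void* allocUnifiedMem(std::size_t n) { return std::malloc(n); }
void freeUnifiedMem(void* p) { std::free(p); }

// unique_ptr carries the matching deleter from the moment of allocation.
using DevicePtr = std::unique_ptr<void, void (*)(void*)>;

int main() {
  DevicePtr batch{allocUnifiedMem(100 * sizeof(long*)), freeUnifiedMem};
  DevicePtr data{allocGlobMem(100 * 120 * sizeof(long)), freeGlobMem};
  std::printf("each pointer is freed by the allocator family it came from\n");
} // deleters run automatically, mismatch is impossible by construction
```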
diff --git a/tests/main.cpp b/tests/main.cpp
index 3686780..cfba087 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -14,5 +14,10 @@ int main(int argc, char** argv) {
   device.api->setDevice(0);
   device.api->initialize();

-  return RUN_ALL_TESTS();
+  const auto result = RUN_ALL_TESTS();
+
+  device.api->syncDevice();
+  device.api->finalize();
+
+  return result;
 }
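`main()` now captures the `RUN_ALL_TESTS()` result, drains the device, and finalizes it before returning; returning directly skipped device teardown entirely. The same ordering expressed as a scope guard, so teardown also runs on early-exit paths (hypothetical `Api` type; the patch sequences the calls manually):

```cpp
#include <cstdio>

struct Api {
  void syncDevice() { std::printf("syncDevice\n"); }
  void finalize() { std::printf("finalize\n"); }
};

struct Teardown {
  Api& api;
  ~Teardown() {
    api.syncDevice(); // drain outstanding device work first
    api.finalize();   // then release the device/runtime state
  }
};

int runAllTests() { return 0; } // stand-in for gtest's RUN_ALL_TESTS()

int main() {
  Api api;
  Teardown guard{api};
  const int result = runAllTests();
  return result; // guard's destructor fires after result is fixed, on every path
}
```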