diff --git a/ci/blackbox.sh b/ci/blackbox.sh
index e89e0b40a2..8a771ea445 100755
--- a/ci/blackbox.sh
+++ b/ci/blackbox.sh
@@ -19,7 +19,7 @@ ROOT_DIR=$SCRIPT_DIR/..
 show_usage()
 {
     echo "Vortex BlackBox Test Driver v1.0"
-    echo "Usage: $0 [[--clusters=#n] [--cores=#n] [--warps=#n] [--threads=#n] [--l2cache] [--l3cache] [[--driver=#name] [--app=#app] [--args=#args] [--debug=#level] [--scope] [--perf=#class] [--log=logfile] [--nohup] [--help]]"
+    echo "Usage: $0 [[--clusters=#n] [--cores=#n] [--warps=#n] [--threads=#n] [--l2cache] [--l3cache] [[--driver=#name] [--app=#app] [--args=#args] [--debug=#level] [--np=#num of MPI processes] [--scope] [--perf=#class] [--log=logfile] [--nohup] [--help]]"
 }
 
 show_help()
@@ -47,6 +47,7 @@ DEFAULTS() {
     DEBUG_LEVEL=0
     SCOPE=0
     HAS_ARGS=0
+    HAS_NP=0
     PERF_CLASS=0
     CONFIGS="$CONFIGS"
     TEMPBUILD=0
@@ -69,6 +70,7 @@ parse_args() {
             --debug=*) DEBUG=1; DEBUG_LEVEL=${i#*=} ;;
             --scope) SCOPE=1; ;;
             --args=*) HAS_ARGS=1; ARGS=${i#*=} ;;
+            --np=*) HAS_NP=1; NP=${i#*=} ;;
             --log=*) LOGFILE=${i#*=} ;;
             --nohup) TEMPBUILD=1 ;;
             --help) show_help; exit 0 ;;
@@ -123,6 +125,7 @@ run_app() {
     [ $DEBUG -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "DEBUG=1")
     [ $TEMPBUILD -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "VORTEX_RT_PATH=\"$TEMPDIR\"")
     [ $HAS_ARGS -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "OPTS=\"$ARGS\"")
+    [ $HAS_NP -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "NP=$NP")
     cmd_opts=$(add_option "$cmd_opts" "make -C \"$APP_PATH\" run-$DRIVER")
     [ $DEBUG -ne 0 ] && cmd_opts=$(add_option "$cmd_opts" "> $LOGFILE 2>&1")
     echo "Running: $cmd_opts"
diff --git a/miscs/apptainer/vortex.def b/miscs/apptainer/vortex.def
index cd32138928..ecb57cce29 100644
--- a/miscs/apptainer/vortex.def
+++ b/miscs/apptainer/vortex.def
@@ -45,7 +45,8 @@ From: ubuntu:22.04
     openjdk-11-jre-zero libtheora0 libavcodec58 libcairo-gobject2 \
     ca-certificates-java libchromaprint1 software-properties-common perl-modules bzip2 \
     unzip zlib1g-dev libtinfo5 g++ usbutils pciutils gawk bison gcc make tar python3.9 locales zstd uuid-dev ccache \
-    libboost-filesystem1.74.0 libboost-program-options1.74.0 libboost-system1.74.0 libboost-chrono1.74.0 libboost-thread1.74.0 environment-modules || true
+    libboost-filesystem1.74.0 libboost-program-options1.74.0 libboost-system1.74.0 libboost-chrono1.74.0 libboost-thread1.74.0 \
+    environment-modules openmpi-bin libopenmpi-dev || true
 
     ln -s /usr/bin/python3 /usr/bin/python
 
diff --git a/tests/regression/Makefile b/tests/regression/Makefile
index be3ccc9636..b9f090e875 100644
--- a/tests/regression/Makefile
+++ b/tests/regression/Makefile
@@ -14,6 +14,7 @@ all:
 	$(MAKE) -C sort
 	$(MAKE) -C fence
 	$(MAKE) -C vecadd
+	$(MAKE) -C mpi_vecadd
 	$(MAKE) -C sgemm
 	$(MAKE) -C conv3
 	$(MAKE) -C relu
@@ -35,6 +36,7 @@ run-simx:
 	$(MAKE) -C sort run-simx
 	$(MAKE) -C fence run-simx
 	$(MAKE) -C vecadd run-simx
+	$(MAKE) -C mpi_vecadd run-simx
 	$(MAKE) -C sgemm run-simx
 	$(MAKE) -C conv3 run-simx
 	$(MAKE) -C relu run-simx
@@ -77,6 +79,7 @@ clean:
 	$(MAKE) -C sort clean
 	$(MAKE) -C fence clean
 	$(MAKE) -C vecadd clean
+	$(MAKE) -C mpi_vecadd clean
 	$(MAKE) -C sgemm clean
 	$(MAKE) -C conv3 clean
 	$(MAKE) -C relu clean
diff --git a/tests/regression/common.mk b/tests/regression/common.mk
index dea17512b6..ae8cc9bae2 100644
--- a/tests/regression/common.mk
+++ b/tests/regression/common.mk
@@ -65,6 +65,16 @@ CXXFLAGS += $(CONFIGS)
 
 LDFLAGS += -L$(VORTEX_RT_PATH) -lvortex
 
+# Optional MPI launcher prefix for run-simx. `ifdef` only tests that MPI is
+# non-empty, so any value (including 0) enables it; otherwise MPIRUN is empty.
+ifdef MPI
+  # Default to one process so that "-np" always receives an argument.
+  NP ?= 1
+  MPIRUN = mpirun --allow-run-as-root --oversubscribe -np $(NP)
+else
+  MPIRUN =
+endif
+
 # Debugging
 ifdef DEBUG
 	CXXFLAGS += -g -O0
@@ -99,7 +109,7 @@ $(PROJECT): $(SRCS)
 	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
 
 run-simx: $(PROJECT) kernel.vxbin
-	LD_LIBRARY_PATH=$(VORTEX_RT_PATH):$(LD_LIBRARY_PATH) VORTEX_DRIVER=simx ./$(PROJECT) $(OPTS)
+	LD_LIBRARY_PATH=$(VORTEX_RT_PATH):$(LD_LIBRARY_PATH) VORTEX_DRIVER=simx $(MPIRUN) ./$(PROJECT) $(OPTS)
 
 run-rtlsim: $(PROJECT) kernel.vxbin
 	LD_LIBRARY_PATH=$(VORTEX_RT_PATH):$(LD_LIBRARY_PATH) VORTEX_DRIVER=rtlsim ./$(PROJECT) $(OPTS)
diff 
--git a/tests/regression/mpi_vecadd/Makefile b/tests/regression/mpi_vecadd/Makefile
new file mode 100644
index 0000000000..7fb9732810
--- /dev/null
+++ b/tests/regression/mpi_vecadd/Makefile
@@ -0,0 +1,24 @@
+ROOT_DIR := $(realpath ../../..)
+include $(ROOT_DIR)/config.mk
+
+PROJECT := mpi_vecadd
+
+SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT)
+
+SRCS := $(SRC_DIR)/main.cpp
+
+VX_SRCS := $(SRC_DIR)/kernel.cpp
+
+OPTS ?= -n64
+
+# This test always builds and runs with MPI: main.cpp calls the MPI API
+# unconditionally, and `ifdef` only tests that MPI is non-empty, so a default
+# of 0 would enable MPI as well. Default to 1 so the value matches reality.
+MPI ?= 1
+NP ?= 1
+
+ifdef MPI
+  CXX = mpic++
+endif
+
+include ../common.mk
diff --git a/tests/regression/mpi_vecadd/README.md b/tests/regression/mpi_vecadd/README.md
new file mode 100644
index 0000000000..c03a26787e
--- /dev/null
+++ b/tests/regression/mpi_vecadd/README.md
@@ -0,0 +1,91 @@
+## MPI With SIMX
+
+
+### Usage
+
+```
+Apptainer> ./ci/blackbox.sh --cores=2 --app=mpi_vecadd --driver=simx --np=4 --args="-n5000"
+CONFIGS=-DNUM_CORES=2
+Running: CONFIGS="-DNUM_CORES=2" make -C ./ci/../runtime/simx > /dev/null
+Running: OPTS="-n5000" NP=4 make -C "./ci/../tests/regression/mpi_vecadd" run-simx
+make: Entering directory '/home/vortex/build/tests/regression/mpi_vecadd'
+LD_LIBRARY_PATH=/home/vortex/build/runtime:/opt/boost-1.66/lib:/opt/openssl-1.1/lib::/.singularity.d/libs VORTEX_DRIVER=simx mpirun --allow-run-as-root --oversubscribe -np 4 ./mpi_vecadd -n5000
+rank = 3, world_size = 4
+rank = 0, world_size = 4
+rank = 1, world_size = 4
+rank = 2, world_size = 4
+Rank: 3- Upload kernel binary
+Rank: 0- Upload kernel binary
+Rank: 1- Upload kernel binary
+Rank: 2- Upload kernel binary
+PERF: core0: instrs=22440, cycles=59003, IPC=0.380320
+PERF: core1: instrs=22440, cycles=58635, IPC=0.382707
+PERF: instrs=44880, cycles=59003, IPC=0.760639
+PERF: core0: instrs=22440, cycles=59003, IPC=0.380320
+PERF: core1: instrs=22440, cycles=58635, IPC=0.382707
+PERF: instrs=44880, cycles=59003, IPC=0.760639
+PERF: core0: instrs=22440, cycles=59003, IPC=0.380320
+PERF: core1: instrs=22440, cycles=58635, 
IPC=0.382707
+PERF: instrs=44880, cycles=59003, IPC=0.760639
+PASSED!
+PERF: core0: instrs=22440, cycles=59003, IPC=0.380320
+PERF: core1: instrs=22440, cycles=58635, IPC=0.382707
+PERF: instrs=44880, cycles=59003, IPC=0.760639
+make: Leaving directory '/home/vortex/build/tests/regression/mpi_vecadd'
+Apptainer> 
+```
+
+
+### High-Level Summary of main.cpp
+
+#### MPI Setup
+
+Calls MPI_Init, then queries the rank (MPI_Comm_rank) and the world size (MPI_Comm_size).
+
+Each MPI rank prints its rank and the total world_size.
+
+#### Argument Parsing
+
+Reads -n (the number of elements in the vector) from the command line.
+
+Rank 0 parses this value, then broadcasts it to all ranks with MPI_Bcast(&size, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD).
+
+This ensures every rank sees the same problem size.
+
+#### Data Partitioning
+
+Total work = size elements.
+
+Each rank computes its chunk:
+
+```
+  // Compute local chunk
+  uint32_t chunk = (size + world_size - 1) / world_size; // ceil div
+  uint32_t start = rank * chunk;
+  uint32_t end = std::min(start + chunk, size);
+  uint32_t num_points = end - start;
+```
+
+
+For example, with size=50 and np=8 the chunk is 7, so ranks 0-6 each get 7 elements and rank 7 gets the remaining 1. Rank 0 generates the full input vectors and distributes the chunks with MPI_Scatterv.
+
+#### Kernel Upload + Execution
+
+Each rank loads the Vortex kernel binary (kernel.vxbin by default) into its own Vortex instance.
+
+That's why you see "Upload kernel binary" printed for every rank, not just once.
+
+Then each rank launches the kernel for its assigned portion of the data.
+
+#### Performance Reporting
+
+After the kernel finishes, each rank prints Vortex perf stats (instrs, cycles, IPC).
+
+These numbers come from each rank's own Vortex instance; they are not aggregated across ranks.
+
+
+#### Verification
+
+The per-rank results are collected on rank 0 with MPI_Gatherv, and rank 0 alone checks that the vector addition is correct; this is why PASSED! is printed only once.
+
+Finally, every rank releases its device resources and finalizes MPI (MPI_Finalize).
\ No newline at end of file
diff --git a/tests/regression/mpi_vecadd/common.h b/tests/regression/mpi_vecadd/common.h
new file mode 100644
index 0000000000..b511332c11
--- /dev/null
+++ b/tests/regression/mpi_vecadd/common.h
@@ -0,0 +1,18 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+#include <stdint.h>
+
+#ifndef TYPE
+#define TYPE float
+#endif
+
+// Argument block shared between the host program and the device kernel.
+typedef struct {
+  uint32_t num_points; // number of elements handled by one kernel launch
+  uint64_t src0_addr;  // address of the first input buffer
+  uint64_t src1_addr;  // address of the second input buffer
+  uint64_t dst_addr;   // address of the output buffer
+} kernel_arg_t;
+
+#endif
diff --git a/tests/regression/mpi_vecadd/kernel.cpp b/tests/regression/mpi_vecadd/kernel.cpp
new file mode 100644
index 0000000000..7774c970a7
--- /dev/null
+++ b/tests/regression/mpi_vecadd/kernel.cpp
@@ -0,0 +1,18 @@
+#include <vx_spawn.h>
+#include "common.h"
+
+// Per-work-item body: adds the element pair selected by blockIdx.x.
+void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
+	auto src0_ptr = reinterpret_cast<TYPE*>(arg->src0_addr);
+	auto src1_ptr = reinterpret_cast<TYPE*>(arg->src1_addr);
+	auto dst_ptr  = reinterpret_cast<TYPE*>(arg->dst_addr);
+
+	dst_ptr[blockIdx.x] = src0_ptr[blockIdx.x] + src1_ptr[blockIdx.x];
+}
+
+// Device entry point: reads the argument block pointer from the MSCRATCH CSR
+// and hands kernel_body to vx_spawn_threads with num_points as the 1-D extent.
+int main() {
+	kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
+	return vx_spawn_threads(1, &arg->num_points, nullptr, (vx_kernel_func_cb)kernel_body, arg);
+}
diff --git a/tests/regression/mpi_vecadd/main.cpp b/tests/regression/mpi_vecadd/main.cpp
new file mode 100644
index 0000000000..3bcd415192
--- /dev/null
+++ b/tests/regression/mpi_vecadd/main.cpp
@@ -0,0 +1,219 @@
+#include <mpi.h>
+#include <unistd.h>
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <vector>
+#include <vortex.h>
+#include "common.h"
+
+#define FLOAT_ULP 6
+
+// The scatter/gather calls below use MPI_FLOAT, so TYPE must be float-sized.
+static_assert(sizeof(TYPE) == sizeof(float), "MPI_FLOAT transfers require a 4-byte TYPE");
+
+// Evaluates a runtime call; on a non-zero status it prints the failing
+// expression, releases the device resources and aborts the whole MPI job.
+#define RT_CHECK(_expr)                                         \
+  do {                                                          \
+    int _ret = _expr;                                           \
+    if (0 == _ret)                                              \
+      break;                                                    \
+    printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);    \
+    cleanup();                                                  \
+    MPI_Abort(MPI_COMM_WORLD, -1);                              \
+  } while (false)
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Per-type helpers that generate test inputs and compare results.
+template <typename Type>
+class Comparator {};
+
+template <>
+class Comparator<int> {
+public:
+  static const char* type_str() { return "integer"; }
+  static int generate(uint32_t idx) { return rand(); }
+  // Exact comparison; only the first 100 mismatches are printed.
+  static bool compare(int a, int b, int index, int errors) {
+    if (a != b && errors < 100) {
+      printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);
+      return false;
+    }
+    return a == b;
+  }
+};
+
+template <>
+class Comparator<float> {
+public:
+  static const char* type_str() { return "float"; }
+  static float generate(uint32_t idx) { return static_cast<float>(rand()) / RAND_MAX; }
+  // Compares the raw bit patterns and tolerates a distance of FLOAT_ULP;
+  // only the first 100 mismatches are printed.
+  static bool compare(float a, float b, int index, int errors) {
+    union { float f; int i; } fa, fb;
+    fa.f = a; fb.f = b;
+    int d = std::abs(fa.i - fb.i);
+    if (d > FLOAT_ULP && errors < 100) {
+      printf("*** error: [%d] expected=%f, actual=%f\n", index, b, a);
+      return false;
+    }
+    return d <= FLOAT_ULP;
+  }
+};
+
+const char* kernel_file = "kernel.vxbin";
+uint32_t size = 16;
+
+vx_device_h device = nullptr;
+vx_buffer_h src0_buffer = nullptr;
+vx_buffer_h src1_buffer = nullptr;
+vx_buffer_h dst_buffer = nullptr;
+vx_buffer_h krnl_buffer = nullptr;
+vx_buffer_h args_buffer = nullptr;
+kernel_arg_t kernel_arg = {};
+
+// Frees every device buffer and closes the device, if one was opened.
+void cleanup() {
+  if (device) {
+    vx_mem_free(src0_buffer);
+    vx_mem_free(src1_buffer);
+    vx_mem_free(dst_buffer);
+    vx_mem_free(krnl_buffer);
+    vx_mem_free(args_buffer);
+    vx_dev_close(device);
+  }
+}
+
+// Parses -n (vector size), -k (kernel file) and -h. Called on every rank so
+// that -k is honoured everywhere; only rank 0 prints the usage text, and MPI
+// is finalized before exiting so the launcher does not see an abnormal exit.
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:k:h")) != -1) {
+    switch (c) {
+    case 'n': size = atoi(optarg); break;
+    case 'k': kernel_file = optarg; break;
+    default: { // 'h' or an unrecognized option
+      int rank = 0;
+      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+      if (rank == 0) std::cout << "Usage: [-k kernel] [-n size] [-h help]\n";
+      MPI_Finalize();
+      exit(c == 'h' ? 0 : -1);
+    }
+    }
+  }
+}
+
+int main(int argc, char* argv[]) {
+  MPI_Init(&argc, &argv);
+  int rank, world_size;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+
+  std::cout << "rank = " << rank << ", world_size = " << world_size << "\n";
+
+  // Every rank parses its own command line; the size is still broadcast from
+  // rank 0 so that all ranks are guaranteed to agree on it.
+  parse_args(argc, argv);
+  MPI_Bcast(&size, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD);
+
+  // Rank 0 generates full input arrays
+  std::vector<TYPE> full_src0, full_src1;
+  if (rank == 0) {
+    std::srand(50);
+    full_src0.resize(size);
+    full_src1.resize(size);
+    for (uint32_t i = 0; i < size; i++) {
+      full_src0[i] = Comparator<TYPE>::generate(i);
+      full_src1[i] = Comparator<TYPE>::generate(i);
+    }
+  }
+
+  // Split [0, size) into world_size contiguous chunks of ceil(size/world_size)
+  // elements. Start offsets are clamped to size: when there are more ranks
+  // than chunks the trailing ranks get an empty range instead of letting the
+  // unsigned "end - start" wrap around.
+  const uint32_t num_ranks = static_cast<uint32_t>(world_size);
+  const uint32_t chunk = (size + num_ranks - 1) / num_ranks; // ceil div
+  std::vector<int> recvcounts(world_size), displs(world_size);
+  for (int i = 0; i < world_size; i++) {
+    uint32_t s = std::min(static_cast<uint32_t>(i) * chunk, size);
+    uint32_t e = std::min(s + chunk, size);
+    recvcounts[i] = static_cast<int>(e - s);
+    displs[i] = static_cast<int>(s);
+  }
+  const uint32_t num_points = static_cast<uint32_t>(recvcounts[rank]);
+
+  // Local buffers
+  std::vector<TYPE> h_src0(num_points);
+  std::vector<TYPE> h_src1(num_points);
+  std::vector<TYPE> h_dst(num_points);
+
+  // Scatter inputs
+  MPI_Scatterv(full_src0.data(), recvcounts.data(), displs.data(), MPI_FLOAT,
+               h_src0.data(), num_points, MPI_FLOAT, 0, MPI_COMM_WORLD);
+
+  MPI_Scatterv(full_src1.data(), recvcounts.data(), displs.data(), MPI_FLOAT,
+               h_src1.data(), num_points, MPI_FLOAT, 0, MPI_COMM_WORLD);
+
+  // Ranks with an empty chunk skip the device entirely (nothing to allocate
+  // or launch) but still take part in the collectives below.
+  if (num_points != 0) {
+    // Open device
+    RT_CHECK(vx_dev_open(&device));
+
+    uint32_t buf_size = num_points * sizeof(TYPE);
+    kernel_arg.num_points = num_points;
+
+    RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src0_buffer));
+    RT_CHECK(vx_mem_address(src0_buffer, &kernel_arg.src0_addr));
+    RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src1_buffer));
+    RT_CHECK(vx_mem_address(src1_buffer, &kernel_arg.src1_addr));
+    RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &dst_buffer));
+    RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
+
+    RT_CHECK(vx_copy_to_dev(src0_buffer, h_src0.data(), 0, buf_size));
+    RT_CHECK(vx_copy_to_dev(src1_buffer, h_src1.data(), 0, buf_size));
+
+    std::cout << "Rank: " << rank << "- Upload kernel binary" << std::endl;
+    RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
+    RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
+
+    // Run kernel
+    RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
+    RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
+    RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, buf_size));
+  }
+
+  // Gather results
+  std::vector<TYPE> full_dst;
+  if (rank == 0) full_dst.resize(size);
+
+  MPI_Gatherv(h_dst.data(), num_points, MPI_FLOAT,
+              full_dst.data(), recvcounts.data(), displs.data(), MPI_FLOAT,
+              0, MPI_COMM_WORLD);
+
+  // Verify (rank 0)
+  int errors = 0;
+  if (rank == 0) {
+    for (uint32_t i = 0; i < size; i++) {
+      auto ref = full_src0[i] + full_src1[i];
+      auto cur = full_dst[i];
+      if (!Comparator<TYPE>::compare(cur, ref, i, errors)) errors++;
+    }
+    if (errors) std::cout << "Found " << errors << " errors!\nFAILED!\n";
+    else std::cout << "PASSED!\n";
+  }
+
+  // Share the verdict so every rank exits with the same status.
+  MPI_Bcast(&errors, 1, MPI_INT, 0, MPI_COMM_WORLD);
+
+  cleanup();
+  MPI_Finalize();
+  return errors ? 1 : 0;
+}