First-time:
conda create --prefix $PSCRATCH/project -c nvidia nccl
Every-time:
# start up GPUs & navigate to directory
salloc --nodes 1 --qos interactive --time 01:00:00 --constraint gpu --gpus 4 --account m4999_g
cd ~/CS5470/async-ring-allreduce/rei
conda activate $PSCRATCH/project
# compile (TODO: when a new implementation is added, append its .cu file to the source list below)
# NOTE: for real benchmarking runs, compile with -DNDEBUG and -O2
nvcc -o benchmark \
src/benchmark.cu src/utils.cu \
src/nccl_ringreduce.cu src/naive_ringreduce.cu \
src/pipelined_ringreduce_async.cu src/pipelined_ringreduce_nccl.cu \
-I$PSCRATCH/project/include \
-L$PSCRATCH/project/lib \
-lnccl -lpthread
# run
LD_LIBRARY_PATH=$PSCRATCH/project/lib NCCL_DEBUG=WARN ./benchmark 4 output.csv