# Patch summary (free-text preamble; patch tools skip text before the first file header).
#
# Files touched, all under docker/roce-workload/:
#
#   Dockerfile
#     * Drops the three build-time RUN steps that created /root/.ssh/id_rsa,
#       appended id_rsa.pub to authorized_keys and chmod 600'd authorized_keys.
#     * Copies entrypoint.sh to /root/entrypoint.sh and adds it to the chmod +x list.
#     * DOCKERFILE_VER: "roce-workload-10dec" -> "roce-workload-01feb";
#       version label: "v1.0.0" -> "v1.0.2".
#     * ENTRYPOINT: ["sh", "-c", "service ssh restart && sleep infinity"]
#       -> ["/root/entrypoint.sh"].
#
#   docker-build.sh
#     * RCCL_DROP_TAG -> "rocm-7.0.2", ANP_DROP_TAG -> "tags/v1.2.0".
#     * Default for the third positional argument (base image) moves from
#       docker.io/rocm/dev-ubuntu-24.04:7.0 to ...:7.0.2.
#
#   entrypoint.sh (new file)
#     * Runs under `set -e`.
#     * If /root/.ssh/id_rsa does not exist: generates a 4096-bit RSA key with an
#       empty passphrase and appends the public key to /root/.ssh/authorized_keys.
#     * Starts /usr/sbin/sshd, then `exec sleep infinity` when called with no
#       arguments, otherwise `exec "$@"`.
#
#   run_rccl.sh
#     * Active `-x` settings after the patch: NCCL_NET_OPTIONAL_RECV_COMPLETION=0
#       (was 1), NCCL_GDR_FLUSH_DISABLE=1,
#       RCCL_GDR_FLUSH_GPU_MEM_NO_RELAXED_ORDERING=0, NCCL_IB_USE_INLINE=1,
#       IONIC_LOCKFREE=all, NCCL_NET_PLUGIN=librccl-anp.so (new),
#       NCCL_DEBUG=INFO (was VERSION).
#     * Every other previously active setting is kept only as a `##` comment,
#       alongside commented NCCL_MIN_NCHANNELS=1 / NCCL_MAX_NCHANNELS=1 entries.
#
# NOTE(review): the hunks below are flattened onto long lines in this copy;
#   the original line breaks must be restored before the patch can be applied.
# NOTE(review): the `chmod 600 /root/.ssh/authorized_keys` step removed from the
#   Dockerfile has no counterpart in entrypoint.sh -- confirm the mode the file
#   ends up with is acceptable to sshd in this image.
# NOTE(review): keys are now created per container at start-up instead of once at
#   image build; presumably multi-node runs must provide a shared /root/.ssh
#   themselves -- confirm with the deployment manifests.
# NOTE(review): because of `set -e`, a failing ssh-keygen or sshd start ends the
#   entrypoint before the final exec, so the requested command never runs.
# NOTE(review): NCCL_SOCKET_IFNAME=${MCA_IF} is now commented out -- verify that
#   interface selection still behaves as intended without it.
#
diff --git a/docker/roce-workload/Dockerfile b/docker/roce-workload/Dockerfile index 549bd5b6..49fadf8d 100644 --- a/docker/roce-workload/Dockerfile +++ b/docker/roce-workload/Dockerfile @@ -102,16 +102,14 @@ RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && mkdir /var/run/sshd -p RUN mkdir -p /root/.ssh && chmod 700 /root/.ssh -RUN ssh-keygen -t rsa -b 4096 -N '' -f /root/.ssh/id_rsa -RUN cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys -RUN chmod 600 /root/.ssh/authorized_keys RUN echo "PermitRootLogin yes" >> /etc/ssh/sshd_config # User Scripts WORKDIR /tmp -COPY run_rccl.sh /tmp/ COPY show_gid /usr/sbin/show_gid -RUN chmod +x /tmp/run_rccl.sh /usr/sbin/show_gid +COPY run_rccl.sh /tmp/ +COPY entrypoint.sh /root/entrypoint.sh +RUN chmod +x /tmp/run_rccl.sh /usr/sbin/show_gid /root/entrypoint.sh # ENV setup ENV LD_LIBRARY_PATH=$WORK_DIR/amd-anp/build:$WORK_DIR/rccl/build/release/build/lib:/opt/rocm/lib:/opt/rocm/lib64:$WORK_DIR/ompi/install/lib:$LD_LIBRARY_PATH @@ -128,7 +126,7 @@ ENV IMAGE_NAME=$IMAGE_NAME ENV IMAGE_VER=$IMAGE_VER ENV DRIVER_LABEL=$DRIVER_LABEL ENV DRIVERS_VERSION=$DRIVERS_VERSION -ENV DOCKERFILE_VER="roce-workload-10dec" +ENV DOCKERFILE_VER="roce-workload-01feb" LABEL image.version.name=$IMAGE_NAME LABEL image.version.bldver=$IMAGE_VER @@ -142,7 +140,7 @@ LABEL image.driver.version=$DRIVERS_VERSION LABEL base_image=$ROCM_BASE_IMAGE \ name="roce-workload" \ maintainer="sundaramurthy.gurunathan@amd.com" \ - version="v1.0.0" \ + version="v1.0.2" \ description="Create RoCE workload docker image with ROCm, RCCL, RCCL-Tests, AMD Network Plugin, AMD AINIC drivers and libraries" -ENTRYPOINT ["sh", "-c", "service ssh restart && sleep infinity"] +ENTRYPOINT ["/root/entrypoint.sh"] diff --git a/docker/roce-workload/docker-build.sh b/docker/roce-workload/docker-build.sh index 5fddcd50..2fcd43fc 100755 --- a/docker/roce-workload/docker-build.sh +++ b/docker/roce-workload/docker-build.sh @@ -9,10 +9,10 @@ 
AMD_GPUS="gfx90a;gfx942;gfx950" REPO_URL="https://repo.radeon.com" DRIVER_LABEL="noble" DRIVERS_VERSION="$2" -RCCL_DROP_TAG="drop/2025-06-J13A-1" -ANP_DROP_TAG="tags/v1.1.0-4D" +RCCL_DROP_TAG="rocm-7.0.2" +ANP_DROP_TAG="tags/v1.2.0" -rocm_base_image=${3:-docker.io/rocm/dev-ubuntu-24.04:7.0} +rocm_base_image=${3:-docker.io/rocm/dev-ubuntu-24.04:7.0.2} docker build \ --build-arg ROCM_BASE_IMAGE="${rocm_base_image}" \ diff --git a/docker/roce-workload/entrypoint.sh b/docker/roce-workload/entrypoint.sh new file mode 100644 index 00000000..730db9ba --- /dev/null +++ b/docker/roce-workload/entrypoint.sh @@ -0,0 +1,16 @@ +#!/bin/bash +set -e + +if [ ! -f /root/.ssh/id_rsa ]; then + ssh-keygen -t rsa -b 4096 -N "" -f /root/.ssh/id_rsa + cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys +fi + +### --v1.0.2-- Start SSH daemon ---- +/usr/sbin/sshd + +if [ "$#" -eq 0 ]; then + exec sleep infinity +fi + +exec "$@" diff --git a/docker/roce-workload/run_rccl.sh b/docker/roce-workload/run_rccl.sh index 15ee887e..55dce14d 100755 --- a/docker/roce-workload/run_rccl.sh +++ b/docker/roce-workload/run_rccl.sh @@ -12,24 +12,28 @@ export OMPI_DIR=/root/ompi/install export PERF_TEST_DIR=/root/rccl-tests/build RCCL_ENV="" -RCCL_ENV="$RCCL_ENV -x NCCL_DEBUG=VERSION" -RCCL_ENV="$RCCL_ENV -x HSA_NO_SCRATCH_RECLAIM=1" -RCCL_ENV="$RCCL_ENV -x IONIC_LOCKFREE=all" -RCCL_ENV="$RCCL_ENV -x NCCL_GDRCOPY_ENABLE=0" +RCCL_ENV="$RCCL_ENV -x NCCL_NET_OPTIONAL_RECV_COMPLETION=0" RCCL_ENV="$RCCL_ENV -x NCCL_GDR_FLUSH_DISABLE=1" -RCCL_ENV="$RCCL_ENV -x NCCL_GRAPH_DUMP_FILE=/tmp/graph_all.txt" -RCCL_ENV="$RCCL_ENV -x NCCL_IB_GID_INDEX=1" -RCCL_ENV="$RCCL_ENV -x NCCL_PXN_DISABLE=0" -RCCL_ENV="$RCCL_ENV -x NCCL_IB_QPS_PER_CONNECTION=1" -RCCL_ENV="$RCCL_ENV -x NCCL_IB_TC=96" -RCCL_ENV="$RCCL_ENV -x NCCL_IB_FIFO_TC=184" -RCCL_ENV="$RCCL_ENV -x NCCL_IGNORE_CPU_AFFINITY=1" -RCCL_ENV="$RCCL_ENV -x NCCL_IB_USE_INLINE=1" -RCCL_ENV="$RCCL_ENV -x NCCL_NET_OPTIONAL_RECV_COMPLETION=1" -RCCL_ENV="$RCCL_ENV -x 
NCCL_SOCKET_IFNAME=${MCA_IF}" -RCCL_ENV="$RCCL_ENV -x NCCL_TOPO_DUMP_FILE=/tmp/topo_all.txt" RCCL_ENV="$RCCL_ENV -x RCCL_GDR_FLUSH_GPU_MEM_NO_RELAXED_ORDERING=0" -RCCL_ENV="$RCCL_ENV -x UCX_UNIFIED_MODE=y" +RCCL_ENV="$RCCL_ENV -x NCCL_IB_USE_INLINE=1" +RCCL_ENV="$RCCL_ENV -x IONIC_LOCKFREE=all" +RCCL_ENV="$RCCL_ENV -x NCCL_NET_PLUGIN=librccl-anp.so" +RCCL_ENV="$RCCL_ENV -x NCCL_DEBUG=INFO" + +## RCCL_ENV="$RCCL_ENV -x NCCL_IB_TC=96" +## RCCL_ENV="$RCCL_ENV -x NCCL_IB_FIFO_TC=184" +## RCCL_ENV="$RCCL_ENV -x NCCL_IB_QPS_PER_CONNECTION=1" +## RCCL_ENV="$RCCL_ENV -x NCCL_TOPO_DUMP_FILE=/tmp/topo_all.txt" +## RCCL_ENV="$RCCL_ENV -x NCCL_SOCKET_IFNAME=${MCA_IF}" +## RCCL_ENV="$RCCL_ENV -x UCX_UNIFIED_MODE=y" +## RCCL_ENV="$RCCL_ENV -x NCCL_GRAPH_DUMP_FILE=/tmp/graph_all.txt" +## RCCL_ENV="$RCCL_ENV -x HSA_NO_SCRATCH_RECLAIM=1" +## RCCL_ENV="$RCCL_ENV -x NCCL_GDRCOPY_ENABLE=0" +## RCCL_ENV="$RCCL_ENV -x NCCL_IB_GID_INDEX=1" +## RCCL_ENV="$RCCL_ENV -x NCCL_PXN_DISABLE=0" +## RCCL_ENV="$RCCL_ENV -x NCCL_IGNORE_CPU_AFFINITY=1" +## RCCL_ENV="$RCCL_ENV -x NCCL_MIN_NCHANNELS=1" +## RCCL_ENV="$RCCL_ENV -x NCCL_MAX_NCHANNELS=1" # -- rccl run start --- mkdir -p /tmp/run_logs