Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 6 additions & 8 deletions docker/roce-workload/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -102,16 +102,14 @@ RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config &&
mkdir /var/run/sshd -p

RUN mkdir -p /root/.ssh && chmod 700 /root/.ssh
RUN ssh-keygen -t rsa -b 4096 -N '' -f /root/.ssh/id_rsa
RUN cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
RUN chmod 600 /root/.ssh/authorized_keys
RUN echo "PermitRootLogin yes" >> /etc/ssh/sshd_config

# User Scripts
WORKDIR /tmp
COPY run_rccl.sh /tmp/
COPY show_gid /usr/sbin/show_gid
RUN chmod +x /tmp/run_rccl.sh /usr/sbin/show_gid
COPY run_rccl.sh /tmp/
COPY entrypoint.sh /root/entrypoint.sh
RUN chmod +x /tmp/run_rccl.sh /usr/sbin/show_gid /root/entrypoint.sh

# ENV setup
ENV LD_LIBRARY_PATH=$WORK_DIR/amd-anp/build:$WORK_DIR/rccl/build/release/build/lib:/opt/rocm/lib:/opt/rocm/lib64:$WORK_DIR/ompi/install/lib:$LD_LIBRARY_PATH
Expand All @@ -128,7 +126,7 @@ ENV IMAGE_NAME=$IMAGE_NAME
ENV IMAGE_VER=$IMAGE_VER
ENV DRIVER_LABEL=$DRIVER_LABEL
ENV DRIVERS_VERSION=$DRIVERS_VERSION
ENV DOCKERFILE_VER="roce-workload-10dec"
ENV DOCKERFILE_VER="roce-workload-01feb"

LABEL image.version.name=$IMAGE_NAME
LABEL image.version.bldver=$IMAGE_VER
Expand All @@ -142,7 +140,7 @@ LABEL image.driver.version=$DRIVERS_VERSION
LABEL base_image=$ROCM_BASE_IMAGE \
name="roce-workload" \
maintainer="sundaramurthy.gurunathan@amd.com" \
version="v1.0.0" \
version="v1.0.2" \
description="Create RoCE workload docker image with ROCm, RCCL, RCCL-Tests, AMD Network Plugin, AMD AINIC drivers and libraries"

ENTRYPOINT ["sh", "-c", "service ssh restart && sleep infinity"]
ENTRYPOINT ["/root/entrypoint.sh"]
6 changes: 3 additions & 3 deletions docker/roce-workload/docker-build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ AMD_GPUS="gfx90a;gfx942;gfx950"
REPO_URL="https://repo.radeon.com"
DRIVER_LABEL="noble"
DRIVERS_VERSION="$2"
RCCL_DROP_TAG="drop/2025-06-J13A-1"
ANP_DROP_TAG="tags/v1.1.0-4D"
RCCL_DROP_TAG="rocm-7.0.2"
ANP_DROP_TAG="tags/v1.2.0"

rocm_base_image=${3:-docker.io/rocm/dev-ubuntu-24.04:7.0}
rocm_base_image=${3:-docker.io/rocm/dev-ubuntu-24.04:7.0.2}

docker build \
--build-arg ROCM_BASE_IMAGE="${rocm_base_image}" \
Expand Down
16 changes: 16 additions & 0 deletions docker/roce-workload/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
# Container entrypoint for the roce-workload image.
#
#   1. If /root/.ssh/id_rsa is absent, generate an RSA key pair for root and
#      append its public half to /root/.ssh/authorized_keys.
#   2. Start the SSH daemon.
#   3. Replace this shell with the command given to the container, or with
#      `sleep infinity` when no command was given.
#
# `set -e` makes any failing step abort the container start instead of
# continuing with a half-configured SSH setup.
set -e

SSH_DIR=/root/.ssh

if [ ! -f "$SSH_DIR/id_rsa" ]; then
    # The directory is created at image build time, but it can be missing at
    # runtime (e.g. /root replaced by a volume); ssh-keygen would then fail
    # and `set -e` would kill the container. Only touched inside this branch
    # so a pre-provisioned (possibly read-only) key directory is left alone.
    mkdir -p "$SSH_DIR"
    chmod 700 "$SSH_DIR"

    ssh-keygen -t rsa -b 4096 -N "" -f "$SSH_DIR/id_rsa"
    cat "$SSH_DIR/id_rsa.pub" >> "$SSH_DIR/authorized_keys"

    # Keep authorized_keys private to root regardless of the active umask.
    chmod 600 "$SSH_DIR/authorized_keys"
fi

### --v1.0.2-- Start SSH daemon ----
# sshd refuses to start without its privilege-separation directory; recreate
# it in case /var/run is a fresh tmpfs at runtime (no-op when it exists).
mkdir -p /var/run/sshd
/usr/sbin/sshd

# No arguments: keep the container alive so it can be reached over SSH.
if [ "$#" -eq 0 ]; then
    exec sleep infinity
fi

# Hand control to the requested command (exec: no wrapper shell left behind).
exec "$@"
36 changes: 20 additions & 16 deletions docker/roce-workload/run_rccl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,28 @@ export OMPI_DIR=/root/ompi/install
export PERF_TEST_DIR=/root/rccl-tests/build

RCCL_ENV=""
RCCL_ENV="$RCCL_ENV -x NCCL_DEBUG=VERSION"
RCCL_ENV="$RCCL_ENV -x HSA_NO_SCRATCH_RECLAIM=1"
RCCL_ENV="$RCCL_ENV -x IONIC_LOCKFREE=all"
RCCL_ENV="$RCCL_ENV -x NCCL_GDRCOPY_ENABLE=0"
RCCL_ENV="$RCCL_ENV -x NCCL_NET_OPTIONAL_RECV_COMPLETION=0"
RCCL_ENV="$RCCL_ENV -x NCCL_GDR_FLUSH_DISABLE=1"
RCCL_ENV="$RCCL_ENV -x NCCL_GRAPH_DUMP_FILE=/tmp/graph_all.txt"
RCCL_ENV="$RCCL_ENV -x NCCL_IB_GID_INDEX=1"
RCCL_ENV="$RCCL_ENV -x NCCL_PXN_DISABLE=0"
RCCL_ENV="$RCCL_ENV -x NCCL_IB_QPS_PER_CONNECTION=1"
RCCL_ENV="$RCCL_ENV -x NCCL_IB_TC=96"
RCCL_ENV="$RCCL_ENV -x NCCL_IB_FIFO_TC=184"
RCCL_ENV="$RCCL_ENV -x NCCL_IGNORE_CPU_AFFINITY=1"
RCCL_ENV="$RCCL_ENV -x NCCL_IB_USE_INLINE=1"
RCCL_ENV="$RCCL_ENV -x NCCL_NET_OPTIONAL_RECV_COMPLETION=1"
RCCL_ENV="$RCCL_ENV -x NCCL_SOCKET_IFNAME=${MCA_IF}"
RCCL_ENV="$RCCL_ENV -x NCCL_TOPO_DUMP_FILE=/tmp/topo_all.txt"
RCCL_ENV="$RCCL_ENV -x RCCL_GDR_FLUSH_GPU_MEM_NO_RELAXED_ORDERING=0"
RCCL_ENV="$RCCL_ENV -x UCX_UNIFIED_MODE=y"
RCCL_ENV="$RCCL_ENV -x NCCL_IB_USE_INLINE=1"
RCCL_ENV="$RCCL_ENV -x IONIC_LOCKFREE=all"
RCCL_ENV="$RCCL_ENV -x NCCL_NET_PLUGIN=librccl-anp.so"
RCCL_ENV="$RCCL_ENV -x NCCL_DEBUG=INFO"

## RCCL_ENV="$RCCL_ENV -x NCCL_IB_TC=96"
## RCCL_ENV="$RCCL_ENV -x NCCL_IB_FIFO_TC=184"
## RCCL_ENV="$RCCL_ENV -x NCCL_IB_QPS_PER_CONNECTION=1"
## RCCL_ENV="$RCCL_ENV -x NCCL_TOPO_DUMP_FILE=/tmp/topo_all.txt"
## RCCL_ENV="$RCCL_ENV -x NCCL_SOCKET_IFNAME=${MCA_IF}"
## RCCL_ENV="$RCCL_ENV -x UCX_UNIFIED_MODE=y"
## RCCL_ENV="$RCCL_ENV -x NCCL_GRAPH_DUMP_FILE=/tmp/graph_all.txt"
## RCCL_ENV="$RCCL_ENV -x HSA_NO_SCRATCH_RECLAIM=1"
## RCCL_ENV="$RCCL_ENV -x NCCL_GDRCOPY_ENABLE=0"
## RCCL_ENV="$RCCL_ENV -x NCCL_IB_GID_INDEX=1"
## RCCL_ENV="$RCCL_ENV -x NCCL_PXN_DISABLE=0"
## RCCL_ENV="$RCCL_ENV -x NCCL_IGNORE_CPU_AFFINITY=1"
## RCCL_ENV="$RCCL_ENV -x NCCL_MIN_NCHANNELS=1"
## RCCL_ENV="$RCCL_ENV -x NCCL_MAX_NCHANNELS=1"

# -- rccl run start ---
mkdir -p /tmp/run_logs
Expand Down