Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
204 changes: 188 additions & 16 deletions .github/workflows/enroot-tests.yml
Original file line number Diff line number Diff line change
@@ -1,15 +1,26 @@
name: Enroot Tests

on:
push:
branches:
- main
workflow_dispatch:
inputs:
test_name:
description: 'Select test to run'
required: true
type: choice
options:
- test_single_node_pytorch
- test_multi_node_distributed_pytorch
run_single_node_pytorch:
description: 'Run single-node PyTorch test'
required: false
type: boolean
default: true
run_multi_node_pytorch:
description: 'Run multi-node distributed PyTorch test'
required: false
type: boolean
default: true
run_multi_node_rccl:
description: 'Run multi-node RCCL test'
required: false
type: boolean
default: true
no_install:
description: 'Skip installation (--no-install)'
required: false
Expand All @@ -20,20 +31,27 @@ on:
required: false
type: boolean
default: false
docker_image:
description: 'Docker image to use (default: rocm/pytorch:latest for single-node, docker://rocm/pytorch:rocm7.0.2_ubuntu22.04_py3.10_pytorch_release_2.7.1 for multi-node)'
docker_image_single_node:
description: 'Docker image for single-node test (default: rocm/pytorch:latest)'
required: false
type: string
default: ''
testbed_file:
description: 'Path to testbed file (e.g. tests/enroot/testbeds/mi325.yaml)'
docker_image_multi_node:
description: 'Docker image for multi-node PyTorch test (default: docker://rocm/pytorch:rocm6.2.4_ubuntu22.04_py3.10_pytorch_release_2.3.0)'
required: false
type: string
default: 'testbed/enroot_tb.yml'
default: ''
docker_image_rccl:
description: 'Docker image for RCCL test (default: docker://rocm/roce-workload:ubuntu24_rocm-7.0.2_rccl-7.0.2_anp-v1.2.0_ainic-1.117.5-a-56)'
required: false
type: string
default: ''


jobs:
run-enroot-tests:
# Single-node PyTorch test
test-single-node-pytorch:
if: github.event_name == 'push' || inputs.run_single_node_pytorch == true
runs-on: enroot-runners
timeout-minutes: 120

Expand All @@ -51,16 +69,170 @@ jobs:
python3 -m pip install --upgrade pip
pip install -r tests/enroot/requirements.txt

- name: Run enroot tests
# Write the single-node testbed file from the SINGLE_NODE_TESTBED_FILE secret;
# the step fails when that secret is empty.
- name: Create testbed files from secrets
  working-directory: tests/enroot
  env:
    SINGLE_NODE_TESTBED: ${{ secrets.SINGLE_NODE_TESTBED_FILE }}
  run: |
    mkdir -p testbed
    # Guard clause: stop early when the secret expanded to an empty string.
    if [ -z "$SINGLE_NODE_TESTBED" ]; then
      echo "[ERROR] SINGLE_NODE_TESTBED_FILE secret is not set"
      exit 1
    fi
    printf '%s\n' "$SINGLE_NODE_TESTBED" > testbed/single_node_tb.yml
    echo "Created testbed/single_node_tb.yml from secret"

# Run the single-node PyTorch test through run_test.py.
# Workflow inputs and the event name reach the script through `env:` instead
# of being spliced into the script text with ${{ }}. Expressions are expanded
# before the shell parses the script, so a crafted input value (for example a
# docker image string containing `"; some-command; "`) would otherwise be
# executed as shell code on the runner.
- name: Run test_single_node_pytorch
  working-directory: tests/enroot
  env:
    EVENT_NAME: ${{ github.event_name }}
    IMAGE_INPUT: ${{ inputs.docker_image_single_node }}
    NO_INSTALL_INPUT: ${{ inputs.no_install }}
    NO_UNINSTALL_INPUT: ${{ inputs.no_uninstall }}
  run: |
    # Empty when the input was left blank or the run was not dispatched manually.
    DOCKER_IMAGE="$IMAGE_INPUT"
    # Push-triggered runs carry no dispatch inputs, so both skip flags are
    # forced to "false" for them.
    if [ "$EVENT_NAME" = "push" ]; then
      NO_INSTALL="false"
      NO_UNINSTALL="false"
    else
      NO_INSTALL="$NO_INSTALL_INPUT"
      NO_UNINSTALL="$NO_UNINSTALL_INPUT"
    fi

    echo "Running test: test_single_node_pytorch"
    echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}"

    # Positional arguments: test name, image, no-install, no-uninstall, testbed file.
    python3 run_test.py "test_single_node_pytorch" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/single_node_tb.yml"

# Publish whatever is under tests/enroot/results/ as a build artifact.
- name: Upload test results
  uses: actions/upload-artifact@v4
  # always(): still upload when an earlier step in the job has failed.
  if: always()
  with:
    # The run number keeps artifact names distinct between workflow runs.
    name: test-results-single-node-pytorch-${{ github.run_number }}
    path: tests/enroot/results/
    retention-days: 30
    # A missing results directory only produces a warning, not a failure.
    if-no-files-found: warn

# Multi-node distributed PyTorch test.
# Runs after test-single-node-pytorch, and only when that job succeeded or was
# skipped. always() is required so this condition is still evaluated when the
# upstream job was skipped.
test-multi-node-pytorch:
  needs: test-single-node-pytorch
  if: |
    always() &&
    (github.event_name == 'push' || inputs.run_multi_node_pytorch == true) &&
    (needs.test-single-node-pytorch.result == 'success' || needs.test-single-node-pytorch.result == 'skipped')
  runs-on: enroot-runners
  timeout-minutes: 120

  steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        # Quoted so YAML keeps the version as a string rather than a float.
        python-version: '3.8'

    - name: Install dependencies
      run: |
        python3 -m pip install --upgrade pip
        pip install -r tests/enroot/requirements.txt

    # Write the multi-node testbed file from the MULTI_NODE_TESTBED_FILE
    # secret; the step fails when that secret is empty.
    - name: Create testbed files from secrets
      working-directory: tests/enroot
      env:
        MULTI_NODE_TESTBED: ${{ secrets.MULTI_NODE_TESTBED_FILE }}
      run: |
        mkdir -p testbed
        if [ -n "$MULTI_NODE_TESTBED" ]; then
          printf '%s\n' "$MULTI_NODE_TESTBED" > testbed/multi_node_tb.yml
          echo "Created testbed/multi_node_tb.yml from secret"
        else
          echo "[ERROR] MULTI_NODE_TESTBED_FILE secret is not set"
          exit 1
        fi

    # Inputs and the event name are passed through `env:` instead of being
    # interpolated into the script with ${{ }}: expressions are expanded
    # before the shell parses the script, so a crafted input value could
    # otherwise run as shell code on the runner.
    - name: Run test_multi_node_distributed_pytorch
      working-directory: tests/enroot
      env:
        EVENT_NAME: ${{ github.event_name }}
        IMAGE_INPUT: ${{ inputs.docker_image_multi_node }}
        NO_INSTALL_INPUT: ${{ inputs.no_install }}
        NO_UNINSTALL_INPUT: ${{ inputs.no_uninstall }}
      run: |
        # Empty when the input was left blank or the run was not dispatched manually.
        DOCKER_IMAGE="$IMAGE_INPUT"
        # Push-triggered runs carry no dispatch inputs, so both skip flags
        # are forced to "false" for them.
        if [ "$EVENT_NAME" = "push" ]; then
          NO_INSTALL="false"
          NO_UNINSTALL="false"
        else
          NO_INSTALL="$NO_INSTALL_INPUT"
          NO_UNINSTALL="$NO_UNINSTALL_INPUT"
        fi

        echo "Running test: test_multi_node_distributed_pytorch"
        echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}"

        # Positional arguments: test name, image, no-install, no-uninstall, testbed file.
        python3 run_test.py "test_multi_node_distributed_pytorch" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/multi_node_tb.yml"

    - name: Upload test results
      # always(): still upload when an earlier step in the job has failed.
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: test-results-multi-node-pytorch-${{ github.run_number }}
        path: tests/enroot/results/
        if-no-files-found: warn
        retention-days: 30

# Multi-node RCCL test.
# Runs after test-multi-node-pytorch, and only when that job succeeded or was
# skipped. always() is required so this condition is still evaluated when the
# upstream job was skipped.
test-multi-node-rccl:
  needs: test-multi-node-pytorch
  if: |
    always() &&
    (github.event_name == 'push' || inputs.run_multi_node_rccl == true) &&
    (needs.test-multi-node-pytorch.result == 'success' || needs.test-multi-node-pytorch.result == 'skipped')
  runs-on: enroot-runners
  timeout-minutes: 120

  steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        # Quoted so YAML keeps the version as a string rather than a float.
        python-version: '3.8'

    - name: Install dependencies
      run: |
        python3 -m pip install --upgrade pip
        pip install -r tests/enroot/requirements.txt

    # Write the multi-node testbed file from the MULTI_NODE_TESTBED_FILE
    # secret; the step fails when that secret is empty.
    - name: Create testbed files from secrets
      working-directory: tests/enroot
      env:
        MULTI_NODE_TESTBED: ${{ secrets.MULTI_NODE_TESTBED_FILE }}
      run: |
        mkdir -p testbed
        if [ -n "$MULTI_NODE_TESTBED" ]; then
          printf '%s\n' "$MULTI_NODE_TESTBED" > testbed/multi_node_tb.yml
          echo "Created testbed/multi_node_tb.yml from secret"
        else
          echo "[ERROR] MULTI_NODE_TESTBED_FILE secret is not set"
          exit 1
        fi

    # Exactly one run_test.py invocation, with a fixed test name and testbed
    # path. Inputs and the event name are passed through `env:` instead of
    # being interpolated into the script with ${{ }}: expressions are expanded
    # before the shell parses the script, so a crafted input value could
    # otherwise run as shell code on the runner.
    - name: Run test_multi_node_rccl
      working-directory: tests/enroot
      env:
        EVENT_NAME: ${{ github.event_name }}
        IMAGE_INPUT: ${{ inputs.docker_image_rccl }}
        NO_INSTALL_INPUT: ${{ inputs.no_install }}
        NO_UNINSTALL_INPUT: ${{ inputs.no_uninstall }}
      run: |
        # Empty when the input was left blank or the run was not dispatched manually.
        DOCKER_IMAGE="$IMAGE_INPUT"
        # Push-triggered runs carry no dispatch inputs, so both skip flags
        # are forced to "false" for them.
        if [ "$EVENT_NAME" = "push" ]; then
          NO_INSTALL="false"
          NO_UNINSTALL="false"
        else
          NO_INSTALL="$NO_INSTALL_INPUT"
          NO_UNINSTALL="$NO_UNINSTALL_INPUT"
        fi

        echo "Running test: test_multi_node_rccl"
        echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}"

        # Positional arguments: test name, image, no-install, no-uninstall, testbed file.
        python3 run_test.py "test_multi_node_rccl" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/multi_node_tb.yml"

    - name: Upload test results
      # always(): still upload when an earlier step in the job has failed.
      if: always()
      uses: actions/upload-artifact@v4
      with:
        # Single, fixed artifact name; a mapping must not repeat the `name` key.
        name: test-results-multi-node-rccl-${{ github.run_number }}
        path: tests/enroot/results/
        if-no-files-found: warn
        retention-days: 30
7 changes: 7 additions & 0 deletions tests/enroot/run_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,13 @@ def update_docker_image(test_name, docker_image):
pattern = r'export DOCKER_IMAGE=.*'
replacement = f'export DOCKER_IMAGE={docker_image}'
print(f"Updating distributed_pytorch_sbatch.sh with image: {docker_image}")
elif test_name == "test_multi_node_rccl":
script_path = Path("batch_scripts/rccl_tests_sbatch.sh")
# Extract version tag from docker image (e.g., docker://rocm/roce-workload:version -> version)
version = docker_image.split(':')[-1] if ':' in docker_image else docker_image
pattern = r'DOCKER_IMAGE_VERSION=\${DOCKER_IMAGE_VERSION:-"[^"]*"}'
replacement = f'DOCKER_IMAGE_VERSION="${{DOCKER_IMAGE_VERSION:-"{version}"}}"'
print(f"Updating rccl_tests_sbatch.sh with image version: {version}")
else:
print(f"Unknown test name: {test_name}")
return
Expand Down
Loading