From 1a2ed98a4dfe77ca7a5a47d8bbf1285b960e6050 Mon Sep 17 00:00:00 2001 From: kithumma Date: Sat, 24 Jan 2026 06:14:17 +0000 Subject: [PATCH 1/2] update workflow --- .github/workflows/enroot-tests.yml | 142 +++++++++++++++++++++++++---- tests/enroot/run_test.py | 7 ++ 2 files changed, 132 insertions(+), 17 deletions(-) diff --git a/.github/workflows/enroot-tests.yml b/.github/workflows/enroot-tests.yml index 4ee2c96..33c4489 100644 --- a/.github/workflows/enroot-tests.yml +++ b/.github/workflows/enroot-tests.yml @@ -1,15 +1,26 @@ name: Enroot Tests on: + push: + branches: + - main workflow_dispatch: inputs: - test_name: - description: 'Select test to run' - required: true - type: choice - options: - - test_single_node_pytorch - - test_multi_node_distributed_pytorch + run_single_node_pytorch: + description: 'Run single-node PyTorch test' + required: false + type: boolean + default: true + run_multi_node_pytorch: + description: 'Run multi-node distributed PyTorch test' + required: false + type: boolean + default: true + run_multi_node_rccl: + description: 'Run multi-node RCCL test' + required: false + type: boolean + default: true no_install: description: 'Skip installation (--no-install)' required: false @@ -20,47 +31,144 @@ on: required: false type: boolean default: false - docker_image: - description: 'Docker image to use (default: rocm/pytorch:latest for single-node, docker://rocm/pytorch:rocm7.0.2_ubuntu22.04_py3.10_pytorch_release_2.7.1 for multi-node)' + docker_image_single_node: + description: 'Docker image for single-node test (default: rocm/pytorch:latest)' required: false type: string default: '' - testbed_file: - description: 'Path to testbed file (e.g. tests/enroot/testbeds/mi325.yaml)' + docker_image_multi_node: + description: 'Docker image for multi-node PyTorch test (default: docker://rocm/pytorch:rocm6.2.4_ubuntu22.04_py3.10_pytorch_release_2.3.0)' required: false type: string - default: 'testbed/enroot_tb.yml' + default: '' + docker_image_rccl: + description: 'Docker image for RCCL test (default: docker://rocm/roce-workload:ubuntu24_rocm-7.0.2_rccl-7.0.2_anp-v1.2.0_ainic-1.117.5-a-56)' + required: false + type: string + default: '' jobs: run-enroot-tests: runs-on: enroot-runners - timeout-minutes: 120 + timeout-minutes: 360 + strategy: + matrix: + test_name: + - test_single_node_pytorch + - test_multi_node_distributed_pytorch + - test_multi_node_rccl + max-parallel: 1 + fail-fast: false steps: + - name: Check if test should run + id: check + run: | + if [ "${{ github.event_name }}" = "push" ]; then + echo "should_run=true" >> $GITHUB_OUTPUT + elif [ "${{ matrix.test_name }}" = "test_single_node_pytorch" ] && [ "${{ inputs.run_single_node_pytorch }}" = "true" ]; then + echo "should_run=true" >> $GITHUB_OUTPUT + elif [ "${{ matrix.test_name }}" = "test_multi_node_distributed_pytorch" ] && [ "${{ inputs.run_multi_node_pytorch }}" = "true" ]; then + echo "should_run=true" >> $GITHUB_OUTPUT + elif [ "${{ matrix.test_name }}" = "test_multi_node_rccl" ] && [ "${{ inputs.run_multi_node_rccl }}" = "true" ]; then + echo "should_run=true" >> $GITHUB_OUTPUT + else + echo "should_run=false" >> $GITHUB_OUTPUT + fi + - name: Checkout repository + if: steps.check.outputs.should_run == 'true' uses: actions/checkout@v4 - name: Set up Python + if: steps.check.outputs.should_run == 'true' uses: actions/setup-python@v5 with: python-version: '3.8' - name: Install dependencies + if: steps.check.outputs.should_run == 'true' run: | python3 -m pip install --upgrade pip pip install -r tests/enroot/requirements.txt - - name: Run enroot tests + - name: Create testbed files from secrets + if: steps.check.outputs.should_run == 'true' + working-directory: tests/enroot + env: + SINGLE_NODE_TESTBED: ${{ secrets.SINGLE_NODE_TESTBED_FILE }} + MULTI_NODE_TESTBED: ${{ secrets.MULTI_NODE_TESTBED_FILE }} + run: | + mkdir -p testbed + + # Write single-node testbed + if [ -n "$SINGLE_NODE_TESTBED" ]; then + printf '%s\n' "$SINGLE_NODE_TESTBED" > testbed/single_node_tb.yml + echo "Created testbed/single_node_tb.yml from secret" + else + echo "[WARNING] SINGLE_NODE_TESTBED_FILE secret is not set" + fi + + # Write multi-node testbed + if [ -n "$MULTI_NODE_TESTBED" ]; then + printf '%s\n' "$MULTI_NODE_TESTBED" > testbed/multi_node_tb.yml + echo "Created testbed/multi_node_tb.yml from secret" + else + echo "[WARNING] MULTI_NODE_TESTBED_FILE secret is not set" + fi + + echo "Testbed files:" + ls -la testbed/ + + - name: Run ${{ matrix.test_name }} + if: steps.check.outputs.should_run == 'true' working-directory: tests/enroot run: | - python3 run_test.py "${{ inputs.test_name }}" "${{ inputs.docker_image }}" "${{ inputs.no_install }}" "${{ inputs.no_uninstall }}" "${{ inputs.testbed_file }}" + # Determine testbed file and docker image based on test type + if [ "${{ matrix.test_name }}" = "test_single_node_pytorch" ]; then + TESTBED_FILE="testbed/single_node_tb.yml" + DOCKER_IMAGE="${{ inputs.docker_image_single_node }}" + else + # Multi-node tests use multi_node testbed + TESTBED_FILE="testbed/multi_node_tb.yml" + if [ "${{ matrix.test_name }}" = "test_multi_node_distributed_pytorch" ]; then + DOCKER_IMAGE="${{ inputs.docker_image_multi_node }}" + else + DOCKER_IMAGE="${{ inputs.docker_image_rccl }}" + fi + fi + + # Set flags based on event type + if [ "${{ github.event_name }}" = "push" ]; then + NO_INSTALL="false" + NO_UNINSTALL="false" + DOCKER_IMAGE="" + else + NO_INSTALL="${{ inputs.no_install }}" + NO_UNINSTALL="${{ inputs.no_uninstall }}" + fi + + # Validate testbed file exists + if [ ! -f "$TESTBED_FILE" ]; then + echo "[ERROR] Testbed file not found: $TESTBED_FILE" + echo "Please ensure the appropriate secret is set:" + echo " - SINGLE_NODE_TESTBED_FILE for single-node tests" + echo " - MULTI_NODE_TESTBED_FILE for multi-node tests" + exit 1 + fi + + echo "Running test: ${{ matrix.test_name }}" + echo "Testbed file: $TESTBED_FILE" + echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}" + + python3 run_test.py "${{ matrix.test_name }}" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "$TESTBED_FILE" - name: Upload test results - if: always() + if: always() && steps.check.outputs.should_run == 'true' uses: actions/upload-artifact@v4 with: - name: test-results-${{ inputs.test_name }}-${{ github.run_number }} + name: test-results-${{ matrix.test_name }}-${{ github.run_number }} path: tests/enroot/results/ if-no-files-found: warn retention-days: 30 diff --git a/tests/enroot/run_test.py b/tests/enroot/run_test.py index 864dc04..52b1bc5 100644 --- a/tests/enroot/run_test.py +++ b/tests/enroot/run_test.py @@ -18,6 +18,13 @@ def update_docker_image(test_name, docker_image): pattern = r'export DOCKER_IMAGE=.*' replacement = f'export DOCKER_IMAGE={docker_image}' print(f"Updating distributed_pytorch_sbatch.sh with image: {docker_image}") + elif test_name == "test_multi_node_rccl": + script_path = Path("batch_scripts/rccl_tests_sbatch.sh") + # Extract version tag from docker image (e.g., docker://rocm/roce-workload:version -> version) + version = docker_image.split(':')[-1] if ':' in docker_image else docker_image + pattern = r'DOCKER_IMAGE_VERSION=\${DOCKER_IMAGE_VERSION:-"[^"]*"}' + replacement = f'DOCKER_IMAGE_VERSION="${{DOCKER_IMAGE_VERSION:-"{version}"}}"' + print(f"Updating rccl_tests_sbatch.sh with image version: {version}") else: print(f"Unknown test name: {test_name}") return From 89c8ec84ca1037deb5e21a1ac0c84143b304162e Mon Sep 17 00:00:00 2001 From: kithumma Date: Sun, 25 Jan 2026 03:17:11 +0000 Subject: [PATCH 2/2] update workflow --- .github/workflows/enroot-tests.yml | 198 +++++++++++++++++++---------- 1 file changed, 131 insertions(+), 67 deletions(-) diff --git a/.github/workflows/enroot-tests.yml b/.github/workflows/enroot-tests.yml index 33c4489..7534f1c 100644 --- a/.github/workflows/enroot-tests.yml +++ b/.github/workflows/enroot-tests.yml @@ -49,126 +49,190 @@ on: jobs: - run-enroot-tests: + # Single-node PyTorch test + test-single-node-pytorch: + if: github.event_name == 'push' || inputs.run_single_node_pytorch == true runs-on: enroot-runners - timeout-minutes: 360 - strategy: - matrix: - test_name: - - test_single_node_pytorch - - test_multi_node_distributed_pytorch - - test_multi_node_rccl - max-parallel: 1 - fail-fast: false + timeout-minutes: 120 steps: - - name: Check if test should run - id: check - run: | - if [ "${{ github.event_name }}" = "push" ]; then - echo "should_run=true" >> $GITHUB_OUTPUT - elif [ "${{ matrix.test_name }}" = "test_single_node_pytorch" ] && [ "${{ inputs.run_single_node_pytorch }}" = "true" ]; then - echo "should_run=true" >> $GITHUB_OUTPUT - elif [ "${{ matrix.test_name }}" = "test_multi_node_distributed_pytorch" ] && [ "${{ inputs.run_multi_node_pytorch }}" = "true" ]; then - echo "should_run=true" >> $GITHUB_OUTPUT - elif [ "${{ matrix.test_name }}" = "test_multi_node_rccl" ] && [ "${{ inputs.run_multi_node_rccl }}" = "true" ]; then - echo "should_run=true" >> $GITHUB_OUTPUT - else - echo "should_run=false" >> $GITHUB_OUTPUT - fi - - name: Checkout repository - if: steps.check.outputs.should_run == 'true' uses: actions/checkout@v4 - name: Set up Python - if: steps.check.outputs.should_run == 'true' uses: actions/setup-python@v5 with: python-version: '3.8' - name: Install dependencies - if: steps.check.outputs.should_run == 'true' run: | python3 -m pip install --upgrade pip pip install -r tests/enroot/requirements.txt - name: Create testbed files from secrets - if: steps.check.outputs.should_run == 'true' working-directory: tests/enroot env: SINGLE_NODE_TESTBED: ${{ secrets.SINGLE_NODE_TESTBED_FILE }} - MULTI_NODE_TESTBED: ${{ secrets.MULTI_NODE_TESTBED_FILE }} run: | mkdir -p testbed - - # Write single-node testbed if [ -n "$SINGLE_NODE_TESTBED" ]; then printf '%s\n' "$SINGLE_NODE_TESTBED" > testbed/single_node_tb.yml echo "Created testbed/single_node_tb.yml from secret" else - echo "[WARNING] SINGLE_NODE_TESTBED_FILE secret is not set" + echo "[ERROR] SINGLE_NODE_TESTBED_FILE secret is not set" + exit 1 + fi + + - name: Run test_single_node_pytorch + working-directory: tests/enroot + run: | + DOCKER_IMAGE="${{ inputs.docker_image_single_node }}" + if [ "${{ github.event_name }}" = "push" ]; then + NO_INSTALL="false" + NO_UNINSTALL="false" + else + NO_INSTALL="${{ inputs.no_install }}" + NO_UNINSTALL="${{ inputs.no_uninstall }}" fi - # Write multi-node testbed + echo "Running test: test_single_node_pytorch" + echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}" + + python3 run_test.py "test_single_node_pytorch" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/single_node_tb.yml" + + - name: Upload test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: test-results-single-node-pytorch-${{ github.run_number }} + path: tests/enroot/results/ + if-no-files-found: warn + retention-days: 30 + + # Multi-node distributed PyTorch test + test-multi-node-pytorch: + needs: test-single-node-pytorch + if: | + always() && + (github.event_name == 'push' || inputs.run_multi_node_pytorch == true) && + (needs.test-single-node-pytorch.result == 'success' || needs.test-single-node-pytorch.result == 'skipped') + runs-on: enroot-runners + timeout-minutes: 120 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.8' + + - name: Install dependencies + run: | + python3 -m pip install --upgrade pip + pip install -r tests/enroot/requirements.txt + + - name: Create testbed files from secrets + working-directory: tests/enroot + env: + MULTI_NODE_TESTBED: ${{ secrets.MULTI_NODE_TESTBED_FILE }} + run: | + mkdir -p testbed if [ -n "$MULTI_NODE_TESTBED" ]; then printf '%s\n' "$MULTI_NODE_TESTBED" > testbed/multi_node_tb.yml echo "Created testbed/multi_node_tb.yml from secret" else - echo "[WARNING] MULTI_NODE_TESTBED_FILE secret is not set" + echo "[ERROR] MULTI_NODE_TESTBED_FILE secret is not set" + exit 1 fi - - echo "Testbed files:" - ls -la testbed/ - - name: Run ${{ matrix.test_name }} - if: steps.check.outputs.should_run == 'true' + - name: Run test_multi_node_distributed_pytorch working-directory: tests/enroot run: | - # Determine testbed file and docker image based on test type - if [ "${{ matrix.test_name }}" = "test_single_node_pytorch" ]; then - TESTBED_FILE="testbed/single_node_tb.yml" - DOCKER_IMAGE="${{ inputs.docker_image_single_node }}" - else - # Multi-node tests use multi_node testbed - TESTBED_FILE="testbed/multi_node_tb.yml" - if [ "${{ matrix.test_name }}" = "test_multi_node_distributed_pytorch" ]; then - DOCKER_IMAGE="${{ inputs.docker_image_multi_node }}" - else - DOCKER_IMAGE="${{ inputs.docker_image_rccl }}" - fi - fi - - # Set flags based on event type + DOCKER_IMAGE="${{ inputs.docker_image_multi_node }}" if [ "${{ github.event_name }}" = "push" ]; then NO_INSTALL="false" NO_UNINSTALL="false" - DOCKER_IMAGE="" else NO_INSTALL="${{ inputs.no_install }}" NO_UNINSTALL="${{ inputs.no_uninstall }}" fi - # Validate testbed file exists - if [ ! -f "$TESTBED_FILE" ]; then - echo "[ERROR] Testbed file not found: $TESTBED_FILE" - echo "Please ensure the appropriate secret is set:" - echo " - SINGLE_NODE_TESTBED_FILE for single-node tests" - echo " - MULTI_NODE_TESTBED_FILE for multi-node tests" + echo "Running test: test_multi_node_distributed_pytorch" + echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}" + + python3 run_test.py "test_multi_node_distributed_pytorch" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/multi_node_tb.yml" + + - name: Upload test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: test-results-multi-node-pytorch-${{ github.run_number }} + path: tests/enroot/results/ + if-no-files-found: warn + retention-days: 30 + + # Multi-node RCCL test + test-multi-node-rccl: + needs: test-multi-node-pytorch + if: | + always() && + (github.event_name == 'push' || inputs.run_multi_node_rccl == true) && + (needs.test-multi-node-pytorch.result == 'success' || needs.test-multi-node-pytorch.result == 'skipped') + runs-on: enroot-runners + timeout-minutes: 120 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.8' + + - name: Install dependencies + run: | + python3 -m pip install --upgrade pip + pip install -r tests/enroot/requirements.txt + + - name: Create testbed files from secrets + working-directory: tests/enroot + env: + MULTI_NODE_TESTBED: ${{ secrets.MULTI_NODE_TESTBED_FILE }} + run: | + mkdir -p testbed + if [ -n "$MULTI_NODE_TESTBED" ]; then + printf '%s\n' "$MULTI_NODE_TESTBED" > testbed/multi_node_tb.yml + echo "Created testbed/multi_node_tb.yml from secret" + else + echo "[ERROR] MULTI_NODE_TESTBED_FILE secret is not set" exit 1 fi + + - name: Run test_multi_node_rccl + working-directory: tests/enroot + run: | + DOCKER_IMAGE="${{ inputs.docker_image_rccl }}" + if [ "${{ github.event_name }}" = "push" ]; then + NO_INSTALL="false" + NO_UNINSTALL="false" + else + NO_INSTALL="${{ inputs.no_install }}" + NO_UNINSTALL="${{ inputs.no_uninstall }}" + fi - echo "Running test: ${{ matrix.test_name }}" - echo "Testbed file: $TESTBED_FILE" + echo "Running test: test_multi_node_rccl" echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}" - python3 run_test.py "${{ matrix.test_name }}" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "$TESTBED_FILE" + python3 run_test.py "test_multi_node_rccl" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/multi_node_tb.yml" - name: Upload test results - if: always() && steps.check.outputs.should_run == 'true' + if: always() uses: actions/upload-artifact@v4 with: - name: test-results-${{ matrix.test_name }}-${{ github.run_number }} + name: test-results-multi-node-rccl-${{ github.run_number }} path: tests/enroot/results/ if-no-files-found: warn retention-days: 30