diff --git a/.github/workflows/enroot-tests.yml b/.github/workflows/enroot-tests.yml index 4ee2c96..7534f1c 100644 --- a/.github/workflows/enroot-tests.yml +++ b/.github/workflows/enroot-tests.yml @@ -1,15 +1,26 @@ name: Enroot Tests on: + push: + branches: + - main workflow_dispatch: inputs: - test_name: - description: 'Select test to run' - required: true - type: choice - options: - - test_single_node_pytorch - - test_multi_node_distributed_pytorch + run_single_node_pytorch: + description: 'Run single-node PyTorch test' + required: false + type: boolean + default: true + run_multi_node_pytorch: + description: 'Run multi-node distributed PyTorch test' + required: false + type: boolean + default: true + run_multi_node_rccl: + description: 'Run multi-node RCCL test' + required: false + type: boolean + default: true no_install: description: 'Skip installation (--no-install)' required: false @@ -20,20 +31,27 @@ on: required: false type: boolean default: false - docker_image: - description: 'Docker image to use (default: rocm/pytorch:latest for single-node, docker://rocm/pytorch:rocm7.0.2_ubuntu22.04_py3.10_pytorch_release_2.7.1 for multi-node)' + docker_image_single_node: + description: 'Docker image for single-node test (default: rocm/pytorch:latest)' required: false type: string default: '' - testbed_file: - description: 'Path to testbed file (e.g. tests/enroot/testbeds/mi325.yaml)' + docker_image_multi_node: + description: 'Docker image for multi-node PyTorch test (default: docker://rocm/pytorch:rocm6.2.4_ubuntu22.04_py3.10_pytorch_release_2.3.0)' required: false type: string - default: 'testbed/enroot_tb.yml' + default: '' + docker_image_rccl: + description: 'Docker image for RCCL test (default: docker://rocm/roce-workload:ubuntu24_rocm-7.0.2_rccl-7.0.2_anp-v1.2.0_ainic-1.117.5-a-56)' + required: false + type: string + default: '' jobs: - run-enroot-tests: + # Single-node PyTorch test + test-single-node-pytorch: + if: github.event_name == 'push' || inputs.run_single_node_pytorch == true runs-on: enroot-runners timeout-minutes: 120 @@ -51,16 +69,170 @@ jobs: python3 -m pip install --upgrade pip pip install -r tests/enroot/requirements.txt - - name: Run enroot tests + - name: Create testbed files from secrets + working-directory: tests/enroot + env: + SINGLE_NODE_TESTBED: ${{ secrets.SINGLE_NODE_TESTBED_FILE }} + run: | + mkdir -p testbed + if [ -n "$SINGLE_NODE_TESTBED" ]; then + printf '%s\n' "$SINGLE_NODE_TESTBED" > testbed/single_node_tb.yml + echo "Created testbed/single_node_tb.yml from secret" + else + echo "[ERROR] SINGLE_NODE_TESTBED_FILE secret is not set" + exit 1 + fi + + - name: Run test_single_node_pytorch + working-directory: tests/enroot + run: | + DOCKER_IMAGE="${{ inputs.docker_image_single_node }}" + if [ "${{ github.event_name }}" = "push" ]; then + NO_INSTALL="false" + NO_UNINSTALL="false" + else + NO_INSTALL="${{ inputs.no_install }}" + NO_UNINSTALL="${{ inputs.no_uninstall }}" + fi + + echo "Running test: test_single_node_pytorch" + echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}" + + python3 run_test.py "test_single_node_pytorch" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/single_node_tb.yml" + + - name: Upload test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: test-results-single-node-pytorch-${{ github.run_number }} + path: tests/enroot/results/ + if-no-files-found: warn + retention-days: 30 + + # Multi-node distributed PyTorch test + test-multi-node-pytorch: + needs: test-single-node-pytorch + if: | + always() && + (github.event_name == 'push' || inputs.run_multi_node_pytorch == true) && + (needs.test-single-node-pytorch.result == 'success' || needs.test-single-node-pytorch.result == 'skipped') + runs-on: enroot-runners + timeout-minutes: 120 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.8' + + - name: Install dependencies + run: | + python3 -m pip install --upgrade pip + pip install -r tests/enroot/requirements.txt + + - name: Create testbed files from secrets + working-directory: tests/enroot + env: + MULTI_NODE_TESTBED: ${{ secrets.MULTI_NODE_TESTBED_FILE }} + run: | + mkdir -p testbed + if [ -n "$MULTI_NODE_TESTBED" ]; then + printf '%s\n' "$MULTI_NODE_TESTBED" > testbed/multi_node_tb.yml + echo "Created testbed/multi_node_tb.yml from secret" + else + echo "[ERROR] MULTI_NODE_TESTBED_FILE secret is not set" + exit 1 + fi + + - name: Run test_multi_node_distributed_pytorch + working-directory: tests/enroot + run: | + DOCKER_IMAGE="${{ inputs.docker_image_multi_node }}" + if [ "${{ github.event_name }}" = "push" ]; then + NO_INSTALL="false" + NO_UNINSTALL="false" + else + NO_INSTALL="${{ inputs.no_install }}" + NO_UNINSTALL="${{ inputs.no_uninstall }}" + fi + + echo "Running test: test_multi_node_distributed_pytorch" + echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}" + + python3 run_test.py "test_multi_node_distributed_pytorch" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/multi_node_tb.yml" + + - name: Upload test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: test-results-multi-node-pytorch-${{ github.run_number }} + path: tests/enroot/results/ + if-no-files-found: warn + retention-days: 30 + + # Multi-node RCCL test + test-multi-node-rccl: + needs: test-multi-node-pytorch + if: | + always() && + (github.event_name == 'push' || inputs.run_multi_node_rccl == true) && + (needs.test-multi-node-pytorch.result == 'success' || needs.test-multi-node-pytorch.result == 'skipped') + runs-on: enroot-runners + timeout-minutes: 120 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.8' + + - name: Install dependencies + run: | + python3 -m pip install --upgrade pip + pip install -r tests/enroot/requirements.txt + + - name: Create testbed files from secrets + working-directory: tests/enroot + env: + MULTI_NODE_TESTBED: ${{ secrets.MULTI_NODE_TESTBED_FILE }} + run: | + mkdir -p testbed + if [ -n "$MULTI_NODE_TESTBED" ]; then + printf '%s\n' "$MULTI_NODE_TESTBED" > testbed/multi_node_tb.yml + echo "Created testbed/multi_node_tb.yml from secret" + else + echo "[ERROR] MULTI_NODE_TESTBED_FILE secret is not set" + exit 1 + fi + + - name: Run test_multi_node_rccl working-directory: tests/enroot run: | - python3 run_test.py "${{ inputs.test_name }}" "${{ inputs.docker_image }}" "${{ inputs.no_install }}" "${{ inputs.no_uninstall }}" "${{ inputs.testbed_file }}" + DOCKER_IMAGE="${{ inputs.docker_image_rccl }}" + if [ "${{ github.event_name }}" = "push" ]; then + NO_INSTALL="false" + NO_UNINSTALL="false" + else + NO_INSTALL="${{ inputs.no_install }}" + NO_UNINSTALL="${{ inputs.no_uninstall }}" + fi + + echo "Running test: test_multi_node_rccl" + echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}" + + python3 run_test.py "test_multi_node_rccl" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/multi_node_tb.yml" - name: Upload test results if: always() uses: actions/upload-artifact@v4 with: - name: test-results-${{ inputs.test_name }}-${{ github.run_number }} + name: test-results-multi-node-rccl-${{ github.run_number }} path: tests/enroot/results/ if-no-files-found: warn retention-days: 30 diff --git a/tests/enroot/run_test.py b/tests/enroot/run_test.py index 864dc04..52b1bc5 100644 --- a/tests/enroot/run_test.py +++ b/tests/enroot/run_test.py @@ -18,6 +18,13 @@ def update_docker_image(test_name, docker_image): pattern = r'export DOCKER_IMAGE=.*' replacement = f'export DOCKER_IMAGE={docker_image}' print(f"Updating distributed_pytorch_sbatch.sh with image: {docker_image}") + elif test_name == "test_multi_node_rccl": + script_path = Path("batch_scripts/rccl_tests_sbatch.sh") + # Extract version tag from docker image (e.g., docker://rocm/roce-workload:version -> version) + version = docker_image.split(':')[-1] if ':' in docker_image else docker_image + pattern = r'DOCKER_IMAGE_VERSION=\${DOCKER_IMAGE_VERSION:-"[^"]*"}' + replacement = f'DOCKER_IMAGE_VERSION="${{DOCKER_IMAGE_VERSION:-"{version}"}}"' + print(f"Updating rccl_tests_sbatch.sh with image version: {version}") else: print(f"Unknown test name: {test_name}") return